#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Regression tests for EPUB parsing behavior.
"""
from pathlib import Path
from tempfile import TemporaryDirectory
import zipfile
from src.epub_parser import EpubParser
def write_text_file(filePath, content):
    """Write ``content`` to ``filePath`` as UTF-8, creating parent directories.

    Args:
        filePath: Destination file. A ``pathlib.Path`` or anything ``Path()``
            accepts (for example a plain string path).
        content: Text to write; any existing file is overwritten.
    """
    # Coerce once so callers may pass either a Path or a string path.
    targetPath = Path(filePath)
    # exist_ok avoids failing when several fixture files share a directory.
    targetPath.parent.mkdir(parents=True, exist_ok=True)
    targetPath.write_text(content, encoding='utf-8')
# Fixture files, keyed by their archive-relative POSIX path.
#
# NOTE(review): the XML/XHTML markup below was reconstructed from the
# assertions in test_epub_parser() because the original markup was missing
# from this file (only the bare text "This is very bad.", "More text.",
# "Tail text." and the notes sentence survived). Confirm it against the
# project's canonical fixture and against what EpubParser actually reads
# (for example, where it takes chapter titles from).
_FIXTURE_FILES = {
    # Written without a trailing newline, as in the original fixture.
    'mimetype': 'application/epub+zip',

    'META-INF/container.xml': '''<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
''',

    # The manifest order deliberately differs from the spine order so that
    # the chapter-order assertion only passes when the spine is followed.
    # The notes document is marked linear="no" and must be skipped.
    'OEBPS/content.opf': '''<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="book-id">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:identifier id="book-id">urn:uuid:parser-regression-book</dc:identifier>
    <dc:title>Parser Regression Book</dc:title>
    <dc:creator>BookStorm Test</dc:creator>
    <dc:language>en</dc:language>
  </metadata>
  <manifest>
    <item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
    <item id="chapter2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
    <item id="chapter1" href="text/chapter1.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="chapter1"/>
    <itemref idref="chapter2"/>
    <itemref idref="notes" linear="no"/>
  </spine>
</package>
''',

    # Heading, a paragraph with inline markup, and list items.
    'OEBPS/text/chapter1.xhtml': '''<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter One</title></head>
<body>
<h1>Visible Heading</h1>
<p>
This is <em>very</em> bad.
</p>
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</body>
</html>
''',

    # Mixed content: leading text, a child block, then tail text.
    'OEBPS/text/chapter2.xhtml': '''<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter Two</title></head>
<body>
<div>Lead in text.<p>
More text.</p>
Tail text.
</div>
</body>
</html>
''',

    'OEBPS/text/notes.xhtml': '''<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Notes</title></head>
<body>
<p>This non-linear note should not appear.
</p>
</body>
</html>
''',
}


def build_test_epub(epubPath):
    """Build a minimal EPUB fixture for parser regression testing.

    The fixture files are first written to a temporary directory and then
    zipped into ``epubPath``. The ``mimetype`` entry is added first and
    stored uncompressed; every other file is added in sorted path order
    with DEFLATE compression.

    Args:
        epubPath: Destination path of the ``.epub`` archive to create.
    """
    with TemporaryDirectory() as tempDir:
        tempPath = Path(tempDir)

        for archiveName, content in _FIXTURE_FILES.items():
            write_text_file(tempPath / archiveName, content)

        with zipfile.ZipFile(epubPath, 'w') as zipRef:
            # The mimetype entry goes first and uncompressed.
            mimetypePath = tempPath / 'mimetype'
            zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)

            for filePath in sorted(tempPath.rglob('*')):
                if filePath.is_file() and filePath != mimetypePath:
                    archivePath = filePath.relative_to(tempPath).as_posix()
                    zipRef.write(filePath, archivePath, compress_type=zipfile.ZIP_DEFLATED)


def test_epub_parser():
    """Verify EPUB parsing follows linear spine order and preserves readable text.

    Builds the fixture in a temporary directory, parses it with
    ``EpubParser`` (always calling ``cleanup()`` afterwards), then asserts
    the book metadata, the chapter titles in order, the extracted
    paragraphs of both chapters, and that no paragraph contains the
    non-linear note text.
    """
    with TemporaryDirectory() as tempDir:
        epubPath = Path(tempDir) / 'fixture.epub'
        build_test_epub(epubPath)

        parser = EpubParser()
        try:
            book = parser.parse(epubPath)
        finally:
            # Run cleanup even when parsing raises.
            parser.cleanup()

    print("Testing spine-based chapter extraction...")
    assert book.title == 'Parser Regression Book'
    assert book.author == 'BookStorm Test'
    assert book.get_total_chapters() == 2
    assert [chapter.title for chapter in book.chapters] == ['Chapter One', 'Chapter Two']
    print("Chapter extraction tests passed")

    print("\nTesting readable text extraction...")
    firstChapter = book.get_chapter(0)
    secondChapter = book.get_chapter(1)
    assert firstChapter is not None
    assert secondChapter is not None
    assert firstChapter.paragraphs == [
        'Visible Heading',
        'This is very bad.',
        'First item',
        'Second item'
    ]
    assert secondChapter.paragraphs == [
        'Lead in text.',
        'More text.',
        'Tail text.'
    ]
    # Content from the linear="no" spine item must not appear in any chapter.
    assert all(
        'non-linear note' not in paragraph.lower()
        for chapter in book.chapters
        for paragraph in chapter.paragraphs
    )
    print("Readable text tests passed")

    print("\nAll EPUB parser tests passed!")


if __name__ == "__main__":
    test_epub_parser()