#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Regression tests for EPUB parsing behavior.

Builds a throwaway EPUB archive on disk, runs it through ``EpubParser`` and
checks the resulting book's metadata, chapter titles and paragraph text.
"""

from pathlib import Path
from tempfile import TemporaryDirectory
import zipfile

from src.epub_parser import EpubParser


def write_text_file(filePath, content):
    """Write ``content`` to ``filePath`` as UTF-8 text.

    Missing parent directories are created first.

    Args:
        filePath: Path object of the file to create or overwrite.
        content: Text to store in the file.
    """
    parent_dir = filePath.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    filePath.write_text(content, encoding='utf-8')


def build_test_epub(epubPath):
    """Assemble the EPUB archive used by the parser regression test.

    The fixture files are written to a temporary directory and then zipped
    into ``epubPath``. The ``mimetype`` entry is added first and stored
    uncompressed; every other file follows in sorted path order, deflated.

    Args:
        epubPath: Destination of the archive (opened in ``'w'`` mode).
    """
    # NOTE(review): the XML/XHTML payloads below carry no markup in this view
    # (container.xml and toc.xhtml are a single space) -- presumably the tags
    # were lost before the file reached review. Confirm against the canonical
    # fixture: as written there is no rootfile, manifest or spine in the
    # archive. The strings are kept exactly as found.
    fixture_files = (
        (('mimetype',), 'application/epub+zip'),
        (('META-INF', 'container.xml'), ''' '''),
        (
            ('OEBPS', 'content.opf'),
            ''' Parser Regression Book BookStorm Test ''',
        ),
        (('OEBPS', 'nav', 'toc.xhtml'), ''' '''),
        (
            ('OEBPS', 'text', 'chapter1.xhtml'),
            '''

Visible Heading

This is very bad.

First item
Second item

''',
        ),
        (
            ('OEBPS', 'text', 'chapter2.xhtml'),
            '''

Lead in text.

More text.

Tail text.

''',
        ),
        (
            ('OEBPS', 'text', 'notes.xhtml'),
            '''

This non-linear note should not appear.

''',
        ),
    )

    with TemporaryDirectory() as staging_dir:
        staging_root = Path(staging_dir)

        for path_parts, text in fixture_files:
            write_text_file(staging_root.joinpath(*path_parts), text)

        mimetype_file = staging_root / 'mimetype'
        with zipfile.ZipFile(epubPath, 'w') as archive:
            # The mimetype entry goes in first and uncompressed.
            archive.write(
                mimetype_file,
                'mimetype',
                compress_type=zipfile.ZIP_STORED,
            )

            # Sorted traversal keeps the remaining entry order deterministic.
            for entry in sorted(staging_root.rglob('*')):
                if not entry.is_file() or entry == mimetype_file:
                    continue
                archive.write(
                    entry,
                    entry.relative_to(staging_root).as_posix(),
                    compress_type=zipfile.ZIP_DEFLATED,
                )


def test_epub_parser():
    """Parse the generated fixture and check metadata, chapters and text.

    Asserts the book title and author, that exactly two chapters come back
    with the expected titles and paragraphs, and that no paragraph contains
    the 'non-linear note' text written to notes.xhtml.
    """
    expected_titles = ['Chapter One', 'Chapter Two']
    expected_first_paragraphs = [
        'Visible Heading',
        'This is very bad.',
        'First item',
        'Second item'
    ]
    expected_second_paragraphs = [
        'Lead in text.',
        'More text.',
        'Tail text.'
    ]

    with TemporaryDirectory() as work_dir:
        fixture_path = Path(work_dir) / 'fixture.epub'
        build_test_epub(fixture_path)

        epub_parser = EpubParser()
        try:
            book = epub_parser.parse(fixture_path)
        finally:
            # Always run the parser's cleanup, even when parse() raises.
            epub_parser.cleanup()

        print("Testing spine-based chapter extraction...")
        assert book.title == 'Parser Regression Book'
        assert book.author == 'BookStorm Test'
        assert book.get_total_chapters() == 2
        actual_titles = [chapter.title for chapter in book.chapters]
        assert actual_titles == expected_titles
        print("Chapter extraction tests passed")

        print("\nTesting readable text extraction...")
        first_chapter = book.get_chapter(0)
        second_chapter = book.get_chapter(1)
        assert first_chapter is not None
        assert second_chapter is not None
        assert first_chapter.paragraphs == expected_first_paragraphs
        assert second_chapter.paragraphs == expected_second_paragraphs

        # Text from notes.xhtml must not appear in any parsed chapter.
        assert not any(
            'non-linear note' in paragraph.lower()
            for chapter in book.chapters
            for paragraph in chapter.paragraphs
        )
        print("Readable text tests passed")

        print("\nAll EPUB parser tests passed!")


if __name__ == "__main__":
    test_epub_parser()