diff --git a/src/epub_parser.py b/src/epub_parser.py index 0c9335d..a599b4e 100644 --- a/src/epub_parser.py +++ b/src/epub_parser.py @@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content. import zipfile import tempfile import shutil +import re from pathlib import Path -from bs4 import BeautifulSoup +from urllib.parse import unquote +from bs4 import BeautifulSoup, Tag, NavigableString from src.book import Book, Chapter @@ -63,10 +65,10 @@ class EpubParser: # Parse content files in spine order (authoritative reading sequence) for itemId in spine: if itemId in manifest: - contentPath = opfDir / manifest[itemId] + contentPath = (opfDir / unquote(manifest[itemId])).resolve() if contentPath.exists(): - chapters = self._parse_content_file(contentPath, tocMap) - for chapter in chapters: + chapter = self._parse_content_file(contentPath, tocMap) + if chapter: book.add_chapter(chapter) return book @@ -143,7 +145,8 @@ class EpubParser: if spineTag: for itemref in spineTag.find_all('itemref'): idref = itemref.get('idref') - if idref: + linear = itemref.get('linear', 'yes').lower() + if idref and linear != 'no': spine.append(idref) return metadata, spine, manifest @@ -215,10 +218,9 @@ class EpubParser: if not chapterTitle or not href: continue - # Strip anchor from href - hrefFile = href.split('#')[0] - if hrefFile: - tocMap[hrefFile] = chapterTitle + contentKey = self._normalize_content_key(navPath.parent, href) + if contentKey: + tocMap[contentKey] = chapterTitle return tocMap if tocMap else None @@ -280,10 +282,9 @@ class EpubParser: continue href = content.get('src') - # Strip anchor from href - hrefFile = href.split('#')[0] - if hrefFile: - tocMap[hrefFile] = chapterTitle + contentKey = self._normalize_content_key(ncxPath.parent, href) + if contentKey: + tocMap[contentKey] = chapterTitle return tocMap if tocMap else None @@ -300,7 +301,7 @@ class EpubParser: tocMap: Optional dict mapping filename to TOC title Returns: - List of Chapter objects + 
def _normalize_content_key(self, baseDir, href):
    """Normalize a TOC href or manifest path to a comparable absolute key.

    Args:
        baseDir: Directory (Path) that the href is relative to.
        href: Raw href/src attribute value. May carry a ``#fragment`` and
            percent-encoding, or be missing entirely.

    Returns:
        The resolved absolute path as a string, or None when the href is
        missing or names no file (for example a bare ``#anchor``).
    """
    # TOC entries can lack the attribute altogether (e.g. an NCX <content>
    # without src); treat that as "no key" instead of raising on .split().
    if not href:
        return None

    # Drop the fragment before decoding so an encoded '#' (%23) inside a
    # file name is not mistaken for an anchor separator.
    hrefFile = unquote(href.split('#')[0].strip())
    if not hrefFile:
        return None

    return str((baseDir / hrefFile).resolve())


def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
    """Resolve the best title for a spine item.

    Priority: TOC title keyed by the resolved content path, then the first
    non-empty h1/h2/h3 heading, then the file stem.

    Args:
        contentPath: Path of the content document.
        bodyTag: Parsed root to search for headings.
        tocMap: Dict mapping normalized content keys to TOC titles
            (None is treated as empty).

    Returns:
        The chapter title string.
    """
    tocTitle = (tocMap or {}).get(str(contentPath.resolve()))
    if tocTitle:
        return tocTitle

    for headingName in ('h1', 'h2', 'h3'):
        heading = bodyTag.find(headingName)
        if heading is None:
            continue
        headingText = self._extract_tag_text(heading)
        if headingText:
            return headingText

    return contentPath.stem


def _remove_non_readable_elements(self, rootTag):
    """Remove tags that should not contribute spoken text.

    Besides scripts and styles this also drops ``head`` and ``template``:
    when a document has no <body> the caller passes the whole soup, and
    metadata such as <title> would otherwise be read out as a paragraph.

    Args:
        rootTag: Parsed root; modified in place.
    """
    for element in rootTag.find_all(['script', 'style', 'noscript', 'head', 'template']):
        element.decompose()


def _extract_paragraphs(self, rootTag):
    """Extract readable text blocks from XHTML in document order.

    Args:
        rootTag: Parsed root whose children are walked.

    Returns:
        List of normalized paragraph strings (possibly empty).
    """
    paragraphs = []
    pendingInlineText = []
    # Elements emitted as exactly one paragraph each.
    blockTags = {
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'p', 'li', 'blockquote', 'pre', 'figcaption',
        'caption', 'td', 'th', 'dd', 'dt', 'address'
    }
    # Grouping elements that are recursed into and act as paragraph breaks.
    containerTags = {
        'body', 'section', 'article', 'main', 'div',
        'aside', 'nav', 'header', 'footer'
    }
    # Everything that counts as "structure" when deciding whether a
    # container is a leaf (no structured children) or must be descended.
    structuredTags = blockTags | containerTags | {
        'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'
    }

    for child in rootTag.children:
        self._collect_paragraphs(
            child,
            paragraphs,
            pendingInlineText,
            blockTags,
            containerTags,
            structuredTags
        )

    # Trailing loose text after the last block still forms a paragraph.
    self._flush_inline_text(paragraphs, pendingInlineText)
    return paragraphs


def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
    """Walk the DOM tree and collect readable blocks without duplicating nested content.

    Args:
        node: Current DOM node (string or tag).
        paragraphs: Output list of paragraphs; appended to in place.
        pendingInlineText: Buffer of loose inline text awaiting a flush.
        blockTags: Tag names emitted as a single paragraph.
        containerTags: Grouping tag names that end the pending paragraph.
        structuredTags: All tag names regarded as structural; expected to
            be a superset of blockTags and containerTags.
    """
    if isinstance(node, NavigableString):
        # Comments, doctypes, declarations and processing instructions are
        # NavigableString subclasses in bs4. Only plain text (and CDATA,
        # matching what get_text() returns) should ever be spoken.
        if type(node).__name__ in ('NavigableString', 'CData'):
            text = self._normalize_text(str(node))
            if text:
                pendingInlineText.append(text)
        return

    if not isinstance(node, Tag):
        return

    if node.name == 'br':
        # A line break ends the current run of inline text.
        self._flush_inline_text(paragraphs, pendingInlineText)
        return

    isBlock = node.name in blockTags
    # Block tags were ruled out first, so what remains of structuredTags
    # here are the containers and the list/table grouping tags.
    isLeafGroup = (
        not isBlock
        and node.name in structuredTags
        and not self._has_structured_children(node, structuredTags)
    )
    if isBlock or isLeafGroup:
        # Emit the element's whole text once; do not descend, otherwise
        # nested inline content would be collected a second time.
        self._flush_inline_text(paragraphs, pendingInlineText)
        text = self._extract_tag_text(node)
        if text:
            paragraphs.append(text)
        return

    # Inline wrappers and groups with structured children: descend.
    for child in node.children:
        self._collect_paragraphs(
            child,
            paragraphs,
            pendingInlineText,
            blockTags,
            containerTags,
            structuredTags
        )

    if node.name in containerTags:
        # Loose text at the end of a container must not merge with text
        # that follows the container.
        self._flush_inline_text(paragraphs, pendingInlineText)


def _has_structured_children(self, node, structuredTags):
    """Return True when a container has nested structural elements to recurse into.

    Only direct children are inspected.
    """
    return any(
        isinstance(child, Tag) and child.name in structuredTags
        for child in node.children
    )


def _extract_tag_text(self, tag):
    """Extract normalized text from a tag while preserving inline spacing.

    Text fragments are joined with a space so words separated only by
    markup do not run together; _normalize_text then removes the spaces
    this introduces before punctuation.
    """
    return self._normalize_text(tag.get_text(' ', strip=True))


def _flush_inline_text(self, paragraphs, pendingInlineText):
    """Convert accumulated inline text into a paragraph.

    Args:
        paragraphs: Output list; receives at most one new entry.
        pendingInlineText: Buffer of text fragments; emptied in place.
    """
    if not pendingInlineText:
        return

    text = self._normalize_text(' '.join(pendingInlineText))
    pendingInlineText.clear()
    if text:
        paragraphs.append(text)


def _normalize_text(self, text):
    """Collapse whitespace while keeping natural word boundaries for TTS.

    Args:
        text: Raw text.

    Returns:
        Text with whitespace runs collapsed to single spaces, no space
        before ``, . ; : ! ?`` or closing brackets, and no space after
        opening brackets. Empty string when nothing but whitespace remains.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return ''

    # Joining inline fragments with spaces leaves gaps such as "bad ." or
    # "( note )"; tighten them so the text is read naturally.
    text = re.sub(r'\s+([,.;:!?])', r'\1', text)
    text = re.sub(r'([\(\[\{])\s+', r'\1', text)
    text = re.sub(r'\s+([\)\]\}])', r'\1', text)
    return text
+

Visible Heading

+

This is very bad.

+ +
+ + +''' + ) + write_text_file( + tempPath / 'OEBPS' / 'text' / 'chapter2.xhtml', + ''' + + +
Lead in text.
+

More text.

+

Tail text.

+ + +''' + ) + write_text_file( + tempPath / 'OEBPS' / 'text' / 'notes.xhtml', + ''' + + +

This non-linear note should not appear.

+ + +''' + ) + + with zipfile.ZipFile(epubPath, 'w') as zipRef: + mimetypePath = tempPath / 'mimetype' + zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED) + + for filePath in sorted(tempPath.rglob('*')): + if filePath.is_file() and filePath != mimetypePath: + archivePath = filePath.relative_to(tempPath).as_posix() + zipRef.write(filePath, archivePath, compress_type=zipfile.ZIP_DEFLATED) + + +def test_epub_parser(): + """Verify EPUB parsing follows linear spine order and preserves readable text""" + with TemporaryDirectory() as tempDir: + epubPath = Path(tempDir) / 'fixture.epub' + build_test_epub(epubPath) + + parser = EpubParser() + try: + book = parser.parse(epubPath) + finally: + parser.cleanup() + + print("Testing spine-based chapter extraction...") + assert book.title == 'Parser Regression Book' + assert book.author == 'BookStorm Test' + assert book.get_total_chapters() == 2 + assert [chapter.title for chapter in book.chapters] == ['Chapter One', 'Chapter Two'] + print("Chapter extraction tests passed") + + print("\nTesting readable text extraction...") + firstChapter = book.get_chapter(0) + secondChapter = book.get_chapter(1) + + assert firstChapter is not None + assert secondChapter is not None + + assert firstChapter.paragraphs == [ + 'Visible Heading', + 'This is very bad.', + 'First item', + 'Second item' + ] + assert secondChapter.paragraphs == [ + 'Lead in text.', + 'More text.', + 'Tail text.' + ] + assert all('non-linear note' not in paragraph.lower() for chapter in book.chapters for paragraph in chapter.paragraphs) + print("Readable text tests passed") + + print("\nAll EPUB parser tests passed!") + + +if __name__ == "__main__": + test_epub_parser()