Fix EPUB parsing to follow linear spine items

Treat each linear spine document as a single reading unit instead of splitting one spine item into multiple chapters based on section heuristics. Improve XHTML text extraction so inline spacing is preserved, list and container text is included in reading order, and non-readable tags are ignored before speech output. Resolve TOC hrefs against the nav or NCX document location so chapter titles match nested content paths correctly, and skip non-linear spine items. Add a regression test that builds a minimal EPUB fixture and verifies spine order, title resolution, preserved inline spacing, extraction of non-paragraph content, and exclusion of linear=no content. Verified with: python test_epub_parser.py; python check_naming.py src/epub_parser.py; python check_naming.py test_epub_parser.py
2026-03-26 23:51:48 -04:00
parent b5f1ec4bed
commit c5cf555de6
2 changed files with 327 additions and 67 deletions
@@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content.
 import zipfile
 import tempfile
 import shutil
 import re
 from pathlib import Path
-from bs4 import BeautifulSoup
+from urllib.parse import unquote
 from bs4 import BeautifulSoup, Tag, NavigableString
 from src.book import Book, Chapter
@@ -63,10 +65,10 @@ class EpubParser:
            # Parse content files in spine order (authoritative reading sequence)
            for itemId in spine:
                if itemId in manifest:
-                    contentPath = opfDir / manifest[itemId]
+                    contentPath = (opfDir / unquote(manifest[itemId])).resolve()
                    if contentPath.exists():
-                        chapters = self._parse_content_file(contentPath, tocMap)
+                        chapter = self._parse_content_file(contentPath, tocMap)
-                        for chapter in chapters:
+                        if chapter:
                            book.add_chapter(chapter)
            return book
@@ -143,7 +145,8 @@ class EpubParser:
        if spineTag:
            for itemref in spineTag.find_all('itemref'):
                idref = itemref.get('idref')
-                if idref:
+                linear = itemref.get('linear', 'yes').lower()
                if idref and linear != 'no':
                    spine.append(idref)
        return metadata, spine, manifest
@@ -215,10 +218,9 @@ class EpubParser:
                if not chapterTitle or not href:
                    continue
-                # Strip anchor from href
+                contentKey = self._normalize_content_key(navPath.parent, href)
-                hrefFile = href.split('#')[0]
+                if contentKey:
-                if hrefFile:
+                    tocMap[contentKey] = chapterTitle
                    tocMap[hrefFile] = chapterTitle
            return tocMap if tocMap else None
@@ -280,10 +282,9 @@ class EpubParser:
                    continue
                href = content.get('src')
-                # Strip anchor from href
+                contentKey = self._normalize_content_key(ncxPath.parent, href)
-                hrefFile = href.split('#')[0]
+                if contentKey:
-                if hrefFile:
+                    tocMap[contentKey] = chapterTitle
                    tocMap[hrefFile] = chapterTitle
            return tocMap if tocMap else None
@@ -300,7 +301,7 @@ class EpubParser:
            tocMap: Optional dict mapping filename to TOC title
        Returns:
-            List of Chapter objects
+            Chapter object, or None if the file has no readable text
        """
        if tocMap is None:
            tocMap = {}
@@ -310,75 +311,164 @@ class EpubParser:
                soup = BeautifulSoup(f.read(), 'html.parser')
        except Exception as e:
            print(f"Error reading content file {contentPath}: {e}")
-            return []
+            return None
-        chapters = []
+        bodyTag = soup.find('body') or soup
        self._remove_non_readable_elements(bodyTag)
-        # Check if this file has a TOC title
+        paragraphs = self._extract_paragraphs(bodyTag)
-        tocTitle = tocMap.get(contentPath.name)
+        if not paragraphs:
            return None
-        # Look for main content sections
+        title = self._resolve_chapter_title(contentPath, bodyTag, tocMap)
        # Try h1, h2, section elements
        sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
-        if not sections:
+        chapter = Chapter(title)
-            # Fallback: treat entire file as one chapter
+        chapter.paragraphs = paragraphs
-            sections = [soup.find('body') or soup]
+        return chapter
-        for sectionIndex, section in enumerate(sections):
+    def _normalize_content_key(self, baseDir, href):
-            # Find chapter title
+        """Normalize TOC hrefs and manifest paths to a comparable absolute key"""
-            title = None
+        hrefFile = unquote(href.split('#')[0].strip())
        if not hrefFile:
            return None
-            # Priority 1: Use TOC title for the first section if available
+        return str((baseDir / hrefFile).resolve())
            if sectionIndex == 0 and tocTitle:
                title = tocTitle
            else:
                # Priority 2: Look for heading in content
                for hTag in ['h1', 'h2', 'h3']:
                    heading = section.find(hTag)
                    if heading:
                        title = heading.get_text(strip=True)
                        break
-            # Priority 3: Fallback to filename
+    def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
-            if not title:
+        """Resolve the best title for a spine item"""
-                if tocTitle:
+        tocTitle = tocMap.get(str(contentPath.resolve()))
-                    title = tocTitle
+        if tocTitle:
-                else:
+            return tocTitle
                    title = contentPath.stem
-            # Extract paragraphs (including headings)
+        for headingName in ['h1', 'h2', 'h3']:
-            paragraphs = []
+            heading = bodyTag.find(headingName)
            if heading:
                headingText = self._extract_tag_text(heading)
                if headingText:
                    return headingText
-            # Include all headings and paragraphs in reading order
+        return contentPath.stem
-            for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+
-                text = element.get_text(strip=True)
+    def _remove_non_readable_elements(self, rootTag):
        """Remove tags that should not contribute spoken text"""
        for element in rootTag.find_all(['script', 'style', 'noscript']):
            element.decompose()
    def _extract_paragraphs(self, rootTag):
        """Extract readable text blocks from XHTML in document order"""
        paragraphs = []
        pendingInlineText = []
        blockTags = {
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'p', 'li', 'blockquote', 'pre', 'figcaption',
            'caption', 'td', 'th', 'dd', 'dt', 'address'
        }
        containerTags = {
            'body', 'section', 'article', 'main', 'div',
            'aside', 'nav', 'header', 'footer'
        }
        structuredTags = blockTags | containerTags | {
            'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'
        }
        for child in rootTag.children:
            self._collect_paragraphs(
                child,
                paragraphs,
                pendingInlineText,
                blockTags,
                containerTags,
                structuredTags
            )
        self._flush_inline_text(paragraphs, pendingInlineText)
        return paragraphs
    def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
        """Walk the DOM tree and collect readable blocks without duplicating nested content"""
        if isinstance(node, NavigableString):
            text = self._normalize_text(str(node))
            if text:
                pendingInlineText.append(text)
            return
        if not isinstance(node, Tag):
            return
        if node.name == 'br':
            self._flush_inline_text(paragraphs, pendingInlineText)
            return
        if node.name in blockTags:
            self._flush_inline_text(paragraphs, pendingInlineText)
            text = self._extract_tag_text(node)
            if text:
                paragraphs.append(text)
            return
        if node.name in containerTags or node.name in {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}:
            if not self._has_structured_children(node, structuredTags):
                self._flush_inline_text(paragraphs, pendingInlineText)
                text = self._extract_tag_text(node)
                if text:
                    paragraphs.append(text)
                return
-            # Only add chapter if it has content
+            for child in node.children:
-            if paragraphs:
+                self._collect_paragraphs(
-                chapter = Chapter(title)
+                    child,
-                chapter.paragraphs = paragraphs
+                    paragraphs,
-                chapters.append(chapter)
+                    pendingInlineText,
                    blockTags,
                    containerTags,
                    structuredTags
                )
-        # If no chapters found, extract all content as one chapter
+            if node.name in containerTags:
-        if not chapters:
+                self._flush_inline_text(paragraphs, pendingInlineText)
-            title = tocTitle if tocTitle else contentPath.stem
+            return
            paragraphs = []
-            # Include all headings and paragraphs in reading order
+        for child in node.children:
-            for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+            self._collect_paragraphs(
-                text = element.get_text(strip=True)
+                child,
-                if text:
+                paragraphs,
-                    paragraphs.append(text)
+                pendingInlineText,
                blockTags,
                containerTags,
                structuredTags
            )
-            if paragraphs:
+    def _has_structured_children(self, node, structuredTags):
-                chapter = Chapter(title)
+        """Return True when a container has nested structural elements to recurse into"""
-                chapter.paragraphs = paragraphs
+        for child in node.children:
-                chapters.append(chapter)
+            if isinstance(child, Tag) and child.name in structuredTags:
                return True
        return False
-        return chapters
+    def _extract_tag_text(self, tag):
        """Extract normalized text from a tag while preserving inline spacing"""
        return self._normalize_text(tag.get_text(' ', strip=True))
    def _flush_inline_text(self, paragraphs, pendingInlineText):
        """Convert accumulated inline text into a paragraph"""
        if not pendingInlineText:
            return
        text = self._normalize_text(' '.join(pendingInlineText))
        pendingInlineText.clear()
        if text:
            paragraphs.append(text)
    def _normalize_text(self, text):
        """Collapse whitespace while keeping natural word boundaries for TTS"""
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            return ''
        text = re.sub(r'\s+([,.;:!?])', r'\1', text)
        text = re.sub(r'([\(\[\{])\s+', r'\1', text)
        text = re.sub(r'\s+([\)\]\}])', r'\1', text)
        return text
    def cleanup(self):
        """Clean up temporary files and memory"""
@@ -0,0 +1,170 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Regression tests for EPUB parsing behavior.
 """
 from pathlib import Path
 from tempfile import TemporaryDirectory
 import zipfile
 from src.epub_parser import EpubParser
 def write_text_file(filePath, content):
    """Write content to filePath as UTF-8 text, creating missing parent directories"""
    parentDir = filePath.parent
    # exist_ok lets several fixture files share one directory
    parentDir.mkdir(parents=True, exist_ok=True)
    filePath.write_text(content, encoding='utf-8')
 def build_test_epub(epubPath):
    """Build a minimal EPUB fixture for parser regression testing

    The fixture is assembled in a temporary directory and zipped to epubPath.
    It contains:
      - META-INF/container.xml pointing at OEBPS/content.opf
      - an OPF package whose spine lists chap1, notes (linear="no"), chap2
      - a nav document in OEBPS/nav/ whose hrefs use ../text/ relative paths
      - chapter1.xhtml (heading, inline <em>, list), chapter2.xhtml (text
        directly inside <div>, a nested <p>, a trailing <p>) and notes.xhtml

    Args:
        epubPath: Destination path of the .epub archive to create
    """
    with TemporaryDirectory() as tempDir:
        tempPath = Path(tempDir)
        # Media type declaration file at the archive root
        write_text_file(
            tempPath / 'mimetype',
            'application/epub+zip'
        )
        # Container file naming the OPF package document
        write_text_file(
            tempPath / 'META-INF' / 'container.xml',
            '''<?xml version="1.0" encoding="UTF-8"?>
 <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
 </container>
 '''
        )
        # Package document: the "notes" spine item is marked linear="no"
        write_text_file(
            tempPath / 'OEBPS' / 'content.opf',
            '''<?xml version="1.0" encoding="utf-8"?>
 <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Parser Regression Book</dc:title>
    <dc:creator>BookStorm Test</dc:creator>
  </metadata>
  <manifest>
    <item id="navdoc" href="nav/toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="chap1" href="text/chapter1.xhtml" media-type="application/xhtml+xml"/>
    <item id="chap2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
    <item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="chap1"/>
    <itemref idref="notes" linear="no"/>
    <itemref idref="chap2"/>
  </spine>
 </package>
 '''
        )
        # Nav document lives in nav/, so its hrefs are relative to that folder
        write_text_file(
            tempPath / 'OEBPS' / 'nav' / 'toc.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
 <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
  <body>
    <nav epub:type="toc">
      <ol>
        <li><a href="../text/chapter1.xhtml">Chapter One</a></li>
        <li><a href="../text/chapter2.xhtml">Chapter Two</a></li>
      </ol>
    </nav>
  </body>
 </html>
 '''
        )
        # Chapter 1: heading, paragraph with inline markup, and a list
        write_text_file(
            tempPath / 'OEBPS' / 'text' / 'chapter1.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
 <html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <section>
      <h1>Visible Heading</h1>
      <p>This is <em>very</em> bad.</p>
      <ul>
        <li>First item</li>
        <li>Second item</li>
      </ul>
    </section>
  </body>
 </html>
 '''
        )
        # Chapter 2: text directly in a <div>, a <p> nested in a div, and a bare <p>
        write_text_file(
            tempPath / 'OEBPS' / 'text' / 'chapter2.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
 <html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <div>Lead in <span>text</span>.</div>
    <div class="section"><p>More text.</p></div>
    <p>Tail text.</p>
  </body>
 </html>
 '''
        )
        # Content of the linear="no" spine item
        write_text_file(
            tempPath / 'OEBPS' / 'text' / 'notes.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
 <html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <p>This non-linear note should not appear.</p>
  </body>
 </html>
 '''
        )
        with zipfile.ZipFile(epubPath, 'w') as zipRef:
            # mimetype is written first and stored uncompressed
            mimetypePath = tempPath / 'mimetype'
            zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)
            # Remaining files are added in sorted path order, deflated
            for filePath in sorted(tempPath.rglob('*')):
                if filePath.is_file() and filePath != mimetypePath:
                    archivePath = filePath.relative_to(tempPath).as_posix()
                    zipRef.write(filePath, archivePath, compress_type=zipfile.ZIP_DEFLATED)
 def test_epub_parser():
    """Build the fixture EPUB, parse it, and check spine order, titles, and extracted text"""
    with TemporaryDirectory() as workDir:
        fixturePath = Path(workDir) / 'fixture.epub'
        build_test_epub(fixturePath)
        epubParser = EpubParser()
        try:
            book = epubParser.parse(fixturePath)
        finally:
            # Run cleanup whether or not parsing succeeded
            epubParser.cleanup()
    print("Testing spine-based chapter extraction...")
    assert book.title == 'Parser Regression Book'
    assert book.author == 'BookStorm Test'
    # Only the two linear spine items become chapters
    assert book.get_total_chapters() == 2
    chapterTitles = [chapter.title for chapter in book.chapters]
    assert chapterTitles == ['Chapter One', 'Chapter Two']
    print("Chapter extraction tests passed")
    print("\nTesting readable text extraction...")
    firstChapter = book.get_chapter(0)
    secondChapter = book.get_chapter(1)
    assert firstChapter is not None
    assert secondChapter is not None
    expectedFirst = [
        'Visible Heading',
        'This is very bad.',
        'First item',
        'Second item'
    ]
    expectedSecond = [
        'Lead in text.',
        'More text.',
        'Tail text.'
    ]
    assert firstChapter.paragraphs == expectedFirst
    assert secondChapter.paragraphs == expectedSecond
    # Text from the linear="no" spine item must not appear in any chapter
    allParagraphs = [
        paragraph
        for chapter in book.chapters
        for paragraph in chapter.paragraphs
    ]
    assert not any('non-linear note' in paragraph.lower() for paragraph in allParagraphs)
    print("Readable text tests passed")
    print("\nAll EPUB parser tests passed!")
 # Allow running this regression test directly as a script
 if __name__ == "__main__":
    test_epub_parser()