Fix EPUB parsing to follow linear spine items

Treat each linear spine document as a single reading unit instead of splitting one spine item into multiple chapters based on section heuristics.

Improve XHTML text extraction so inline spacing is preserved, list and container text is included in reading order, and non-readable tags are ignored before speech output.

Resolve TOC hrefs against the nav or NCX document location so chapter titles match nested content paths correctly, and skip non-linear spine items.

Add a regression test that builds a minimal EPUB fixture and verifies spine order, title resolution, preserved inline spacing, extraction of non-paragraph content, and exclusion of linear=no content.

Verified with: python test_epub_parser.py; python check_naming.py src/epub_parser.py; python check_naming.py test_epub_parser.py
This commit is contained in:
Storm Dragon
2026-03-26 23:51:48 -04:00
parent b5f1ec4bed
commit c5cf555de6
2 changed files with 327 additions and 67 deletions

View File

@@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content.
import zipfile
import tempfile
import shutil
import re
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import unquote
from bs4 import BeautifulSoup, Tag, NavigableString
from src.book import Book, Chapter
@@ -63,10 +65,10 @@ class EpubParser:
# Parse content files in spine order (authoritative reading sequence)
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
contentPath = (opfDir / unquote(manifest[itemId])).resolve()
if contentPath.exists():
chapters = self._parse_content_file(contentPath, tocMap)
for chapter in chapters:
chapter = self._parse_content_file(contentPath, tocMap)
if chapter:
book.add_chapter(chapter)
return book
@@ -143,7 +145,8 @@ class EpubParser:
if spineTag:
for itemref in spineTag.find_all('itemref'):
idref = itemref.get('idref')
if idref:
linear = itemref.get('linear', 'yes').lower()
if idref and linear != 'no':
spine.append(idref)
return metadata, spine, manifest
@@ -215,10 +218,9 @@ class EpubParser:
if not chapterTitle or not href:
continue
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
contentKey = self._normalize_content_key(navPath.parent, href)
if contentKey:
tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -280,10 +282,9 @@ class EpubParser:
continue
href = content.get('src')
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
contentKey = self._normalize_content_key(ncxPath.parent, href)
if contentKey:
tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -300,7 +301,7 @@ class EpubParser:
tocMap: Optional dict mapping filename to TOC title
Returns:
List of Chapter objects
Chapter object, or None if the file has no readable text
"""
if tocMap is None:
tocMap = {}
@@ -310,75 +311,164 @@ class EpubParser:
soup = BeautifulSoup(f.read(), 'html.parser')
except Exception as e:
print(f"Error reading content file {contentPath}: {e}")
return []
return None
chapters = []
bodyTag = soup.find('body') or soup
self._remove_non_readable_elements(bodyTag)
# Check if this file has a TOC title
tocTitle = tocMap.get(contentPath.name)
paragraphs = self._extract_paragraphs(bodyTag)
if not paragraphs:
return None
# Look for main content sections
# Try h1, h2, section elements
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
title = self._resolve_chapter_title(contentPath, bodyTag, tocMap)
if not sections:
# Fallback: treat entire file as one chapter
sections = [soup.find('body') or soup]
chapter = Chapter(title)
chapter.paragraphs = paragraphs
return chapter
for sectionIndex, section in enumerate(sections):
# Find chapter title
title = None
def _normalize_content_key(self, baseDir, href):
"""Normalize TOC hrefs and manifest paths to a comparable absolute key"""
hrefFile = unquote(href.split('#')[0].strip())
if not hrefFile:
return None
# Priority 1: Use TOC title for the first section if available
if sectionIndex == 0 and tocTitle:
title = tocTitle
else:
# Priority 2: Look for heading in content
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
return str((baseDir / hrefFile).resolve())
# Priority 3: Fallback to filename
if not title:
if tocTitle:
title = tocTitle
else:
title = contentPath.stem
def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
"""Resolve the best title for a spine item"""
tocTitle = tocMap.get(str(contentPath.resolve()))
if tocTitle:
return tocTitle
# Extract paragraphs (including headings)
paragraphs = []
for headingName in ['h1', 'h2', 'h3']:
heading = bodyTag.find(headingName)
if heading:
headingText = self._extract_tag_text(heading)
if headingText:
return headingText
# Include all headings and paragraphs in reading order
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
return contentPath.stem
def _remove_non_readable_elements(self, rootTag):
    """Strip tags whose contents must never reach spoken output.

    Mutates the tree in place by decomposing every script, style and
    noscript element found under rootTag.
    """
    for tagName in ('script', 'style', 'noscript'):
        for element in rootTag.find_all(tagName):
            # decompose() detaches the tag and destroys its subtree.
            element.decompose()
def _extract_paragraphs(self, rootTag):
    """Walk the XHTML body and return readable text blocks in document order.

    Block-level tags become individual paragraphs; bare inline text between
    blocks is buffered and flushed as its own paragraph.
    """
    # Tags whose full text is emitted as one paragraph.
    blockTags = {
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'p', 'li', 'blockquote', 'pre', 'figcaption',
        'caption', 'td', 'th', 'dd', 'dt', 'address'
    }
    # Tags that group other content and are recursed into.
    containerTags = {
        'body', 'section', 'article', 'main', 'div',
        'aside', 'nav', 'header', 'footer'
    }
    listAndTableTags = {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}
    # Everything structural — used to decide whether a container needs recursion.
    structuredTags = blockTags | containerTags | listAndTableTags
    collected = []
    inlineBuffer = []
    for childNode in rootTag.children:
        self._collect_paragraphs(
            childNode,
            collected,
            inlineBuffer,
            blockTags,
            containerTags,
            structuredTags
        )
    # Inline text trailing after the last block element still forms a paragraph.
    self._flush_inline_text(collected, inlineBuffer)
    return collected
def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
"""Walk the DOM tree and collect readable blocks without duplicating nested content"""
if isinstance(node, NavigableString):
text = self._normalize_text(str(node))
if text:
pendingInlineText.append(text)
return
if not isinstance(node, Tag):
return
if node.name == 'br':
self._flush_inline_text(paragraphs, pendingInlineText)
return
if node.name in blockTags:
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
return
if node.name in containerTags or node.name in {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}:
if not self._has_structured_children(node, structuredTags):
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
return
# Only add chapter if it has content
if paragraphs:
chapter = Chapter(title)
chapter.paragraphs = paragraphs
chapters.append(chapter)
for child in node.children:
self._collect_paragraphs(
child,
paragraphs,
pendingInlineText,
blockTags,
containerTags,
structuredTags
)
# If no chapters found, extract all content as one chapter
if not chapters:
title = tocTitle if tocTitle else contentPath.stem
paragraphs = []
if node.name in containerTags:
self._flush_inline_text(paragraphs, pendingInlineText)
return
# Include all headings and paragraphs in reading order
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)
for child in node.children:
self._collect_paragraphs(
child,
paragraphs,
pendingInlineText,
blockTags,
containerTags,
structuredTags
)
if paragraphs:
chapter = Chapter(title)
chapter.paragraphs = paragraphs
chapters.append(chapter)
def _has_structured_children(self, node, structuredTags):
    """Return True when a container holds nested structural elements to recurse into."""
    return any(
        isinstance(child, Tag) and child.name in structuredTags
        for child in node.children
    )
return chapters
def _extract_tag_text(self, tag):
    """Return the tag's text with inline spacing preserved, then normalized.

    get_text(' ', ...) joins inline fragments with a space so that
    '<em>very</em> bad' does not collapse into 'verybad'.
    """
    rawText = tag.get_text(' ', strip=True)
    return self._normalize_text(rawText)
def _flush_inline_text(self, paragraphs, pendingInlineText):
    """Drain buffered inline fragments into the paragraph list as one block.

    The buffer is always emptied, even when the joined text normalizes
    to nothing, so stale fragments never leak into the next paragraph.
    """
    if pendingInlineText:
        joined = ' '.join(pendingInlineText)
        del pendingInlineText[:]
        normalized = self._normalize_text(joined)
        if normalized:
            paragraphs.append(normalized)
def _normalize_text(self, text):
    """Collapse whitespace runs while keeping natural word boundaries for TTS.

    Also removes stray spaces before closing punctuation and just inside
    brackets, which appear when inline tags are joined with spaces.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    if not collapsed:
        return ''
    # 'word , word' -> 'word, word'
    collapsed = re.sub(r'\s+([,.;:!?])', r'\1', collapsed)
    # '( word' -> '(word' and 'word )' -> 'word)'
    collapsed = re.sub(r'([\(\[\{])\s+', r'\1', collapsed)
    collapsed = re.sub(r'\s+([\)\]\}])', r'\1', collapsed)
    return collapsed
def cleanup(self):
"""Clean up temporary files and memory"""

170
test_epub_parser.py Normal file
View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Regression tests for EPUB parsing behavior.
"""
from pathlib import Path
from tempfile import TemporaryDirectory
import zipfile
from src.epub_parser import EpubParser
def write_text_file(filePath, content):
    """Create parent directories as needed and write content as UTF-8 text."""
    parentDir = filePath.parent
    parentDir.mkdir(parents=True, exist_ok=True)
    with open(filePath, 'w', encoding='utf-8') as handle:
        handle.write(content)
def build_test_epub(epubPath):
    """Assemble a minimal EPUB fixture for parser regression testing.

    The fixture exercises nav-based TOC resolution with relative hrefs,
    a linear="no" spine item, inline markup inside paragraphs, and
    list/container text extraction.
    """
    with TemporaryDirectory() as stagingDir:
        stagingRoot = Path(stagingDir)

        def stage(relativePath, content):
            # Local helper: make parents and write UTF-8 text under the staging root.
            target = stagingRoot / relativePath
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(content, encoding='utf-8')

        # The mimetype entry must contain exactly this string.
        stage('mimetype', 'application/epub+zip')
        stage(
            'META-INF/container.xml',
            '''<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
'''
        )
        # Package document: three content items plus the nav, with the
        # notes item marked non-linear in the spine.
        stage(
            'OEBPS/content.opf',
            '''<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Parser Regression Book</dc:title>
    <dc:creator>BookStorm Test</dc:creator>
  </metadata>
  <manifest>
    <item id="navdoc" href="nav/toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="chap1" href="text/chapter1.xhtml" media-type="application/xhtml+xml"/>
    <item id="chap2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
    <item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="chap1"/>
    <itemref idref="notes" linear="no"/>
    <itemref idref="chap2"/>
  </spine>
</package>
'''
        )
        # Nav document lives in a sibling folder so TOC hrefs must be
        # resolved relative to the nav location ("../text/...").
        stage(
            'OEBPS/nav/toc.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
  <body>
    <nav epub:type="toc">
      <ol>
        <li><a href="../text/chapter1.xhtml">Chapter One</a></li>
        <li><a href="../text/chapter2.xhtml">Chapter Two</a></li>
      </ol>
    </nav>
  </body>
</html>
'''
        )
        # Inline <em> plus a list: checks spacing preservation and list text.
        stage(
            'OEBPS/text/chapter1.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <section>
      <h1>Visible Heading</h1>
      <p>This is <em>very</em> bad.</p>
      <ul>
        <li>First item</li>
        <li>Second item</li>
      </ul>
    </section>
  </body>
</html>
'''
        )
        # Bare container text alongside nested paragraphs.
        stage(
            'OEBPS/text/chapter2.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <div>Lead in <span>text</span>.</div>
    <div class="section"><p>More text.</p></div>
    <p>Tail text.</p>
  </body>
</html>
'''
        )
        # Non-linear spine item: must be excluded from the parsed book.
        stage(
            'OEBPS/text/notes.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <p>This non-linear note should not appear.</p>
  </body>
</html>
'''
        )
        with zipfile.ZipFile(epubPath, 'w') as archive:
            # mimetype must be the first entry and stored uncompressed.
            mimetypePath = stagingRoot / 'mimetype'
            archive.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)
            for filePath in sorted(stagingRoot.rglob('*')):
                if filePath.is_file() and filePath != mimetypePath:
                    archive.write(
                        filePath,
                        filePath.relative_to(stagingRoot).as_posix(),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
def test_epub_parser():
    """End-to-end regression: spine order, TOC titles, and readable text extraction."""
    with TemporaryDirectory() as tempDir:
        fixturePath = Path(tempDir) / 'fixture.epub'
        build_test_epub(fixturePath)
        parser = EpubParser()
        try:
            book = parser.parse(fixturePath)
        finally:
            # Always release the parser's temporary files, even on failure.
            parser.cleanup()
        print("Testing spine-based chapter extraction...")
        assert book.title == 'Parser Regression Book'
        assert book.author == 'BookStorm Test'
        assert book.get_total_chapters() == 2
        chapterTitles = [chapter.title for chapter in book.chapters]
        assert chapterTitles == ['Chapter One', 'Chapter Two']
        print("Chapter extraction tests passed")
        print("\nTesting readable text extraction...")
        firstChapter = book.get_chapter(0)
        secondChapter = book.get_chapter(1)
        assert firstChapter is not None
        assert secondChapter is not None
        expectedFirst = [
            'Visible Heading',
            'This is very bad.',
            'First item',
            'Second item'
        ]
        expectedSecond = [
            'Lead in text.',
            'More text.',
            'Tail text.'
        ]
        assert firstChapter.paragraphs == expectedFirst
        assert secondChapter.paragraphs == expectedSecond
        # The linear="no" notes file must not contribute any paragraph.
        for chapter in book.chapters:
            for paragraph in chapter.paragraphs:
                assert 'non-linear note' not in paragraph.lower()
        print("Readable text tests passed")
        print("\nAll EPUB parser tests passed!")
# Allow running the regression suite directly: python test_epub_parser.py
if __name__ == "__main__":
    test_epub_parser()