Fix EPUB parsing to follow linear spine items

Treat each linear spine document as a single reading unit instead of splitting one spine item into multiple chapters based on section heuristics.

Improve XHTML text extraction so inline spacing is preserved, list and container text is included in reading order, and non-readable tags are ignored before speech output.

Resolve TOC hrefs against the nav or NCX document location so chapter titles match nested content paths correctly, and skip non-linear spine items.

Add a regression test that builds a minimal EPUB fixture and verifies spine order, title resolution, preserved inline spacing, extraction of non-paragraph content, and exclusion of linear=no content.

Verified with: python test_epub_parser.py; python check_naming.py src/epub_parser.py; python check_naming.py test_epub_parser.py
This commit is contained in:
Storm Dragon
2026-03-26 23:51:48 -04:00
parent b5f1ec4bed
commit c5cf555de6
2 changed files with 327 additions and 67 deletions
+157 -67
View File
@@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content.
import zipfile import zipfile
import tempfile import tempfile
import shutil import shutil
import re
from pathlib import Path from pathlib import Path
from bs4 import BeautifulSoup from urllib.parse import unquote
from bs4 import BeautifulSoup, Tag, NavigableString
from src.book import Book, Chapter from src.book import Book, Chapter
@@ -63,10 +65,10 @@ class EpubParser:
# Parse content files in spine order (authoritative reading sequence) # Parse content files in spine order (authoritative reading sequence)
for itemId in spine: for itemId in spine:
if itemId in manifest: if itemId in manifest:
contentPath = opfDir / manifest[itemId] contentPath = (opfDir / unquote(manifest[itemId])).resolve()
if contentPath.exists(): if contentPath.exists():
chapters = self._parse_content_file(contentPath, tocMap) chapter = self._parse_content_file(contentPath, tocMap)
for chapter in chapters: if chapter:
book.add_chapter(chapter) book.add_chapter(chapter)
return book return book
@@ -143,7 +145,8 @@ class EpubParser:
if spineTag: if spineTag:
for itemref in spineTag.find_all('itemref'): for itemref in spineTag.find_all('itemref'):
idref = itemref.get('idref') idref = itemref.get('idref')
if idref: linear = itemref.get('linear', 'yes').lower()
if idref and linear != 'no':
spine.append(idref) spine.append(idref)
return metadata, spine, manifest return metadata, spine, manifest
@@ -215,10 +218,9 @@ class EpubParser:
if not chapterTitle or not href: if not chapterTitle or not href:
continue continue
# Strip anchor from href contentKey = self._normalize_content_key(navPath.parent, href)
hrefFile = href.split('#')[0] if contentKey:
if hrefFile: tocMap[contentKey] = chapterTitle
tocMap[hrefFile] = chapterTitle
return tocMap if tocMap else None return tocMap if tocMap else None
@@ -280,10 +282,9 @@ class EpubParser:
continue continue
href = content.get('src') href = content.get('src')
# Strip anchor from href contentKey = self._normalize_content_key(ncxPath.parent, href)
hrefFile = href.split('#')[0] if contentKey:
if hrefFile: tocMap[contentKey] = chapterTitle
tocMap[hrefFile] = chapterTitle
return tocMap if tocMap else None return tocMap if tocMap else None
@@ -300,7 +301,7 @@ class EpubParser:
tocMap: Optional dict mapping filename to TOC title tocMap: Optional dict mapping filename to TOC title
Returns: Returns:
List of Chapter objects Chapter object, or None if the file has no readable text
""" """
if tocMap is None: if tocMap is None:
tocMap = {} tocMap = {}
@@ -310,75 +311,164 @@ class EpubParser:
soup = BeautifulSoup(f.read(), 'html.parser') soup = BeautifulSoup(f.read(), 'html.parser')
except Exception as e: except Exception as e:
print(f"Error reading content file {contentPath}: {e}") print(f"Error reading content file {contentPath}: {e}")
return [] return None
chapters = [] bodyTag = soup.find('body') or soup
self._remove_non_readable_elements(bodyTag)
# Check if this file has a TOC title paragraphs = self._extract_paragraphs(bodyTag)
tocTitle = tocMap.get(contentPath.name) if not paragraphs:
return None
# Look for main content sections title = self._resolve_chapter_title(contentPath, bodyTag, tocMap)
# Try h1, h2, section elements
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
if not sections: chapter = Chapter(title)
# Fallback: treat entire file as one chapter chapter.paragraphs = paragraphs
sections = [soup.find('body') or soup] return chapter
for sectionIndex, section in enumerate(sections): def _normalize_content_key(self, baseDir, href):
# Find chapter title """Normalize TOC hrefs and manifest paths to a comparable absolute key"""
title = None hrefFile = unquote(href.split('#')[0].strip())
if not hrefFile:
return None
# Priority 1: Use TOC title for the first section if available return str((baseDir / hrefFile).resolve())
if sectionIndex == 0 and tocTitle:
title = tocTitle
else:
# Priority 2: Look for heading in content
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
# Priority 3: Fallback to filename def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
if not title: """Resolve the best title for a spine item"""
if tocTitle: tocTitle = tocMap.get(str(contentPath.resolve()))
title = tocTitle if tocTitle:
else: return tocTitle
title = contentPath.stem
# Extract paragraphs (including headings) for headingName in ['h1', 'h2', 'h3']:
paragraphs = [] heading = bodyTag.find(headingName)
if heading:
headingText = self._extract_tag_text(heading)
if headingText:
return headingText
# Include all headings and paragraphs in reading order return contentPath.stem
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True) def _remove_non_readable_elements(self, rootTag):
"""Remove tags that should not contribute spoken text"""
for element in rootTag.find_all(['script', 'style', 'noscript']):
element.decompose()
def _extract_paragraphs(self, rootTag):
"""Extract readable text blocks from XHTML in document order"""
paragraphs = []
pendingInlineText = []
blockTags = {
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'p', 'li', 'blockquote', 'pre', 'figcaption',
'caption', 'td', 'th', 'dd', 'dt', 'address'
}
containerTags = {
'body', 'section', 'article', 'main', 'div',
'aside', 'nav', 'header', 'footer'
}
structuredTags = blockTags | containerTags | {
'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'
}
for child in rootTag.children:
self._collect_paragraphs(
child,
paragraphs,
pendingInlineText,
blockTags,
containerTags,
structuredTags
)
self._flush_inline_text(paragraphs, pendingInlineText)
return paragraphs
def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
"""Walk the DOM tree and collect readable blocks without duplicating nested content"""
if isinstance(node, NavigableString):
text = self._normalize_text(str(node))
if text:
pendingInlineText.append(text)
return
if not isinstance(node, Tag):
return
if node.name == 'br':
self._flush_inline_text(paragraphs, pendingInlineText)
return
if node.name in blockTags:
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
return
if node.name in containerTags or node.name in {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}:
if not self._has_structured_children(node, structuredTags):
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text: if text:
paragraphs.append(text) paragraphs.append(text)
return
# Only add chapter if it has content for child in node.children:
if paragraphs: self._collect_paragraphs(
chapter = Chapter(title) child,
chapter.paragraphs = paragraphs paragraphs,
chapters.append(chapter) pendingInlineText,
blockTags,
containerTags,
structuredTags
)
# If no chapters found, extract all content as one chapter if node.name in containerTags:
if not chapters: self._flush_inline_text(paragraphs, pendingInlineText)
title = tocTitle if tocTitle else contentPath.stem return
paragraphs = []
# Include all headings and paragraphs in reading order for child in node.children:
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']): self._collect_paragraphs(
text = element.get_text(strip=True) child,
if text: paragraphs,
paragraphs.append(text) pendingInlineText,
blockTags,
containerTags,
structuredTags
)
if paragraphs: def _has_structured_children(self, node, structuredTags):
chapter = Chapter(title) """Return True when a container has nested structural elements to recurse into"""
chapter.paragraphs = paragraphs for child in node.children:
chapters.append(chapter) if isinstance(child, Tag) and child.name in structuredTags:
return True
return False
return chapters def _extract_tag_text(self, tag):
"""Extract normalized text from a tag while preserving inline spacing"""
return self._normalize_text(tag.get_text(' ', strip=True))
def _flush_inline_text(self, paragraphs, pendingInlineText):
"""Convert accumulated inline text into a paragraph"""
if not pendingInlineText:
return
text = self._normalize_text(' '.join(pendingInlineText))
pendingInlineText.clear()
if text:
paragraphs.append(text)
def _normalize_text(self, text):
"""Collapse whitespace while keeping natural word boundaries for TTS"""
text = re.sub(r'\s+', ' ', text).strip()
if not text:
return ''
text = re.sub(r'\s+([,.;:!?])', r'\1', text)
text = re.sub(r'([\(\[\{])\s+', r'\1', text)
text = re.sub(r'\s+([\)\]\}])', r'\1', text)
return text
def cleanup(self): def cleanup(self):
"""Clean up temporary files and memory""" """Clean up temporary files and memory"""
+170
View File
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Regression tests for EPUB parsing behavior.
"""
from pathlib import Path
from tempfile import TemporaryDirectory
import zipfile
from src.epub_parser import EpubParser
def write_text_file(filePath, content):
    """Create any missing parent directories, then store content as UTF-8 text"""
    targetDir = filePath.parent
    targetDir.mkdir(exist_ok=True, parents=True)
    filePath.write_text(content, encoding='utf-8')
def build_test_epub(epubPath):
    """Assemble a small EPUB archive at epubPath for the parser regression test

    The fixture has a nav document in its own directory, two linear
    chapters, and one spine item marked linear="no". Files are staged in a
    temporary directory and then zipped, with the mimetype entry written
    first and uncompressed.
    """
    # (path parts relative to the archive root, file text), in write order.
    fixtureFiles = [
        (
            ('mimetype',),
            'application/epub+zip'
        ),
        (
            ('META-INF', 'container.xml'),
            '''<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
        ),
        (
            ('OEBPS', 'content.opf'),
            '''<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Parser Regression Book</dc:title>
<dc:creator>BookStorm Test</dc:creator>
</metadata>
<manifest>
<item id="navdoc" href="nav/toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="chap1" href="text/chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="chap2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
<item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="chap1"/>
<itemref idref="notes" linear="no"/>
<itemref idref="chap2"/>
</spine>
</package>
'''
        ),
        (
            ('OEBPS', 'nav', 'toc.xhtml'),
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<body>
<nav epub:type="toc">
<ol>
<li><a href="../text/chapter1.xhtml">Chapter One</a></li>
<li><a href="../text/chapter2.xhtml">Chapter Two</a></li>
</ol>
</nav>
</body>
</html>
'''
        ),
        (
            ('OEBPS', 'text', 'chapter1.xhtml'),
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<section>
<h1>Visible Heading</h1>
<p>This is <em>very</em> bad.</p>
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</section>
</body>
</html>
'''
        ),
        (
            ('OEBPS', 'text', 'chapter2.xhtml'),
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<div>Lead in <span>text</span>.</div>
<div class="section"><p>More text.</p></div>
<p>Tail text.</p>
</body>
</html>
'''
        ),
        (
            ('OEBPS', 'text', 'notes.xhtml'),
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<p>This non-linear note should not appear.</p>
</body>
</html>
'''
        ),
    ]

    with TemporaryDirectory() as tempDir:
        stagingRoot = Path(tempDir)
        for pathParts, fileText in fixtureFiles:
            write_text_file(stagingRoot.joinpath(*pathParts), fileText)

        mimetypePath = stagingRoot / 'mimetype'
        # All staged files except mimetype, in sorted path order.
        otherFiles = sorted(
            candidate
            for candidate in stagingRoot.rglob('*')
            if candidate.is_file() and candidate != mimetypePath
        )

        with zipfile.ZipFile(epubPath, 'w') as zipRef:
            # The mimetype entry goes first and is stored without compression.
            zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)
            for stagedFile in otherFiles:
                zipRef.write(
                    stagedFile,
                    stagedFile.relative_to(stagingRoot).as_posix(),
                    compress_type=zipfile.ZIP_DEFLATED
                )
def test_epub_parser():
    """Parse the generated fixture and check spine order, titles and extracted text"""
    expectedTitles = ['Chapter One', 'Chapter Two']
    expectedFirstParagraphs = [
        'Visible Heading',
        'This is very bad.',
        'First item',
        'Second item'
    ]
    expectedSecondParagraphs = [
        'Lead in text.',
        'More text.',
        'Tail text.'
    ]

    with TemporaryDirectory() as workDir:
        fixturePath = Path(workDir) / 'fixture.epub'
        build_test_epub(fixturePath)

        epubParser = EpubParser()
        try:
            book = epubParser.parse(fixturePath)
        finally:
            # Release the parser's temporary state even when parsing fails.
            epubParser.cleanup()

        print("Testing spine-based chapter extraction...")
        assert book.title == 'Parser Regression Book'
        assert book.author == 'BookStorm Test'
        assert book.get_total_chapters() == 2
        actualTitles = [parsedChapter.title for parsedChapter in book.chapters]
        assert actualTitles == expectedTitles
        print("Chapter extraction tests passed")

        print("\nTesting readable text extraction...")
        firstChapter = book.get_chapter(0)
        secondChapter = book.get_chapter(1)
        assert firstChapter is not None
        assert secondChapter is not None
        assert firstChapter.paragraphs == expectedFirstParagraphs
        assert secondChapter.paragraphs == expectedSecondParagraphs
        # Text from the linear="no" spine item must not appear in any chapter.
        assert not any(
            'non-linear note' in paragraphText.lower()
            for parsedChapter in book.chapters
            for paragraphText in parsedChapter.paragraphs
        )
        print("Readable text tests passed")

        print("\nAll EPUB parser tests passed!")
# Run the regression test when this file is executed directly as a script.
if __name__ == "__main__":
    test_epub_parser()