Fix EPUB parsing to follow linear spine items

Treat each linear spine document as a single reading unit instead of splitting one spine item into multiple chapters based on section heuristics.

Improve XHTML text extraction so inline spacing is preserved, list and container text is included in reading order, and non-readable tags are ignored before speech output.

Resolve TOC hrefs against the nav or NCX document location so chapter titles match nested content paths correctly, and skip non-linear spine items.

Add a regression test that builds a minimal EPUB fixture and verifies spine order, title resolution, preserved inline spacing, extraction of non-paragraph content, and exclusion of linear=no content.

Verified with: python test_epub_parser.py; python check_naming.py src/epub_parser.py; python check_naming.py test_epub_parser.py
This commit is contained in:
Storm Dragon
2026-03-26 23:51:48 -04:00
parent b5f1ec4bed
commit c5cf555de6
2 changed files with 327 additions and 67 deletions

View File

@@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content.
import zipfile
import tempfile
import shutil
import re
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import unquote
from bs4 import BeautifulSoup, Tag, NavigableString
from src.book import Book, Chapter
@@ -63,10 +65,10 @@ class EpubParser:
# Parse content files in spine order (authoritative reading sequence)
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
contentPath = (opfDir / unquote(manifest[itemId])).resolve()
if contentPath.exists():
chapters = self._parse_content_file(contentPath, tocMap)
for chapter in chapters:
chapter = self._parse_content_file(contentPath, tocMap)
if chapter:
book.add_chapter(chapter)
return book
@@ -143,7 +145,8 @@ class EpubParser:
if spineTag:
for itemref in spineTag.find_all('itemref'):
idref = itemref.get('idref')
if idref:
linear = itemref.get('linear', 'yes').lower()
if idref and linear != 'no':
spine.append(idref)
return metadata, spine, manifest
@@ -215,10 +218,9 @@ class EpubParser:
if not chapterTitle or not href:
continue
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
contentKey = self._normalize_content_key(navPath.parent, href)
if contentKey:
tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -280,10 +282,9 @@ class EpubParser:
continue
href = content.get('src')
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
contentKey = self._normalize_content_key(ncxPath.parent, href)
if contentKey:
tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -300,7 +301,7 @@ class EpubParser:
tocMap: Optional dict mapping filename to TOC title
Returns:
List of Chapter objects
Chapter object, or None if the file has no readable text
"""
if tocMap is None:
tocMap = {}
@@ -310,75 +311,164 @@ class EpubParser:
soup = BeautifulSoup(f.read(), 'html.parser')
except Exception as e:
print(f"Error reading content file {contentPath}: {e}")
return []
return None
chapters = []
bodyTag = soup.find('body') or soup
self._remove_non_readable_elements(bodyTag)
# Check if this file has a TOC title
tocTitle = tocMap.get(contentPath.name)
paragraphs = self._extract_paragraphs(bodyTag)
if not paragraphs:
return None
# Look for main content sections
# Try h1, h2, section elements
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
title = self._resolve_chapter_title(contentPath, bodyTag, tocMap)
if not sections:
# Fallback: treat entire file as one chapter
sections = [soup.find('body') or soup]
chapter = Chapter(title)
chapter.paragraphs = paragraphs
return chapter
for sectionIndex, section in enumerate(sections):
# Find chapter title
title = None
def _normalize_content_key(self, baseDir, href):
"""Normalize TOC hrefs and manifest paths to a comparable absolute key"""
hrefFile = unquote(href.split('#')[0].strip())
if not hrefFile:
return None
# Priority 1: Use TOC title for the first section if available
if sectionIndex == 0 and tocTitle:
title = tocTitle
else:
# Priority 2: Look for heading in content
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
return str((baseDir / hrefFile).resolve())
# Priority 3: Fallback to filename
if not title:
if tocTitle:
title = tocTitle
else:
title = contentPath.stem
def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
"""Resolve the best title for a spine item"""
tocTitle = tocMap.get(str(contentPath.resolve()))
if tocTitle:
return tocTitle
# Extract paragraphs (including headings)
paragraphs = []
for headingName in ['h1', 'h2', 'h3']:
heading = bodyTag.find(headingName)
if heading:
headingText = self._extract_tag_text(heading)
if headingText:
return headingText
# Include all headings and paragraphs in reading order
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
return contentPath.stem
def _remove_non_readable_elements(self, rootTag):
    """Strip tags whose contents must never reach spoken output.

    Mutates the tree in place by decomposing every script, style and
    noscript element found under rootTag.
    """
    for tagName in ('script', 'style', 'noscript'):
        for element in rootTag.find_all(tagName):
            # decompose() detaches the tag and destroys its subtree.
            element.decompose()
def _extract_paragraphs(self, rootTag):
    """Walk the XHTML body and return readable text blocks in document order.

    Block-level tags become individual paragraphs; bare inline text between
    blocks is buffered and flushed as its own paragraph.
    """
    # Tags whose full text is emitted as one paragraph.
    blockTags = {
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'p', 'li', 'blockquote', 'pre', 'figcaption',
        'caption', 'td', 'th', 'dd', 'dt', 'address'
    }
    # Tags that group other content and are recursed into.
    containerTags = {
        'body', 'section', 'article', 'main', 'div',
        'aside', 'nav', 'header', 'footer'
    }
    listAndTableTags = {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}
    # Everything structural — used to decide whether a container needs recursion.
    structuredTags = blockTags | containerTags | listAndTableTags
    collected = []
    inlineBuffer = []
    for childNode in rootTag.children:
        self._collect_paragraphs(
            childNode,
            collected,
            inlineBuffer,
            blockTags,
            containerTags,
            structuredTags
        )
    # Inline text trailing after the last block element still forms a paragraph.
    self._flush_inline_text(collected, inlineBuffer)
    return collected
def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
"""Walk the DOM tree and collect readable blocks without duplicating nested content"""
if isinstance(node, NavigableString):
text = self._normalize_text(str(node))
if text:
pendingInlineText.append(text)
return
if not isinstance(node, Tag):
return
if node.name == 'br':
self._flush_inline_text(paragraphs, pendingInlineText)
return
if node.name in blockTags:
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
return
if node.name in containerTags or node.name in {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}:
if not self._has_structured_children(node, structuredTags):
self._flush_inline_text(paragraphs, pendingInlineText)
text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
return
# Only add chapter if it has content
if paragraphs:
chapter = Chapter(title)
chapter.paragraphs = paragraphs
chapters.append(chapter)
for child in node.children:
self._collect_paragraphs(
child,
paragraphs,
pendingInlineText,
blockTags,
containerTags,
structuredTags
)
# If no chapters found, extract all content as one chapter
if not chapters:
title = tocTitle if tocTitle else contentPath.stem
paragraphs = []
if node.name in containerTags:
self._flush_inline_text(paragraphs, pendingInlineText)
return
# Include all headings and paragraphs in reading order
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)
for child in node.children:
self._collect_paragraphs(
child,
paragraphs,
pendingInlineText,
blockTags,
containerTags,
structuredTags
)
if paragraphs:
chapter = Chapter(title)
chapter.paragraphs = paragraphs
chapters.append(chapter)
def _has_structured_children(self, node, structuredTags):
    """Return True when a container holds nested structural elements to recurse into."""
    return any(
        isinstance(child, Tag) and child.name in structuredTags
        for child in node.children
    )
return chapters
def _extract_tag_text(self, tag):
    """Return the tag's text with inline spacing preserved, then normalized.

    get_text(' ', ...) joins inline fragments with a space so that
    '<em>very</em> bad' does not collapse into 'verybad'.
    """
    rawText = tag.get_text(' ', strip=True)
    return self._normalize_text(rawText)
def _flush_inline_text(self, paragraphs, pendingInlineText):
    """Drain buffered inline fragments into the paragraph list as one block.

    The buffer is always emptied, even when the joined text normalizes
    to nothing, so stale fragments never leak into the next paragraph.
    """
    if pendingInlineText:
        joined = ' '.join(pendingInlineText)
        del pendingInlineText[:]
        normalized = self._normalize_text(joined)
        if normalized:
            paragraphs.append(normalized)
def _normalize_text(self, text):
    """Collapse whitespace runs while keeping natural word boundaries for TTS.

    Also removes stray spaces before closing punctuation and just inside
    brackets, which appear when inline tags are joined with spaces.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    if not collapsed:
        return ''
    # 'word , word' -> 'word, word'
    collapsed = re.sub(r'\s+([,.;:!?])', r'\1', collapsed)
    # '( word' -> '(word' and 'word )' -> 'word)'
    collapsed = re.sub(r'([\(\[\{])\s+', r'\1', collapsed)
    collapsed = re.sub(r'\s+([\)\]\}])', r'\1', collapsed)
    return collapsed
def cleanup(self):
"""Clean up temporary files and memory"""

170
test_epub_parser.py Normal file
View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Regression tests for EPUB parsing behavior.
"""
from pathlib import Path
from tempfile import TemporaryDirectory
import zipfile
from src.epub_parser import EpubParser
def write_text_file(filePath, content):
    """Create parent directories as needed and write content as UTF-8 text."""
    parentDir = filePath.parent
    parentDir.mkdir(parents=True, exist_ok=True)
    with open(filePath, 'w', encoding='utf-8') as handle:
        handle.write(content)
def build_test_epub(epubPath):
    """Assemble a minimal EPUB fixture for parser regression testing.

    The fixture exercises nav-based TOC resolution with relative hrefs,
    a linear="no" spine item, inline markup inside paragraphs, and
    list/container text extraction.
    """
    with TemporaryDirectory() as stagingDir:
        stagingRoot = Path(stagingDir)

        def stage(relativePath, content):
            # Local helper: make parents and write UTF-8 text under the staging root.
            target = stagingRoot / relativePath
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(content, encoding='utf-8')

        # The mimetype entry must contain exactly this string.
        stage('mimetype', 'application/epub+zip')
        stage(
            'META-INF/container.xml',
            '''<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
'''
        )
        # Package document: three content items plus the nav, with the
        # notes item marked non-linear in the spine.
        stage(
            'OEBPS/content.opf',
            '''<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Parser Regression Book</dc:title>
    <dc:creator>BookStorm Test</dc:creator>
  </metadata>
  <manifest>
    <item id="navdoc" href="nav/toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="chap1" href="text/chapter1.xhtml" media-type="application/xhtml+xml"/>
    <item id="chap2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
    <item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="chap1"/>
    <itemref idref="notes" linear="no"/>
    <itemref idref="chap2"/>
  </spine>
</package>
'''
        )
        # Nav document lives in a sibling folder so TOC hrefs must be
        # resolved relative to the nav location ("../text/...").
        stage(
            'OEBPS/nav/toc.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
  <body>
    <nav epub:type="toc">
      <ol>
        <li><a href="../text/chapter1.xhtml">Chapter One</a></li>
        <li><a href="../text/chapter2.xhtml">Chapter Two</a></li>
      </ol>
    </nav>
  </body>
</html>
'''
        )
        # Inline <em> plus a list: checks spacing preservation and list text.
        stage(
            'OEBPS/text/chapter1.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <section>
      <h1>Visible Heading</h1>
      <p>This is <em>very</em> bad.</p>
      <ul>
        <li>First item</li>
        <li>Second item</li>
      </ul>
    </section>
  </body>
</html>
'''
        )
        # Bare container text alongside nested paragraphs.
        stage(
            'OEBPS/text/chapter2.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <div>Lead in <span>text</span>.</div>
    <div class="section"><p>More text.</p></div>
    <p>Tail text.</p>
  </body>
</html>
'''
        )
        # Non-linear spine item: must be excluded from the parsed book.
        stage(
            'OEBPS/text/notes.xhtml',
            '''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <body>
    <p>This non-linear note should not appear.</p>
  </body>
</html>
'''
        )
        with zipfile.ZipFile(epubPath, 'w') as archive:
            # mimetype must be the first entry and stored uncompressed.
            mimetypePath = stagingRoot / 'mimetype'
            archive.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)
            for filePath in sorted(stagingRoot.rglob('*')):
                if filePath.is_file() and filePath != mimetypePath:
                    archive.write(
                        filePath,
                        filePath.relative_to(stagingRoot).as_posix(),
                        compress_type=zipfile.ZIP_DEFLATED
                    )
def test_epub_parser():
    """End-to-end regression: spine order, TOC titles, and readable text extraction."""
    with TemporaryDirectory() as tempDir:
        fixturePath = Path(tempDir) / 'fixture.epub'
        build_test_epub(fixturePath)
        parser = EpubParser()
        try:
            book = parser.parse(fixturePath)
        finally:
            # Always release the parser's temporary files, even on failure.
            parser.cleanup()
        print("Testing spine-based chapter extraction...")
        assert book.title == 'Parser Regression Book'
        assert book.author == 'BookStorm Test'
        assert book.get_total_chapters() == 2
        chapterTitles = [chapter.title for chapter in book.chapters]
        assert chapterTitles == ['Chapter One', 'Chapter Two']
        print("Chapter extraction tests passed")
        print("\nTesting readable text extraction...")
        firstChapter = book.get_chapter(0)
        secondChapter = book.get_chapter(1)
        assert firstChapter is not None
        assert secondChapter is not None
        expectedFirst = [
            'Visible Heading',
            'This is very bad.',
            'First item',
            'Second item'
        ]
        expectedSecond = [
            'Lead in text.',
            'More text.',
            'Tail text.'
        ]
        assert firstChapter.paragraphs == expectedFirst
        assert secondChapter.paragraphs == expectedSecond
        # The linear="no" notes file must not contribute any paragraph.
        for chapter in book.chapters:
            for paragraph in chapter.paragraphs:
                assert 'non-linear note' not in paragraph.lower()
        print("Readable text tests passed")
        print("\nAll EPUB parser tests passed!")
# Allow running the regression suite directly: python test_epub_parser.py
if __name__ == "__main__":
    test_epub_parser()