diff --git a/src/epub_parser.py b/src/epub_parser.py
index 0c9335d..a599b4e 100644
--- a/src/epub_parser.py
+++ b/src/epub_parser.py
@@ -10,8 +10,10 @@ EPUB files are ZIP archives containing XHTML/HTML content.
import zipfile
import tempfile
import shutil
+import re
from pathlib import Path
-from bs4 import BeautifulSoup
+from urllib.parse import unquote
+from bs4 import BeautifulSoup, Tag, NavigableString
from src.book import Book, Chapter
@@ -63,10 +65,10 @@ class EpubParser:
# Parse content files in spine order (authoritative reading sequence)
for itemId in spine:
if itemId in manifest:
- contentPath = opfDir / manifest[itemId]
+ contentPath = (opfDir / unquote(manifest[itemId])).resolve()
if contentPath.exists():
- chapters = self._parse_content_file(contentPath, tocMap)
- for chapter in chapters:
+ chapter = self._parse_content_file(contentPath, tocMap)
+ if chapter:
book.add_chapter(chapter)
return book
@@ -143,7 +145,8 @@ class EpubParser:
if spineTag:
for itemref in spineTag.find_all('itemref'):
idref = itemref.get('idref')
- if idref:
+ linear = itemref.get('linear', 'yes').lower()
+ if idref and linear != 'no':
spine.append(idref)
return metadata, spine, manifest
@@ -215,10 +218,9 @@ class EpubParser:
if not chapterTitle or not href:
continue
- # Strip anchor from href
- hrefFile = href.split('#')[0]
- if hrefFile:
- tocMap[hrefFile] = chapterTitle
+ contentKey = self._normalize_content_key(navPath.parent, href)
+ if contentKey:
+ tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -280,10 +282,9 @@ class EpubParser:
continue
href = content.get('src')
- # Strip anchor from href
- hrefFile = href.split('#')[0]
- if hrefFile:
- tocMap[hrefFile] = chapterTitle
+ contentKey = self._normalize_content_key(ncxPath.parent, href)
+ if contentKey:
+ tocMap[contentKey] = chapterTitle
return tocMap if tocMap else None
@@ -300,7 +301,7 @@ class EpubParser:
tocMap: Optional dict mapping filename to TOC title
Returns:
- List of Chapter objects
+ Chapter object, or None if the file has no readable text
"""
if tocMap is None:
tocMap = {}
@@ -310,75 +311,164 @@ class EpubParser:
soup = BeautifulSoup(f.read(), 'html.parser')
except Exception as e:
print(f"Error reading content file {contentPath}: {e}")
- return []
+ return None
- chapters = []
+ bodyTag = soup.find('body') or soup
+ self._remove_non_readable_elements(bodyTag)
- # Check if this file has a TOC title
- tocTitle = tocMap.get(contentPath.name)
+ paragraphs = self._extract_paragraphs(bodyTag)
+ if not paragraphs:
+ return None
- # Look for main content sections
- # Try h1, h2, section elements
- sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
+ title = self._resolve_chapter_title(contentPath, bodyTag, tocMap)
- if not sections:
- # Fallback: treat entire file as one chapter
- sections = [soup.find('body') or soup]
+ chapter = Chapter(title)
+ chapter.paragraphs = paragraphs
+ return chapter
- for sectionIndex, section in enumerate(sections):
- # Find chapter title
- title = None
+ def _normalize_content_key(self, baseDir, href):
+ """Normalize TOC hrefs and manifest paths to a comparable absolute key"""
+ hrefFile = unquote(href.split('#')[0].strip())
+ if not hrefFile:
+ return None
- # Priority 1: Use TOC title for the first section if available
- if sectionIndex == 0 and tocTitle:
- title = tocTitle
- else:
- # Priority 2: Look for heading in content
- for hTag in ['h1', 'h2', 'h3']:
- heading = section.find(hTag)
- if heading:
- title = heading.get_text(strip=True)
- break
+ return str((baseDir / hrefFile).resolve())
- # Priority 3: Fallback to filename
- if not title:
- if tocTitle:
- title = tocTitle
- else:
- title = contentPath.stem
+ def _resolve_chapter_title(self, contentPath, bodyTag, tocMap):
+ """Resolve the best title for a spine item"""
+ tocTitle = tocMap.get(str(contentPath.resolve()))
+ if tocTitle:
+ return tocTitle
- # Extract paragraphs (including headings)
- paragraphs = []
+ for headingName in ['h1', 'h2', 'h3']:
+ heading = bodyTag.find(headingName)
+ if heading:
+ headingText = self._extract_tag_text(heading)
+ if headingText:
+ return headingText
- # Include all headings and paragraphs in reading order
- for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
- text = element.get_text(strip=True)
+ return contentPath.stem
+
+ def _remove_non_readable_elements(self, rootTag):
+ """Remove tags that should not contribute spoken text"""
+ for element in rootTag.find_all(['script', 'style', 'noscript']):
+ element.decompose()
+
+ def _extract_paragraphs(self, rootTag):
+ """Extract readable text blocks from XHTML in document order"""
+ paragraphs = []
+ pendingInlineText = []
+ blockTags = {
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ 'p', 'li', 'blockquote', 'pre', 'figcaption',
+ 'caption', 'td', 'th', 'dd', 'dt', 'address'
+ }
+ containerTags = {
+ 'body', 'section', 'article', 'main', 'div',
+ 'aside', 'nav', 'header', 'footer'
+ }
+ structuredTags = blockTags | containerTags | {
+ 'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'
+ }
+
+ for child in rootTag.children:
+ self._collect_paragraphs(
+ child,
+ paragraphs,
+ pendingInlineText,
+ blockTags,
+ containerTags,
+ structuredTags
+ )
+
+ self._flush_inline_text(paragraphs, pendingInlineText)
+ return paragraphs
+
+ def _collect_paragraphs(self, node, paragraphs, pendingInlineText, blockTags, containerTags, structuredTags):
+ """Walk the DOM tree and collect readable blocks without duplicating nested content"""
+ if isinstance(node, NavigableString):
+ text = self._normalize_text(str(node))
+ if text:
+ pendingInlineText.append(text)
+ return
+
+ if not isinstance(node, Tag):
+ return
+
+ if node.name == 'br':
+ self._flush_inline_text(paragraphs, pendingInlineText)
+ return
+
+ if node.name in blockTags:
+ self._flush_inline_text(paragraphs, pendingInlineText)
+ text = self._extract_tag_text(node)
+ if text:
+ paragraphs.append(text)
+ return
+
+ if node.name in containerTags or node.name in {'ul', 'ol', 'dl', 'table', 'thead', 'tbody', 'tfoot', 'tr'}:
+ if not self._has_structured_children(node, structuredTags):
+ self._flush_inline_text(paragraphs, pendingInlineText)
+ text = self._extract_tag_text(node)
if text:
paragraphs.append(text)
+ return
- # Only add chapter if it has content
- if paragraphs:
- chapter = Chapter(title)
- chapter.paragraphs = paragraphs
- chapters.append(chapter)
+ for child in node.children:
+ self._collect_paragraphs(
+ child,
+ paragraphs,
+ pendingInlineText,
+ blockTags,
+ containerTags,
+ structuredTags
+ )
- # If no chapters found, extract all content as one chapter
- if not chapters:
- title = tocTitle if tocTitle else contentPath.stem
- paragraphs = []
+ if node.name in containerTags:
+ self._flush_inline_text(paragraphs, pendingInlineText)
+ return
- # Include all headings and paragraphs in reading order
- for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
- text = element.get_text(strip=True)
- if text:
- paragraphs.append(text)
+ for child in node.children:
+ self._collect_paragraphs(
+ child,
+ paragraphs,
+ pendingInlineText,
+ blockTags,
+ containerTags,
+ structuredTags
+ )
- if paragraphs:
- chapter = Chapter(title)
- chapter.paragraphs = paragraphs
- chapters.append(chapter)
+ def _has_structured_children(self, node, structuredTags):
+ """Return True when a container has nested structural elements to recurse into"""
+ for child in node.children:
+ if isinstance(child, Tag) and child.name in structuredTags:
+ return True
+ return False
- return chapters
+ def _extract_tag_text(self, tag):
+ """Extract normalized text from a tag while preserving inline spacing"""
+ return self._normalize_text(tag.get_text(' ', strip=True))
+
+ def _flush_inline_text(self, paragraphs, pendingInlineText):
+ """Convert accumulated inline text into a paragraph"""
+ if not pendingInlineText:
+ return
+
+ text = self._normalize_text(' '.join(pendingInlineText))
+ pendingInlineText.clear()
+ if text:
+ paragraphs.append(text)
+
+ def _normalize_text(self, text):
+ """Collapse whitespace while keeping natural word boundaries for TTS"""
+ text = re.sub(r'\s+', ' ', text).strip()
+ if not text:
+ return ''
+
+ text = re.sub(r'\s+([,.;:!?])', r'\1', text)
+ text = re.sub(r'([\(\[\{])\s+', r'\1', text)
+ text = re.sub(r'\s+([\)\]\}])', r'\1', text)
+ return text
def cleanup(self):
"""Clean up temporary files and memory"""
diff --git a/test_epub_parser.py b/test_epub_parser.py
new file mode 100644
index 0000000..517955d
--- /dev/null
+++ b/test_epub_parser.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Regression tests for EPUB parsing behavior.
+"""
+
+from pathlib import Path
+from tempfile import TemporaryDirectory
+import zipfile
+
+from src.epub_parser import EpubParser
+
+
+def write_text_file(filePath, content):
+    """Write a UTF-8 text file"""
+    filePath.parent.mkdir(parents=True, exist_ok=True)
+    filePath.write_text(content, encoding='utf-8')
+
+
+def build_test_epub(epubPath):
+    """Build a minimal EPUB fixture for parser regression testing"""
+    with TemporaryDirectory() as tempDir:
+        tempPath = Path(tempDir)
+
+        write_text_file(
+            tempPath / 'mimetype',
+            'application/epub+zip'
+        )
+        write_text_file(
+            tempPath / 'META-INF' / 'container.xml',
+            '''<?xml version="1.0" encoding="UTF-8"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+    <rootfiles>
+        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+    </rootfiles>
+</container>
+'''
+        )
+        write_text_file(
+            tempPath / 'OEBPS' / 'content.opf',
+            '''<?xml version="1.0" encoding="UTF-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="bookid">
+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+        <dc:title>Parser Regression Book</dc:title>
+        <dc:creator>BookStorm Test</dc:creator>
+        <dc:identifier id="bookid">regression-0001</dc:identifier>
+        <dc:language>en</dc:language>
+    </metadata>
+    <manifest>
+        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
+        <item id="chapter1" href="text/chapter%201.xhtml" media-type="application/xhtml+xml"/>
+        <item id="chapter2" href="text/chapter2.xhtml" media-type="application/xhtml+xml"/>
+        <item id="notes" href="text/notes.xhtml" media-type="application/xhtml+xml"/>
+    </manifest>
+    <spine toc="ncx">
+        <itemref idref="chapter1"/>
+        <itemref idref="chapter2"/>
+        <itemref idref="notes" linear="no"/>
+    </spine>
+</package>
+'''
+        )
+        write_text_file(
+            tempPath / 'OEBPS' / 'toc.ncx',
+            '''<?xml version="1.0" encoding="UTF-8"?>
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+    <head>
+        <meta name="dtb:uid" content="regression-0001"/>
+    </head>
+    <docTitle><text>Parser Regression Book</text></docTitle>
+    <navMap>
+        <navPoint id="nav1" playOrder="1">
+            <navLabel><text>Chapter One</text></navLabel>
+            <content src="text/chapter%201.xhtml#start"/>
+        </navPoint>
+        <navPoint id="nav2" playOrder="2">
+            <navLabel><text>Chapter Two</text></navLabel>
+            <content src="text/chapter2.xhtml"/>
+        </navPoint>
+    </navMap>
+</ncx>
+'''
+        )
+        write_text_file(
+            tempPath / 'OEBPS' / 'text' / 'chapter 1.xhtml',
+            '''<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+    <title>Chapter One</title>
+    <style>p { color: red; }</style>
+</head>
+<body>
+    <h1>Visible Heading</h1>
+    <script>var hidden = "script text must not be read aloud";</script>
+    <p>This is <em>very</em> bad.</p>
+    <ul>
+        <li>First item</li>
+        <li>Second item</li>
+    </ul>
+</body>
+</html>
+'''
+        )
+        write_text_file(
+            tempPath / 'OEBPS' / 'text' / 'chapter2.xhtml',
+            '''<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+    <div>
+        Lead in text.<br/>
+        <p>More text.</p>
+        Tail text.
+    </div>
+</body>
+</html>
+'''
+        )
+        write_text_file(
+            tempPath / 'OEBPS' / 'text' / 'notes.xhtml',
+            '''<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+    <p>This non-linear note should not appear.</p>
+</body>
+</html>
+'''
+        )
+
+        with zipfile.ZipFile(epubPath, 'w') as zipRef:
+            mimetypePath = tempPath / 'mimetype'
+            zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED)
+
+            for filePath in sorted(tempPath.rglob('*')):
+                if filePath.is_file() and filePath != mimetypePath:
+                    archivePath = filePath.relative_to(tempPath).as_posix()
+                    zipRef.write(filePath, archivePath, compress_type=zipfile.ZIP_DEFLATED)
+
+
+def test_epub_parser():
+    """Verify EPUB parsing follows linear spine order and preserves readable text"""
+    with TemporaryDirectory() as tempDir:
+        epubPath = Path(tempDir) / 'fixture.epub'
+        build_test_epub(epubPath)
+
+        parser = EpubParser()
+        try:
+            book = parser.parse(epubPath)
+        finally:
+            parser.cleanup()
+
+        print("Testing spine-based chapter extraction...")
+        assert book.title == 'Parser Regression Book'
+        assert book.author == 'BookStorm Test'
+        assert book.get_total_chapters() == 2
+        assert [chapter.title for chapter in book.chapters] == ['Chapter One', 'Chapter Two']
+        print("Chapter extraction tests passed")
+
+        print("\nTesting readable text extraction...")
+        firstChapter = book.get_chapter(0)
+        secondChapter = book.get_chapter(1)
+
+        assert firstChapter is not None
+        assert secondChapter is not None
+
+        assert firstChapter.paragraphs == [
+            'Visible Heading',
+            'This is very bad.',
+            'First item',
+            'Second item'
+        ]
+        assert secondChapter.paragraphs == [
+            'Lead in text.',
+            'More text.',
+            'Tail text.'
+        ]
+        assert all('non-linear note' not in paragraph.lower() for chapter in book.chapters for paragraph in chapter.paragraphs)
+        print("Readable text tests passed")
+
+        print("\nAll EPUB parser tests passed!")
+
+
+if __name__ == "__main__":
+    test_epub_parser()
+ + +''' + ) + + with zipfile.ZipFile(epubPath, 'w') as zipRef: + mimetypePath = tempPath / 'mimetype' + zipRef.write(mimetypePath, 'mimetype', compress_type=zipfile.ZIP_STORED) + + for filePath in sorted(tempPath.rglob('*')): + if filePath.is_file() and filePath != mimetypePath: + archivePath = filePath.relative_to(tempPath).as_posix() + zipRef.write(filePath, archivePath, compress_type=zipfile.ZIP_DEFLATED) + + +def test_epub_parser(): + """Verify EPUB parsing follows linear spine order and preserves readable text""" + with TemporaryDirectory() as tempDir: + epubPath = Path(tempDir) / 'fixture.epub' + build_test_epub(epubPath) + + parser = EpubParser() + try: + book = parser.parse(epubPath) + finally: + parser.cleanup() + + print("Testing spine-based chapter extraction...") + assert book.title == 'Parser Regression Book' + assert book.author == 'BookStorm Test' + assert book.get_total_chapters() == 2 + assert [chapter.title for chapter in book.chapters] == ['Chapter One', 'Chapter Two'] + print("Chapter extraction tests passed") + + print("\nTesting readable text extraction...") + firstChapter = book.get_chapter(0) + secondChapter = book.get_chapter(1) + + assert firstChapter is not None + assert secondChapter is not None + + assert firstChapter.paragraphs == [ + 'Visible Heading', + 'This is very bad.', + 'First item', + 'Second item' + ] + assert secondChapter.paragraphs == [ + 'Lead in text.', + 'More text.', + 'Tail text.' + ] + assert all('non-linear note' not in paragraph.lower() for chapter in book.chapters for paragraph in chapter.paragraphs) + print("Readable text tests passed") + + print("\nAll EPUB parser tests passed!") + + +if __name__ == "__main__": + test_epub_parser()