EPUB now uses spine order for book navigation, and headings are included in paragraph navigation.

This commit is contained in:
Storm Dragon
2025-10-19 16:02:19 -04:00
parent a934c06f6f
commit 16e01cb1f5
3 changed files with 272 additions and 220 deletions

View File

@@ -56,23 +56,18 @@ class EpubParser:
book.title = metadata.get('title', epubPath.stem)
book.author = metadata.get('creator', 'Unknown')
# Try to use TOC structure first
# Parse TOC for chapter titles
opfDir = opfPath.parent
tocChapters = self._parse_toc_structure(tempPath, opfDir, manifest)
tocMap = self._build_toc_map(tempPath, opfDir, manifest)
if tocChapters:
# Successfully parsed using TOC
for chapter in tocChapters:
book.add_chapter(chapter)
else:
# Fallback: Parse content files in spine order
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
if contentPath.exists():
chapters = self._parse_content_file(contentPath)
for chapter in chapters:
book.add_chapter(chapter)
# Parse content files in spine order (authoritative reading sequence)
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
if contentPath.exists():
chapters = self._parse_content_file(contentPath, tocMap)
for chapter in chapters:
book.add_chapter(chapter)
return book
@@ -153,9 +148,9 @@ class EpubParser:
return metadata, spine, manifest
def _parse_toc_structure(self, epubDir, opfDir, manifest):
def _build_toc_map(self, epubDir, opfDir, manifest):
"""
Parse TOC structure (NCX or nav.xhtml) to get chapters
Build a map of href -> chapter title from TOC
Args:
epubDir: Root EPUB directory
@@ -163,23 +158,23 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None if TOC not found
Dict mapping href (without anchor) to chapter title
"""
# Try EPUB 3 nav.xhtml first
navChapters = self._parse_nav_xhtml(epubDir, opfDir, manifest)
if navChapters:
return navChapters
tocMap = self._parse_nav_xhtml_map(epubDir, opfDir, manifest)
if tocMap:
return tocMap
# Try EPUB 2 NCX
ncxChapters = self._parse_ncx(epubDir, opfDir, manifest)
if ncxChapters:
return ncxChapters
tocMap = self._parse_ncx_map(epubDir, opfDir, manifest)
if tocMap:
return tocMap
return None
return {}
def _parse_nav_xhtml(self, epubDir, opfDir, manifest):
def _parse_nav_xhtml_map(self, epubDir, opfDir, manifest):
"""
Parse EPUB 3 nav.xhtml for TOC structure
Parse EPUB 3 nav.xhtml to build href -> title map
Args:
epubDir: Root EPUB directory
@@ -187,7 +182,7 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None
Dict mapping href to chapter title, or None
"""
# Find nav document in manifest
navPath = None
@@ -211,8 +206,8 @@ class EpubParser:
if not tocNav:
return None
# Extract chapters from nav list
chapters = []
# Extract href -> title mappings
tocMap = {}
for link in tocNav.find_all('a', href=True):
chapterTitle = link.get_text(strip=True)
href = link.get('href')
@@ -220,22 +215,20 @@ class EpubParser:
if not chapterTitle or not href:
continue
# Extract content from href location
paragraphs = self._extract_content_from_href(opfDir, href)
if paragraphs:
chapter = Chapter(chapterTitle)
chapter.paragraphs = paragraphs
chapters.append(chapter)
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
return chapters if chapters else None
return tocMap if tocMap else None
except Exception as e:
print(f"Error parsing nav.xhtml: {e}")
return None
def _parse_ncx(self, epubDir, opfDir, manifest):
def _parse_ncx_map(self, epubDir, opfDir, manifest):
"""
Parse EPUB 2 NCX file for TOC structure
Parse EPUB 2 NCX file to build href -> title map
Args:
epubDir: Root EPUB directory
@@ -243,7 +236,7 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None
Dict mapping href to chapter title, or None
"""
# Find NCX file in manifest
ncxPath = None
@@ -266,13 +259,13 @@ class EpubParser:
with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), features='xml')
# Find all navPoints (top-level only)
# Find all navPoints (including nested)
navMap = soup.find('navMap')
if not navMap:
return None
chapters = []
for navPoint in navMap.find_all('navPoint', recursive=False):
tocMap = {}
for navPoint in navMap.find_all('navPoint'):
# Get chapter title
navLabel = navPoint.find('navLabel')
if navLabel:
@@ -287,78 +280,31 @@ class EpubParser:
continue
href = content.get('src')
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
# Extract content from href location
paragraphs = self._extract_content_from_href(opfDir, href)
if paragraphs:
chapter = Chapter(chapterTitle)
chapter.paragraphs = paragraphs
chapters.append(chapter)
return chapters if chapters else None
return tocMap if tocMap else None
except Exception as e:
print(f"Error parsing NCX: {e}")
return None
def _extract_content_from_href(self, opfDir, href):
"""
Extract paragraphs from a specific href location
Args:
opfDir: Directory containing OPF file
href: Content href (may include #anchor)
Returns:
List of paragraph strings
"""
# Split href into file and anchor
parts = href.split('#')
filePath = opfDir / parts[0]
anchor = parts[1] if len(parts) > 1 else None
if not filePath.exists():
return []
try:
with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
# If anchor specified, find that element
if anchor:
section = soup.find(id=anchor)
if not section:
# Try to find by name attribute
section = soup.find(attrs={'name': anchor})
if not section:
# Fallback to entire body
section = soup.find('body') or soup
else:
section = soup.find('body') or soup
# Extract paragraphs from section
paragraphs = []
for p in section.find_all('p'):
text = p.get_text(strip=True)
if text:
paragraphs.append(text)
return paragraphs
except Exception as e:
print(f"Error extracting content from {href}: {e}")
return []
def _parse_content_file(self, contentPath):
def _parse_content_file(self, contentPath, tocMap=None):
"""
Parse XHTML/HTML content file
Args:
contentPath: Path to content file
tocMap: Optional dict mapping filename to TOC title
Returns:
List of Chapter objects
"""
if tocMap is None:
tocMap = {}
try:
with open(contentPath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
@@ -368,6 +314,9 @@ class EpubParser:
chapters = []
# Check if this file has a TOC title
tocTitle = tocMap.get(contentPath.name)
# Look for main content sections
# Try h1, h2, section elements
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
@@ -376,22 +325,34 @@ class EpubParser:
# Fallback: treat entire file as one chapter
sections = [soup.find('body') or soup]
for section in sections:
for sectionIndex, section in enumerate(sections):
# Find chapter title
title = None
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
# Priority 1: Use TOC title for the first section if available
if sectionIndex == 0 and tocTitle:
title = tocTitle
else:
# Priority 2: Look for heading in content
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
# Priority 3: Fallback to filename
if not title:
title = contentPath.stem
if tocTitle:
title = tocTitle
else:
title = contentPath.stem
# Extract paragraphs
# Extract paragraphs (including headings)
paragraphs = []
for p in section.find_all('p'):
text = p.get_text(strip=True)
# Include all headings and paragraphs in reading order
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)
@@ -401,12 +362,14 @@ class EpubParser:
chapter.paragraphs = paragraphs
chapters.append(chapter)
# If no chapters found, extract all paragraphs as one chapter
# If no chapters found, extract all content as one chapter
if not chapters:
title = contentPath.stem
title = tocTitle if tocTitle else contentPath.stem
paragraphs = []
for p in soup.find_all('p'):
text = p.get_text(strip=True)
# Include all headings and paragraphs in reading order
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)

192
src/wav_exporter.py Normal file
View File

@@ -0,0 +1,192 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WAV Exporter - Export text books to WAV audio files
Converts text books (DAISY, EPUB, PDF, TXT) to WAV files split by chapter
using piper-tts for speech synthesis.
"""
from pathlib import Path
from src.daisy_parser import DaisyParser
from src.epub_parser import EpubParser
from src.pdf_parser import PdfParser
from src.txt_parser import TxtParser
from src.tts_engine import TtsEngine
class WavExporter:
    """Export text books (DAISY, EPUB, PDF, TXT) to per-chapter WAV files."""

    def __init__(self, config):
        """
        Initialize WAV exporter

        Args:
            config: ConfigManager instance (provides reader_engine and voice_model)
        """
        self.config = config

    def export(self, bookPath, outputDir=None):
        """
        Export book to WAV files split by chapter

        Args:
            bookPath: Path to book file
            outputDir: Output directory (optional, defaults to ./{bookname}_audio)

        Returns:
            Exit code (0 for success, 1 for error)
        """
        print(f"Exporting book to WAV: {bookPath}")
        # Parse book using appropriate parser
        bookPath = Path(bookPath)
        parser = self._create_parser(bookPath)
        if not parser:
            return 1
        try:
            book = parser.parse(bookPath)
        except Exception as e:
            print(f"Error parsing book: {e}")
            return 1
        # From here on the parser may hold temp resources; guarantee cleanup
        # even if TTS setup fails or an unexpected error escapes the loop.
        try:
            # Determine output directory
            if outputDir is None:
                bookName = bookPath.stem
                outputDir = Path(f"./{bookName}_audio")
            else:
                outputDir = Path(outputDir)
            outputDir.mkdir(parents=True, exist_ok=True)
            print(f"Output directory: {outputDir}")
            # Initialize TTS engine
            tts = self._create_tts_engine()
            if not tts:
                return 1
            voiceModel = self.config.get_voice_model()
            print(f"Using voice: {voiceModel}")
            totalChapters = book.get_total_chapters()
            print(f"Chapters: {totalChapters}")
            print()
            # Export each chapter
            successCount = 0
            for chapterIdx in range(totalChapters):
                chapter = book.get_chapter(chapterIdx)
                if not chapter:
                    continue
                if self._export_chapter(chapter, chapterIdx, totalChapters, tts, outputDir):
                    successCount += 1
        finally:
            parser.cleanup()
        if successCount > 0:
            print(f"\nExport complete! {successCount} chapters saved to: {outputDir}")
            return 0
        else:
            print("\nExport failed! No chapters were successfully exported.")
            return 1

    def _create_parser(self, bookPath):
        """
        Create appropriate parser for book format

        Args:
            bookPath: Path to book file

        Returns:
            Parser instance or None if unsupported format
        """
        suffix = bookPath.suffix.lower()
        if suffix == '.epub':
            return EpubParser()
        elif suffix == '.zip':
            # DAISY books are distributed as zip archives
            return DaisyParser()
        elif suffix == '.pdf':
            return PdfParser()
        elif suffix == '.txt':
            return TxtParser()
        else:
            print(f"Error: Unsupported book format: {suffix}")
            print("Supported formats: .epub, .zip (DAISY), .pdf, .txt")
            return None

    def _create_tts_engine(self):
        """
        Create TTS engine for export

        Returns:
            TtsEngine instance or None if not available
        """
        readerEngine = self.config.get_reader_engine()
        # speechd cannot render to WAV data; only piper-tts is supported here
        if readerEngine == 'speechd':
            print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.")
            return None
        voiceModel = self.config.get_voice_model()
        try:
            return TtsEngine(voiceModel)
        except Exception as e:
            print(f"Error initializing TTS engine: {e}")
            return None

    def _export_chapter(self, chapter, chapterIdx, totalChapters, tts, outputDir):
        """
        Export a single chapter to WAV file

        Args:
            chapter: Chapter object
            chapterIdx: Chapter index (0-based)
            totalChapters: Total number of chapters
            tts: TtsEngine instance
            outputDir: Output directory path

        Returns:
            True if successful, False otherwise
        """
        chapterNum = chapterIdx + 1
        print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}")
        # Combine all paragraphs in chapter
        chapterText = "\n\n".join(chapter.paragraphs)
        # Generate audio
        try:
            wavData = tts.text_to_wav_data(chapterText)
            if not wavData:
                print(f" Warning: No audio generated for chapter {chapterNum}")
                return False
            # Save to file; fall back to a numbered name if the title
            # sanitizes to an empty string
            sanitizedTitle = self._sanitize_filename(chapter.title)
            if not sanitizedTitle:
                sanitizedTitle = f"Chapter_{chapterNum}"
            outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav"
            with open(outputFile, 'wb') as f:
                f.write(wavData)
            print(f" Saved: {outputFile.name}")
            return True
        except Exception as e:
            print(f" Error generating audio for chapter {chapterNum}: {e}")
            return False

    def _sanitize_filename(self, title):
        """
        Sanitize chapter title for use as filename

        Keeps only alphanumerics, spaces, hyphens, and underscores, then
        strips surrounding whitespace.

        Args:
            title: Chapter title

        Returns:
            Sanitized filename string (may be empty)
        """
        return "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()