Epub now uses spine order for book navigation and headings are included in paragraph navigation.
This commit is contained in:
@@ -56,23 +56,18 @@ class EpubParser:
|
||||
book.title = metadata.get('title', epubPath.stem)
|
||||
book.author = metadata.get('creator', 'Unknown')
|
||||
|
||||
# Try to use TOC structure first
|
||||
# Parse TOC for chapter titles
|
||||
opfDir = opfPath.parent
|
||||
tocChapters = self._parse_toc_structure(tempPath, opfDir, manifest)
|
||||
tocMap = self._build_toc_map(tempPath, opfDir, manifest)
|
||||
|
||||
if tocChapters:
|
||||
# Successfully parsed using TOC
|
||||
for chapter in tocChapters:
|
||||
book.add_chapter(chapter)
|
||||
else:
|
||||
# Fallback: Parse content files in spine order
|
||||
for itemId in spine:
|
||||
if itemId in manifest:
|
||||
contentPath = opfDir / manifest[itemId]
|
||||
if contentPath.exists():
|
||||
chapters = self._parse_content_file(contentPath)
|
||||
for chapter in chapters:
|
||||
book.add_chapter(chapter)
|
||||
# Parse content files in spine order (authoritative reading sequence)
|
||||
for itemId in spine:
|
||||
if itemId in manifest:
|
||||
contentPath = opfDir / manifest[itemId]
|
||||
if contentPath.exists():
|
||||
chapters = self._parse_content_file(contentPath, tocMap)
|
||||
for chapter in chapters:
|
||||
book.add_chapter(chapter)
|
||||
|
||||
return book
|
||||
|
||||
@@ -153,9 +148,9 @@ class EpubParser:
|
||||
|
||||
return metadata, spine, manifest
|
||||
|
||||
def _parse_toc_structure(self, epubDir, opfDir, manifest):
|
||||
def _build_toc_map(self, epubDir, opfDir, manifest):
|
||||
"""
|
||||
Parse TOC structure (NCX or nav.xhtml) to get chapters
|
||||
Build a map of href -> chapter title from TOC
|
||||
|
||||
Args:
|
||||
epubDir: Root EPUB directory
|
||||
@@ -163,23 +158,23 @@ class EpubParser:
|
||||
manifest: Manifest dict from OPF
|
||||
|
||||
Returns:
|
||||
List of Chapter objects or None if TOC not found
|
||||
Dict mapping href (without anchor) to chapter title
|
||||
"""
|
||||
# Try EPUB 3 nav.xhtml first
|
||||
navChapters = self._parse_nav_xhtml(epubDir, opfDir, manifest)
|
||||
if navChapters:
|
||||
return navChapters
|
||||
tocMap = self._parse_nav_xhtml_map(epubDir, opfDir, manifest)
|
||||
if tocMap:
|
||||
return tocMap
|
||||
|
||||
# Try EPUB 2 NCX
|
||||
ncxChapters = self._parse_ncx(epubDir, opfDir, manifest)
|
||||
if ncxChapters:
|
||||
return ncxChapters
|
||||
tocMap = self._parse_ncx_map(epubDir, opfDir, manifest)
|
||||
if tocMap:
|
||||
return tocMap
|
||||
|
||||
return None
|
||||
return {}
|
||||
|
||||
def _parse_nav_xhtml(self, epubDir, opfDir, manifest):
|
||||
def _parse_nav_xhtml_map(self, epubDir, opfDir, manifest):
|
||||
"""
|
||||
Parse EPUB 3 nav.xhtml for TOC structure
|
||||
Parse EPUB 3 nav.xhtml to build href -> title map
|
||||
|
||||
Args:
|
||||
epubDir: Root EPUB directory
|
||||
@@ -187,7 +182,7 @@ class EpubParser:
|
||||
manifest: Manifest dict from OPF
|
||||
|
||||
Returns:
|
||||
List of Chapter objects or None
|
||||
Dict mapping href to chapter title, or None
|
||||
"""
|
||||
# Find nav document in manifest
|
||||
navPath = None
|
||||
@@ -211,8 +206,8 @@ class EpubParser:
|
||||
if not tocNav:
|
||||
return None
|
||||
|
||||
# Extract chapters from nav list
|
||||
chapters = []
|
||||
# Extract href -> title mappings
|
||||
tocMap = {}
|
||||
for link in tocNav.find_all('a', href=True):
|
||||
chapterTitle = link.get_text(strip=True)
|
||||
href = link.get('href')
|
||||
@@ -220,22 +215,20 @@ class EpubParser:
|
||||
if not chapterTitle or not href:
|
||||
continue
|
||||
|
||||
# Extract content from href location
|
||||
paragraphs = self._extract_content_from_href(opfDir, href)
|
||||
if paragraphs:
|
||||
chapter = Chapter(chapterTitle)
|
||||
chapter.paragraphs = paragraphs
|
||||
chapters.append(chapter)
|
||||
# Strip anchor from href
|
||||
hrefFile = href.split('#')[0]
|
||||
if hrefFile:
|
||||
tocMap[hrefFile] = chapterTitle
|
||||
|
||||
return chapters if chapters else None
|
||||
return tocMap if tocMap else None
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing nav.xhtml: {e}")
|
||||
return None
|
||||
|
||||
def _parse_ncx(self, epubDir, opfDir, manifest):
|
||||
def _parse_ncx_map(self, epubDir, opfDir, manifest):
|
||||
"""
|
||||
Parse EPUB 2 NCX file for TOC structure
|
||||
Parse EPUB 2 NCX file to build href -> title map
|
||||
|
||||
Args:
|
||||
epubDir: Root EPUB directory
|
||||
@@ -243,7 +236,7 @@ class EpubParser:
|
||||
manifest: Manifest dict from OPF
|
||||
|
||||
Returns:
|
||||
List of Chapter objects or None
|
||||
Dict mapping href to chapter title, or None
|
||||
"""
|
||||
# Find NCX file in manifest
|
||||
ncxPath = None
|
||||
@@ -266,13 +259,13 @@ class EpubParser:
|
||||
with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
soup = BeautifulSoup(f.read(), features='xml')
|
||||
|
||||
# Find all navPoints (top-level only)
|
||||
# Find all navPoints (including nested)
|
||||
navMap = soup.find('navMap')
|
||||
if not navMap:
|
||||
return None
|
||||
|
||||
chapters = []
|
||||
for navPoint in navMap.find_all('navPoint', recursive=False):
|
||||
tocMap = {}
|
||||
for navPoint in navMap.find_all('navPoint'):
|
||||
# Get chapter title
|
||||
navLabel = navPoint.find('navLabel')
|
||||
if navLabel:
|
||||
@@ -287,78 +280,31 @@ class EpubParser:
|
||||
continue
|
||||
|
||||
href = content.get('src')
|
||||
# Strip anchor from href
|
||||
hrefFile = href.split('#')[0]
|
||||
if hrefFile:
|
||||
tocMap[hrefFile] = chapterTitle
|
||||
|
||||
# Extract content from href location
|
||||
paragraphs = self._extract_content_from_href(opfDir, href)
|
||||
if paragraphs:
|
||||
chapter = Chapter(chapterTitle)
|
||||
chapter.paragraphs = paragraphs
|
||||
chapters.append(chapter)
|
||||
|
||||
return chapters if chapters else None
|
||||
return tocMap if tocMap else None
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing NCX: {e}")
|
||||
return None
|
||||
|
||||
def _extract_content_from_href(self, opfDir, href):
|
||||
"""
|
||||
Extract paragraphs from a specific href location
|
||||
|
||||
Args:
|
||||
opfDir: Directory containing OPF file
|
||||
href: Content href (may include #anchor)
|
||||
|
||||
Returns:
|
||||
List of paragraph strings
|
||||
"""
|
||||
# Split href into file and anchor
|
||||
parts = href.split('#')
|
||||
filePath = opfDir / parts[0]
|
||||
anchor = parts[1] if len(parts) > 1 else None
|
||||
|
||||
if not filePath.exists():
|
||||
return []
|
||||
|
||||
try:
|
||||
with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
soup = BeautifulSoup(f.read(), 'html.parser')
|
||||
|
||||
# If anchor specified, find that element
|
||||
if anchor:
|
||||
section = soup.find(id=anchor)
|
||||
if not section:
|
||||
# Try to find by name attribute
|
||||
section = soup.find(attrs={'name': anchor})
|
||||
if not section:
|
||||
# Fallback to entire body
|
||||
section = soup.find('body') or soup
|
||||
else:
|
||||
section = soup.find('body') or soup
|
||||
|
||||
# Extract paragraphs from section
|
||||
paragraphs = []
|
||||
for p in section.find_all('p'):
|
||||
text = p.get_text(strip=True)
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
|
||||
return paragraphs
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error extracting content from {href}: {e}")
|
||||
return []
|
||||
|
||||
def _parse_content_file(self, contentPath):
|
||||
def _parse_content_file(self, contentPath, tocMap=None):
|
||||
"""
|
||||
Parse XHTML/HTML content file
|
||||
|
||||
Args:
|
||||
contentPath: Path to content file
|
||||
tocMap: Optional dict mapping filename to TOC title
|
||||
|
||||
Returns:
|
||||
List of Chapter objects
|
||||
"""
|
||||
if tocMap is None:
|
||||
tocMap = {}
|
||||
|
||||
try:
|
||||
with open(contentPath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
soup = BeautifulSoup(f.read(), 'html.parser')
|
||||
@@ -368,6 +314,9 @@ class EpubParser:
|
||||
|
||||
chapters = []
|
||||
|
||||
# Check if this file has a TOC title
|
||||
tocTitle = tocMap.get(contentPath.name)
|
||||
|
||||
# Look for main content sections
|
||||
# Try h1, h2, section elements
|
||||
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
|
||||
@@ -376,22 +325,34 @@ class EpubParser:
|
||||
# Fallback: treat entire file as one chapter
|
||||
sections = [soup.find('body') or soup]
|
||||
|
||||
for section in sections:
|
||||
for sectionIndex, section in enumerate(sections):
|
||||
# Find chapter title
|
||||
title = None
|
||||
for hTag in ['h1', 'h2', 'h3']:
|
||||
heading = section.find(hTag)
|
||||
if heading:
|
||||
title = heading.get_text(strip=True)
|
||||
break
|
||||
|
||||
# Priority 1: Use TOC title for the first section if available
|
||||
if sectionIndex == 0 and tocTitle:
|
||||
title = tocTitle
|
||||
else:
|
||||
# Priority 2: Look for heading in content
|
||||
for hTag in ['h1', 'h2', 'h3']:
|
||||
heading = section.find(hTag)
|
||||
if heading:
|
||||
title = heading.get_text(strip=True)
|
||||
break
|
||||
|
||||
# Priority 3: Fallback to filename
|
||||
if not title:
|
||||
title = contentPath.stem
|
||||
if tocTitle:
|
||||
title = tocTitle
|
||||
else:
|
||||
title = contentPath.stem
|
||||
|
||||
# Extract paragraphs
|
||||
# Extract paragraphs (including headings)
|
||||
paragraphs = []
|
||||
for p in section.find_all('p'):
|
||||
text = p.get_text(strip=True)
|
||||
|
||||
# Include all headings and paragraphs in reading order
|
||||
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
|
||||
@@ -401,12 +362,14 @@ class EpubParser:
|
||||
chapter.paragraphs = paragraphs
|
||||
chapters.append(chapter)
|
||||
|
||||
# If no chapters found, extract all paragraphs as one chapter
|
||||
# If no chapters found, extract all content as one chapter
|
||||
if not chapters:
|
||||
title = contentPath.stem
|
||||
title = tocTitle if tocTitle else contentPath.stem
|
||||
paragraphs = []
|
||||
for p in soup.find_all('p'):
|
||||
text = p.get_text(strip=True)
|
||||
|
||||
# Include all headings and paragraphs in reading order
|
||||
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
|
||||
|
||||
192
src/wav_exporter.py
Normal file
192
src/wav_exporter.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
WAV Exporter - Export text books to WAV audio files
|
||||
|
||||
Converts text books (DAISY, EPUB, PDF, TXT) to WAV files split by chapter
|
||||
using piper-tts for speech synthesis.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from src.daisy_parser import DaisyParser
|
||||
from src.epub_parser import EpubParser
|
||||
from src.pdf_parser import PdfParser
|
||||
from src.txt_parser import TxtParser
|
||||
from src.tts_engine import TtsEngine
|
||||
|
||||
|
||||
class WavExporter:
    """Export text books to WAV audio files, one file per chapter"""

    def __init__(self, config):
        """
        Initialize WAV exporter

        Args:
            config: ConfigManager instance; get_reader_engine() and
                get_voice_model() are read from it during export
        """
        self.config = config

    def export(self, bookPath, outputDir=None):
        """
        Export book to WAV files split by chapter

        Args:
            bookPath: Path to book file
            outputDir: Output directory (optional, defaults to ./{bookname}_audio)

        Returns:
            Exit code (0 if at least one chapter was exported, 1 otherwise)
        """
        print(f"Exporting book to WAV: {bookPath}")

        # Parse book using appropriate parser
        bookPath = Path(bookPath)
        parser = self._create_parser(bookPath)
        if not parser:
            return 1

        # Everything after parser creation runs under try/finally so the
        # parser is cleaned up on every exit path (early return or error),
        # not only after a complete run.
        try:
            try:
                book = parser.parse(bookPath)
            except Exception as e:
                print(f"Error parsing book: {e}")
                return 1

            # Determine output directory
            if outputDir is None:
                outputDir = Path(f"./{bookPath.stem}_audio")
            else:
                outputDir = Path(outputDir)

            try:
                outputDir.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                # Keep the exit-code contract instead of raising
                print(f"Error creating output directory: {e}")
                return 1
            print(f"Output directory: {outputDir}")

            # Initialize TTS engine
            tts = self._create_tts_engine()
            if not tts:
                return 1

            voiceModel = self.config.get_voice_model()
            totalChapters = book.get_total_chapters()
            print(f"Using voice: {voiceModel}")
            print(f"Chapters: {totalChapters}")
            print()

            # Export each chapter
            successCount = 0
            for chapterIdx in range(totalChapters):
                chapter = book.get_chapter(chapterIdx)
                if not chapter:
                    continue

                if self._export_chapter(chapter, chapterIdx, totalChapters, tts, outputDir):
                    successCount += 1
        finally:
            self._cleanup_parser(parser)

        if successCount > 0:
            print(f"\nExport complete! {successCount} chapters saved to: {outputDir}")
            return 0

        print("\nExport failed! No chapters were successfully exported.")
        return 1

    def _cleanup_parser(self, parser):
        """
        Call parser.cleanup() without letting a failure escape

        Args:
            parser: Parser instance returned by _create_parser
        """
        # Runs from a finally block: an exception here would replace the
        # exit code export() is about to return, so report it instead.
        try:
            parser.cleanup()
        except Exception as e:
            print(f"Warning: parser cleanup failed: {e}")

    def _create_parser(self, bookPath):
        """
        Create appropriate parser for book format

        Args:
            bookPath: Path to book file

        Returns:
            Parser instance or None if unsupported format
        """
        suffix = bookPath.suffix.lower()

        # File suffix -> parser class
        parserClasses = {
            '.epub': EpubParser,
            '.zip': DaisyParser,
            '.pdf': PdfParser,
            '.txt': TxtParser,
        }

        parserClass = parserClasses.get(suffix)
        if parserClass is None:
            print(f"Error: Unsupported book format: {suffix}")
            print("Supported formats: .epub, .zip (DAISY), .pdf, .txt")
            return None

        return parserClass()

    def _create_tts_engine(self):
        """
        Create TTS engine for export

        Returns:
            TtsEngine instance or None if not available
        """
        # The speechd reader engine is rejected here; export needs piper
        readerEngine = self.config.get_reader_engine()
        if readerEngine == 'speechd':
            print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.")
            return None

        voiceModel = self.config.get_voice_model()
        try:
            return TtsEngine(voiceModel)
        except Exception as e:
            print(f"Error initializing TTS engine: {e}")
            return None

    def _export_chapter(self, chapter, chapterIdx, totalChapters, tts, outputDir):
        """
        Export a single chapter to WAV file

        Args:
            chapter: Chapter object (title and paragraphs are read)
            chapterIdx: Chapter index (0-based)
            totalChapters: Total number of chapters
            tts: TtsEngine instance
            outputDir: Output directory path

        Returns:
            True if successful, False otherwise
        """
        chapterNum = chapterIdx + 1
        print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}")

        try:
            # Combine all paragraphs in chapter
            chapterText = "\n\n".join(chapter.paragraphs)

            # Generate audio
            wavData = tts.text_to_wav_data(chapterText)
            if not wavData:
                print(f" Warning: No audio generated for chapter {chapterNum}")
                return False

            # Fall back to a numbered name when nothing usable is left
            sanitizedTitle = self._sanitize_filename(chapter.title)
            if not sanitizedTitle:
                sanitizedTitle = f"Chapter_{chapterNum}"

            # Save to file
            outputFile = Path(outputDir) / f"{chapterNum:03d}_{sanitizedTitle}.wav"
            outputFile.write_bytes(wavData)

            print(f" Saved: {outputFile.name}")
            return True

        except Exception as e:
            print(f" Error generating audio for chapter {chapterNum}: {e}")
            return False

    def _sanitize_filename(self, title, maxLength=60):
        """
        Sanitize chapter title for use as filename

        Args:
            title: Chapter title (None is treated as an empty title)
            maxLength: Maximum number of characters to keep, or None
                for no limit

        Returns:
            Sanitized filename string; may be empty
        """
        if title is None:
            return ""

        # Keep letters, digits, spaces, hyphens and underscores only
        allowedChars = (' ', '-', '_')
        cleaned = "".join(
            c for c in str(title) if c.isalnum() or c in allowedChars
        ).strip()

        # Cap the length so a very long heading cannot produce an
        # over-long file name
        if maxLength is not None:
            cleaned = cleaned[:maxLength].rstrip()

        return cleaned
|
||||
Reference in New Issue
Block a user