diff --git a/bookstorm.py b/bookstorm.py index 9118a35..6fbf595 100755 --- a/bookstorm.py +++ b/bookstorm.py @@ -50,6 +50,7 @@ from src.audiobookshelf_client import AudiobookshelfClient from src.audiobookshelf_menu import AudiobookshelfMenu from src.server_link_manager import ServerLinkManager from src.bookmarks_menu import BookmarksMenu +from src.wav_exporter import WavExporter class BookReader: @@ -2595,7 +2596,8 @@ def main(): # Handle export mode if args.wav: - return export_to_wav(bookPath, config, args.outputDir) + exporter = WavExporter(config) + return exporter.export(bookPath, args.outputDir) # Interactive reading mode config.set_last_book(bookPath) @@ -2614,110 +2616,5 @@ def main(): return 0 -def export_to_wav(bookPath, config, outputDir=None): - """ - Export book to WAV files split by chapter - - Args: - bookPath: Path to book file - config: ConfigManager instance - outputDir: Output directory (optional) - - Returns: - Exit code - """ - from src.daisy_parser import DaisyParser - from src.epub_parser import EpubParser - from src.pdf_parser import PdfParser - from src.txt_parser import TxtParser - from src.tts_engine import TtsEngine - import wave - - print(f"Exporting book to WAV: {bookPath}") - - # Parse book using appropriate parser - bookPath = Path(bookPath) - suffix = bookPath.suffix.lower() - - if suffix in ['.epub']: - parser = EpubParser() - elif suffix in ['.zip']: - parser = DaisyParser() - elif suffix in ['.pdf']: - parser = PdfParser() - elif suffix in ['.txt']: - parser = TxtParser() - else: - print(f"Error: Unsupported book format: {suffix}") - return 1 - - try: - book = parser.parse(bookPath) - except Exception as e: - print(f"Error parsing book: {e}") - return 1 - - # Determine output directory - if outputDir is None: - bookName = Path(bookPath).stem - outputDir = Path(f"./{bookName}_audio") - else: - outputDir = Path(outputDir) - - outputDir.mkdir(parents=True, exist_ok=True) - print(f"Output directory: {outputDir}") - - # Initialize TTS engine - readerEngine = config.get_reader_engine() - if readerEngine == 'speechd': - print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.") - return 1 - - voiceModel = config.get_voice_model() - tts = TtsEngine(voiceModel) - - print(f"Using voice: {voiceModel}") - print(f"Chapters: {book.get_total_chapters()}") - print() - - # Export each chapter - for chapterIdx in range(book.get_total_chapters()): - chapter = book.get_chapter(chapterIdx) - if not chapter: - continue - - chapterNum = chapterIdx + 1 - print(f"Exporting Chapter {chapterNum}/{book.get_total_chapters()}: {chapter.title}") - - # Combine all paragraphs in chapter - chapterText = "\n\n".join(chapter.paragraphs) - - # Generate audio - try: - wavData = tts.text_to_wav_data(chapterText) - if not wavData: - print(f" Warning: No audio generated for chapter {chapterNum}") - continue - - # Save to file - sanitizedTitle = "".join(c for c in chapter.title if c.isalnum() or c in (' ', '-', '_')).strip() - if not sanitizedTitle: - sanitizedTitle = f"Chapter_{chapterNum}" - - outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav" - with open(outputFile, 'wb') as f: - f.write(wavData) - - print(f" Saved: {outputFile.name}") - - except Exception as e: - print(f" Error generating audio for chapter {chapterNum}: {e}") - continue - - parser.cleanup() - print(f"\nExport complete! Files saved to: {outputDir}") - return 0 - - if __name__ == '__main__': sys.exit(main()) diff --git a/src/epub_parser.py b/src/epub_parser.py index ac03d63..f210cab 100644 --- a/src/epub_parser.py +++ b/src/epub_parser.py @@ -56,23 +56,18 @@ class EpubParser: book.title = metadata.get('title', epubPath.stem) book.author = metadata.get('creator', 'Unknown') - # Try to use TOC structure first + # Parse TOC for chapter titles opfDir = opfPath.parent - tocChapters = self._parse_toc_structure(tempPath, opfDir, manifest) + tocMap = self._build_toc_map(tempPath, opfDir, manifest) - if tocChapters: - # Successfully parsed using TOC - for chapter in tocChapters: - book.add_chapter(chapter) - else: - # Fallback: Parse content files in spine order - for itemId in spine: - if itemId in manifest: - contentPath = opfDir / manifest[itemId] - if contentPath.exists(): - chapters = self._parse_content_file(contentPath) - for chapter in chapters: - book.add_chapter(chapter) + # Parse content files in spine order (authoritative reading sequence) + for itemId in spine: + if itemId in manifest: + contentPath = opfDir / manifest[itemId] + if contentPath.exists(): + chapters = self._parse_content_file(contentPath, tocMap) + for chapter in chapters: + book.add_chapter(chapter) return book @@ -153,9 +148,9 @@ class EpubParser: return metadata, spine, manifest - def _parse_toc_structure(self, epubDir, opfDir, manifest): + def _build_toc_map(self, epubDir, opfDir, manifest): """ - Parse TOC structure (NCX or nav.xhtml) to get chapters + Build a map of href -> chapter title from TOC Args: epubDir: Root EPUB directory @@ -163,23 +158,23 @@ class EpubParser: manifest: Manifest dict from OPF Returns: - List of Chapter objects or None if TOC not found + Dict mapping href (without anchor) to chapter title """ # Try EPUB 3 nav.xhtml first - navChapters = self._parse_nav_xhtml(epubDir, opfDir, manifest) - if navChapters: - return navChapters + tocMap = self._parse_nav_xhtml_map(epubDir, opfDir, manifest) + if tocMap: + return tocMap # Try EPUB 2 NCX - ncxChapters = self._parse_ncx(epubDir, opfDir, manifest) - if ncxChapters: - return ncxChapters + tocMap = self._parse_ncx_map(epubDir, opfDir, manifest) + if tocMap: + return tocMap - return None + return {} - def _parse_nav_xhtml(self, epubDir, opfDir, manifest): + def _parse_nav_xhtml_map(self, epubDir, opfDir, manifest): """ - Parse EPUB 3 nav.xhtml for TOC structure + Parse EPUB 3 nav.xhtml to build href -> title map Args: epubDir: Root EPUB directory @@ -187,7 +182,7 @@ class EpubParser: manifest: Manifest dict from OPF Returns: - List of Chapter objects or None + Dict mapping href to chapter title, or None """ # Find nav document in manifest navPath = None @@ -211,8 +206,8 @@ class EpubParser: if not tocNav: return None - # Extract chapters from nav list - chapters = [] + # Extract href -> title mappings + tocMap = {} for link in tocNav.find_all('a', href=True): chapterTitle = link.get_text(strip=True) href = link.get('href') @@ -220,22 +215,20 @@ class EpubParser: if not chapterTitle or not href: continue - # Extract content from href location - paragraphs = self._extract_content_from_href(opfDir, href) - if paragraphs: - chapter = Chapter(chapterTitle) - chapter.paragraphs = paragraphs - chapters.append(chapter) + # Strip anchor from href + hrefFile = href.split('#')[0] + if hrefFile: + tocMap[hrefFile] = chapterTitle - return chapters if chapters else None + return tocMap if tocMap else None except Exception as e: print(f"Error parsing nav.xhtml: {e}") return None - def _parse_ncx(self, epubDir, opfDir, manifest): + def _parse_ncx_map(self, epubDir, opfDir, manifest): """ - Parse EPUB 2 NCX file for TOC structure + Parse EPUB 2 NCX file to build href -> title map Args: epubDir: Root EPUB directory @@ -243,7 +236,7 @@ class EpubParser: manifest: Manifest dict from OPF Returns: - List of Chapter objects or None + Dict mapping href to chapter title, or None """ # Find NCX file in manifest ncxPath = None @@ -266,13 +259,13 @@ class EpubParser: with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f: soup = BeautifulSoup(f.read(), features='xml') - # Find all navPoints (top-level only) + # Find all navPoints (including nested) navMap = soup.find('navMap') if not navMap: return None - chapters = [] - for navPoint in navMap.find_all('navPoint', recursive=False): + tocMap = {} + for navPoint in navMap.find_all('navPoint'): # Get chapter title navLabel = navPoint.find('navLabel') if navLabel: @@ -287,78 +280,31 @@ class EpubParser: continue href = content.get('src') + # Strip anchor from href + hrefFile = href.split('#')[0] + if hrefFile: + tocMap[hrefFile] = chapterTitle - # Extract content from href location - paragraphs = self._extract_content_from_href(opfDir, href) - if paragraphs: - chapter = Chapter(chapterTitle) - chapter.paragraphs = paragraphs - chapters.append(chapter) - - return chapters if chapters else None + return tocMap if tocMap else None except Exception as e: print(f"Error parsing NCX: {e}") return None - def _extract_content_from_href(self, opfDir, href): - """ - Extract paragraphs from a specific href location - - Args: - opfDir: Directory containing OPF file - href: Content href (may include #anchor) - - Returns: - List of paragraph strings - """ - # Split href into file and anchor - parts = href.split('#') - filePath = opfDir / parts[0] - anchor = parts[1] if len(parts) > 1 else None - - if not filePath.exists(): - return [] - - try: - with open(filePath, 'r', encoding='utf-8', errors='ignore') as f: - soup = BeautifulSoup(f.read(), 'html.parser') - - # If anchor specified, find that element - if anchor: - section = soup.find(id=anchor) - if not section: - # Try to find by name attribute - section = soup.find(attrs={'name': anchor}) - if not section: - # Fallback to entire body - section = soup.find('body') or soup - else: - section = soup.find('body') or soup - - # Extract paragraphs from section - paragraphs = [] - for p in section.find_all('p'): - text = p.get_text(strip=True) - if text: - paragraphs.append(text) - - return paragraphs - - except Exception as e: - print(f"Error extracting content from {href}: {e}") - return [] - - def _parse_content_file(self, contentPath): + def _parse_content_file(self, contentPath, tocMap=None): """ Parse XHTML/HTML content file Args: contentPath: Path to content file + tocMap: Optional dict mapping filename to TOC title Returns: List of Chapter objects """ + if tocMap is None: + tocMap = {} + try: with open(contentPath, 'r', encoding='utf-8', errors='ignore') as f: soup = BeautifulSoup(f.read(), 'html.parser') @@ -368,6 +314,9 @@ class EpubParser: chapters = [] + # Check if this file has a TOC title + tocTitle = tocMap.get(contentPath.name) + # Look for main content sections # Try h1, h2, section elements sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False) @@ -376,22 +325,34 @@ class EpubParser: # Fallback: treat entire file as one chapter sections = [soup.find('body') or soup] - for section in sections: + for sectionIndex, section in enumerate(sections): # Find chapter title title = None - for hTag in ['h1', 'h2', 'h3']: - heading = section.find(hTag) - if heading: - title = heading.get_text(strip=True) - break + # Priority 1: Use TOC title for the first section if available + if sectionIndex == 0 and tocTitle: + title = tocTitle + else: + # Priority 2: Look for heading in content + for hTag in ['h1', 'h2', 'h3']: + heading = section.find(hTag) + if heading: + title = heading.get_text(strip=True) + break + + # Priority 3: Fallback to filename if not title: - title = contentPath.stem + if tocTitle: + title = tocTitle + else: + title = contentPath.stem - # Extract paragraphs + # Extract paragraphs (including headings) paragraphs = [] - for p in section.find_all('p'): - text = p.get_text(strip=True) + + # Include all headings and paragraphs in reading order + for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']): + text = element.get_text(strip=True) if text: paragraphs.append(text) @@ -401,12 +362,14 @@ class EpubParser: chapter.paragraphs = paragraphs chapters.append(chapter) - # If no chapters found, extract all paragraphs as one chapter + # If no chapters found, extract all content as one chapter if not chapters: - title = contentPath.stem + title = tocTitle if tocTitle else contentPath.stem paragraphs = [] - for p in soup.find_all('p'): - text = p.get_text(strip=True) + + # Include all headings and paragraphs in reading order + for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']): + text = element.get_text(strip=True) if text: paragraphs.append(text) diff --git a/src/wav_exporter.py b/src/wav_exporter.py new file mode 100644 index 0000000..07936cb --- /dev/null +++ b/src/wav_exporter.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +WAV Exporter - Export text books to WAV audio files + +Converts text books (DAISY, EPUB, PDF, TXT) to WAV files split by chapter +using piper-tts for speech synthesis. +""" + +from pathlib import Path + +from src.daisy_parser import DaisyParser +from src.epub_parser import EpubParser +from src.pdf_parser import PdfParser +from src.txt_parser import TxtParser +from src.tts_engine import TtsEngine + + +class WavExporter: + """Export text books to WAV audio files""" + + def __init__(self, config): + """ + Initialize WAV exporter + + Args: + config: ConfigManager instance + """ + self.config = config + + def export(self, bookPath, outputDir=None): + """ + Export book to WAV files split by chapter + + Args: + bookPath: Path to book file + outputDir: Output directory (optional, defaults to ./{bookname}_audio) + + Returns: + Exit code (0 for success, 1 for error) + """ + print(f"Exporting book to WAV: {bookPath}") + + # Parse book using appropriate parser + bookPath = Path(bookPath) + parser = self._create_parser(bookPath) + if not parser: + return 1 + + try: + book = parser.parse(bookPath) + except Exception as e: + print(f"Error parsing book: {e}") + return 1 + + # Determine output directory + if outputDir is None: + bookName = bookPath.stem + outputDir = Path(f"./{bookName}_audio") + else: + outputDir = Path(outputDir) + + outputDir.mkdir(parents=True, exist_ok=True) + print(f"Output directory: {outputDir}") + + # Initialize TTS engine + tts = self._create_tts_engine() + if not tts: + return 1 + + voiceModel = self.config.get_voice_model() + print(f"Using voice: {voiceModel}") + print(f"Chapters: {book.get_total_chapters()}") + print() + + # Export each chapter + successCount = 0 + for chapterIdx in range(book.get_total_chapters()): + chapter = book.get_chapter(chapterIdx) + if not chapter: + continue + + if self._export_chapter(chapter, chapterIdx, book.get_total_chapters(), tts, outputDir): + successCount += 1 + + parser.cleanup() + + if successCount > 0: + print(f"\nExport complete! {successCount} chapters saved to: {outputDir}") + return 0 + else: + print("\nExport failed! No chapters were successfully exported.") + return 1 + + def _create_parser(self, bookPath): + """ + Create appropriate parser for book format + + Args: + bookPath: Path to book file + + Returns: + Parser instance or None if unsupported format + """ + suffix = bookPath.suffix.lower() + + if suffix in ['.epub']: + return EpubParser() + elif suffix in ['.zip']: + return DaisyParser() + elif suffix in ['.pdf']: + return PdfParser() + elif suffix in ['.txt']: + return TxtParser() + else: + print(f"Error: Unsupported book format: {suffix}") + print("Supported formats: .epub, .zip (DAISY), .pdf, .txt") + return None + + def _create_tts_engine(self): + """ + Create TTS engine for export + + Returns: + TtsEngine instance or None if not available + """ + readerEngine = self.config.get_reader_engine() + if readerEngine == 'speechd': + print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.") + return None + + voiceModel = self.config.get_voice_model() + try: + return TtsEngine(voiceModel) + except Exception as e: + print(f"Error initializing TTS engine: {e}") + return None + + def _export_chapter(self, chapter, chapterIdx, totalChapters, tts, outputDir): + """ + Export a single chapter to WAV file + + Args: + chapter: Chapter object + chapterIdx: Chapter index (0-based) + totalChapters: Total number of chapters + tts: TtsEngine instance + outputDir: Output directory path + + Returns: + True if successful, False otherwise + """ + chapterNum = chapterIdx + 1 + print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}") + + # Combine all paragraphs in chapter + chapterText = "\n\n".join(chapter.paragraphs) + + # Generate audio + try: + wavData = tts.text_to_wav_data(chapterText) + if not wavData: + print(f" Warning: No audio generated for chapter {chapterNum}") + return False + + # Save to file + sanitizedTitle = self._sanitize_filename(chapter.title) + if not sanitizedTitle: + sanitizedTitle = f"Chapter_{chapterNum}" + + outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav" + with open(outputFile, 'wb') as f: + f.write(wavData) + + print(f" Saved: {outputFile.name}") + return True + + except Exception as e: + print(f" Error generating audio for chapter {chapterNum}: {e}") + return False + + def _sanitize_filename(self, title): + """ + Sanitize chapter title for use as filename + + Args: + title: Chapter title + + Returns: + Sanitized filename string + """ + return "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()