EPUB parsing now uses spine order for book navigation, and headings are included in paragraph navigation.

2025-10-19 16:02:19 -04:00
parent a934c06f6f
commit 16e01cb1f5
3 changed files with 272 additions and 220 deletions
@@ -50,6 +50,7 @@ from src.audiobookshelf_client import AudiobookshelfClient
 from src.audiobookshelf_menu import AudiobookshelfMenu
 from src.server_link_manager import ServerLinkManager
 from src.bookmarks_menu import BookmarksMenu
 from src.wav_exporter import WavExporter
 class BookReader:
@@ -2595,7 +2596,8 @@ def main():
    # Handle export mode
    if args.wav:
-        return export_to_wav(bookPath, config, args.outputDir)
+        exporter = WavExporter(config)
        return exporter.export(bookPath, args.outputDir)
    # Interactive reading mode
    config.set_last_book(bookPath)
@@ -2614,110 +2616,5 @@ def main():
    return 0
 def export_to_wav(bookPath, config, outputDir=None):
    """
    Export book to WAV files split by chapter

    The parser is chosen from the file extension, each chapter's paragraphs
    are joined and synthesized with TtsEngine, and one WAV file per chapter
    is written to the output directory.

    Args:
        bookPath: Path to book file
        config: ConfigManager instance
        outputDir: Output directory (optional, defaults to ./{bookname}_audio)
    Returns:
        Exit code (0 if at least one chapter was written, 1 otherwise)
    """
    from src.daisy_parser import DaisyParser
    from src.epub_parser import EpubParser
    from src.pdf_parser import PdfParser
    from src.txt_parser import TxtParser
    from src.tts_engine import TtsEngine

    print(f"Exporting book to WAV: {bookPath}")
    # Parse book using appropriate parser
    bookPath = Path(bookPath)
    suffix = bookPath.suffix.lower()
    if suffix == '.epub':
        parser = EpubParser()
    elif suffix == '.zip':
        parser = DaisyParser()
    elif suffix == '.pdf':
        parser = PdfParser()
    elif suffix == '.txt':
        parser = TxtParser()
    else:
        print(f"Error: Unsupported book format: {suffix}")
        return 1
    try:
        book = parser.parse(bookPath)
    except Exception as e:
        print(f"Error parsing book: {e}")
        return 1

    # Everything after a successful parse runs under try/finally so that
    # parser.cleanup() is reached on every exit path, including early returns.
    successCount = 0
    try:
        # Determine output directory
        if outputDir is None:
            outputDir = Path(f"./{bookPath.stem}_audio")
        else:
            outputDir = Path(outputDir)
        outputDir.mkdir(parents=True, exist_ok=True)
        print(f"Output directory: {outputDir}")
        # Initialize TTS engine
        readerEngine = config.get_reader_engine()
        if readerEngine == 'speechd':
            print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.")
            return 1
        voiceModel = config.get_voice_model()
        tts = TtsEngine(voiceModel)
        totalChapters = book.get_total_chapters()
        print(f"Using voice: {voiceModel}")
        print(f"Chapters: {totalChapters}")
        print()
        # Export each chapter
        for chapterIdx in range(totalChapters):
            chapter = book.get_chapter(chapterIdx)
            if not chapter:
                continue
            chapterNum = chapterIdx + 1
            print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}")
            # Combine all paragraphs in chapter
            chapterText = "\n\n".join(chapter.paragraphs)
            # Generate audio; a failing chapter is reported and skipped so the
            # remaining chapters are still exported
            try:
                wavData = tts.text_to_wav_data(chapterText)
                if not wavData:
                    print(f"  Warning: No audio generated for chapter {chapterNum}")
                    continue
                # Keep only filesystem-friendly characters from the title
                sanitizedTitle = "".join(
                    c for c in chapter.title if c.isalnum() or c in (' ', '-', '_')
                ).strip()
                if not sanitizedTitle:
                    sanitizedTitle = f"Chapter_{chapterNum}"
                outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav"
                with open(outputFile, 'wb') as f:
                    f.write(wavData)
                print(f"  Saved: {outputFile.name}")
                successCount += 1
            except Exception as e:
                print(f"  Error generating audio for chapter {chapterNum}: {e}")
                continue
    finally:
        parser.cleanup()

    # Report failure instead of "complete" when nothing was written
    if successCount == 0:
        print("\nExport failed! No chapters were successfully exported.")
        return 1
    print(f"\nExport complete! Files saved to: {outputDir}")
    return 0
 if __name__ == '__main__':
    sys.exit(main())
@@ -56,23 +56,18 @@ class EpubParser:
            book.title = metadata.get('title', epubPath.stem)
            book.author = metadata.get('creator', 'Unknown')
-            # Try to use TOC structure first
+            # Parse TOC for chapter titles
            opfDir = opfPath.parent
-            tocChapters = self._parse_toc_structure(tempPath, opfDir, manifest)
+            tocMap = self._build_toc_map(tempPath, opfDir, manifest)
-            if tocChapters:
+            # Parse content files in spine order (authoritative reading sequence)
-                # Successfully parsed using TOC
+            for itemId in spine:
-                for chapter in tocChapters:
+                if itemId in manifest:
-                    book.add_chapter(chapter)
+                    contentPath = opfDir / manifest[itemId]
-            else:
+                    if contentPath.exists():
-                # Fallback: Parse content files in spine order
+                        chapters = self._parse_content_file(contentPath, tocMap)
-                for itemId in spine:
+                        for chapter in chapters:
-                    if itemId in manifest:
+                            book.add_chapter(chapter)
                        contentPath = opfDir / manifest[itemId]
                        if contentPath.exists():
                            chapters = self._parse_content_file(contentPath)
                            for chapter in chapters:
                                book.add_chapter(chapter)
            return book
@@ -153,9 +148,9 @@ class EpubParser:
        return metadata, spine, manifest
-    def _parse_toc_structure(self, epubDir, opfDir, manifest):
+    def _build_toc_map(self, epubDir, opfDir, manifest):
        """
-        Parse TOC structure (NCX or nav.xhtml) to get chapters
+        Build a map of href -> chapter title from TOC
        Args:
            epubDir: Root EPUB directory
@@ -163,23 +158,23 @@ class EpubParser:
            manifest: Manifest dict from OPF
        Returns:
-            List of Chapter objects or None if TOC not found
+            Dict mapping href (without anchor) to chapter title
        """
        # Try EPUB 3 nav.xhtml first
-        navChapters = self._parse_nav_xhtml(epubDir, opfDir, manifest)
+        tocMap = self._parse_nav_xhtml_map(epubDir, opfDir, manifest)
-        if navChapters:
+        if tocMap:
-            return navChapters
+            return tocMap
        # Try EPUB 2 NCX
-        ncxChapters = self._parse_ncx(epubDir, opfDir, manifest)
+        tocMap = self._parse_ncx_map(epubDir, opfDir, manifest)
-        if ncxChapters:
+        if tocMap:
-            return ncxChapters
+            return tocMap
-        return None
+        return {}
-    def _parse_nav_xhtml(self, epubDir, opfDir, manifest):
+    def _parse_nav_xhtml_map(self, epubDir, opfDir, manifest):
        """
-        Parse EPUB 3 nav.xhtml for TOC structure
+        Parse EPUB 3 nav.xhtml to build href -> title map
        Args:
            epubDir: Root EPUB directory
@@ -187,7 +182,7 @@ class EpubParser:
            manifest: Manifest dict from OPF
        Returns:
-            List of Chapter objects or None
+            Dict mapping href to chapter title, or None
        """
        # Find nav document in manifest
        navPath = None
@@ -211,8 +206,8 @@ class EpubParser:
            if not tocNav:
                return None
-            # Extract chapters from nav list
+            # Extract href -> title mappings
-            chapters = []
+            tocMap = {}
            for link in tocNav.find_all('a', href=True):
                chapterTitle = link.get_text(strip=True)
                href = link.get('href')
@@ -220,22 +215,20 @@ class EpubParser:
                if not chapterTitle or not href:
                    continue
-                # Extract content from href location
+                # Strip anchor from href
-                paragraphs = self._extract_content_from_href(opfDir, href)
+                hrefFile = href.split('#')[0]
-                if paragraphs:
+                if hrefFile:
-                    chapter = Chapter(chapterTitle)
+                    tocMap[hrefFile] = chapterTitle
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)
-            return chapters if chapters else None
+            return tocMap if tocMap else None
        except Exception as e:
            print(f"Error parsing nav.xhtml: {e}")
            return None
-    def _parse_ncx(self, epubDir, opfDir, manifest):
+    def _parse_ncx_map(self, epubDir, opfDir, manifest):
        """
-        Parse EPUB 2 NCX file for TOC structure
+        Parse EPUB 2 NCX file to build href -> title map
        Args:
            epubDir: Root EPUB directory
@@ -243,7 +236,7 @@ class EpubParser:
            manifest: Manifest dict from OPF
        Returns:
-            List of Chapter objects or None
+            Dict mapping href to chapter title, or None
        """
        # Find NCX file in manifest
        ncxPath = None
@@ -266,13 +259,13 @@ class EpubParser:
            with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
                soup = BeautifulSoup(f.read(), features='xml')
-            # Find all navPoints (top-level only)
+            # Find all navPoints (including nested)
            navMap = soup.find('navMap')
            if not navMap:
                return None
-            chapters = []
+            tocMap = {}
-            for navPoint in navMap.find_all('navPoint', recursive=False):
+            for navPoint in navMap.find_all('navPoint'):
                # Get chapter title
                navLabel = navPoint.find('navLabel')
                if navLabel:
@@ -287,78 +280,31 @@ class EpubParser:
                    continue
                href = content.get('src')
                # Strip anchor from href
                hrefFile = href.split('#')[0]
                if hrefFile:
                    tocMap[hrefFile] = chapterTitle
-                # Extract content from href location
+            return tocMap if tocMap else None
                paragraphs = self._extract_content_from_href(opfDir, href)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)
            return chapters if chapters else None
        except Exception as e:
            print(f"Error parsing NCX: {e}")
            return None
-    def _extract_content_from_href(self, opfDir, href):
+    def _parse_content_file(self, contentPath, tocMap=None):
        """
        Extract paragraphs from a specific href location
        Args:
            opfDir: Directory containing OPF file
            href: Content href (may include #anchor)
        Returns:
            List of paragraph strings
        """
        # Split href into file and anchor
        parts = href.split('#')
        filePath = opfDir / parts[0]
        anchor = parts[1] if len(parts) > 1 else None
        if not filePath.exists():
            return []
        try:
            with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
            # If anchor specified, find that element
            if anchor:
                section = soup.find(id=anchor)
                if not section:
                    # Try to find by name attribute
                    section = soup.find(attrs={'name': anchor})
                if not section:
                    # Fallback to entire body
                    section = soup.find('body') or soup
            else:
                section = soup.find('body') or soup
            # Extract paragraphs from section
            paragraphs = []
            for p in section.find_all('p'):
                text = p.get_text(strip=True)
                if text:
                    paragraphs.append(text)
            return paragraphs
        except Exception as e:
            print(f"Error extracting content from {href}: {e}")
            return []
    def _parse_content_file(self, contentPath):
        """
        Parse XHTML/HTML content file
        Args:
            contentPath: Path to content file
            tocMap: Optional dict mapping filename to TOC title
        Returns:
            List of Chapter objects
        """
        if tocMap is None:
            tocMap = {}
        try:
            with open(contentPath, 'r', encoding='utf-8', errors='ignore') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
@@ -368,6 +314,9 @@ class EpubParser:
        chapters = []
        # Check if this file has a TOC title
        tocTitle = tocMap.get(contentPath.name)
        # Look for main content sections
        # Try h1, h2, section elements
        sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
@@ -376,22 +325,34 @@ class EpubParser:
            # Fallback: treat entire file as one chapter
            sections = [soup.find('body') or soup]
-        for section in sections:
+        for sectionIndex, section in enumerate(sections):
            # Find chapter title
            title = None
            for hTag in ['h1', 'h2', 'h3']:
                heading = section.find(hTag)
                if heading:
                    title = heading.get_text(strip=True)
                    break
            # Priority 1: Use TOC title for the first section if available
            if sectionIndex == 0 and tocTitle:
                title = tocTitle
            else:
                # Priority 2: Look for heading in content
                for hTag in ['h1', 'h2', 'h3']:
                    heading = section.find(hTag)
                    if heading:
                        title = heading.get_text(strip=True)
                        break
            # Priority 3: Fallback to filename
            if not title:
-                title = contentPath.stem
+                if tocTitle:
                    title = tocTitle
                else:
                    title = contentPath.stem
-            # Extract paragraphs
+            # Extract paragraphs (including headings)
            paragraphs = []
-            for p in section.find_all('p'):
+
-                text = p.get_text(strip=True)
+            # Include all headings and paragraphs in reading order
            for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                text = element.get_text(strip=True)
                if text:
                    paragraphs.append(text)
@@ -401,12 +362,14 @@ class EpubParser:
                chapter.paragraphs = paragraphs
                chapters.append(chapter)
-        # If no chapters found, extract all paragraphs as one chapter
+        # If no chapters found, extract all content as one chapter
        if not chapters:
-            title = contentPath.stem
+            title = tocTitle if tocTitle else contentPath.stem
            paragraphs = []
-            for p in soup.find_all('p'):
+
-                text = p.get_text(strip=True)
+            # Include all headings and paragraphs in reading order
            for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                text = element.get_text(strip=True)
                if text:
                    paragraphs.append(text)
@@ -0,0 +1,192 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 WAV Exporter - Export text books to WAV audio files
 Converts text books (DAISY, EPUB, PDF, TXT) to WAV files split by chapter
 using piper-tts for speech synthesis.
 """
 from pathlib import Path
 from src.daisy_parser import DaisyParser
 from src.epub_parser import EpubParser
 from src.pdf_parser import PdfParser
 from src.txt_parser import TxtParser
 from src.tts_engine import TtsEngine
 class WavExporter:
    """
    Export text books to WAV audio files

    A parser is selected from the book's file extension, each chapter's
    paragraphs are joined and synthesized through TtsEngine, and one WAV
    file per chapter is written to the output directory.
    """

    # Upper bound, in UTF-8 bytes, for the title portion of an output file
    # name. Common filesystems limit a single name to 255 bytes; this leaves
    # room for the "NNN_" prefix and the ".wav" suffix.
    MAX_TITLE_BYTES = 200

    def __init__(self, config):
        """
        Initialize WAV exporter
        Args:
            config: ConfigManager instance
        """
        self.config = config

    def export(self, bookPath, outputDir=None):
        """
        Export book to WAV files split by chapter
        Args:
            bookPath: Path to book file
            outputDir: Output directory (optional, defaults to ./{bookname}_audio)
        Returns:
            Exit code (0 for success, 1 for error)
        """
        print(f"Exporting book to WAV: {bookPath}")
        # Parse book using appropriate parser
        bookPath = Path(bookPath)
        parser = self._create_parser(bookPath)
        if not parser:
            return 1
        try:
            book = parser.parse(bookPath)
        except Exception as e:
            print(f"Error parsing book: {e}")
            return 1

        # Everything after a successful parse runs under try/finally so that
        # parser.cleanup() is reached on every exit path, including the early
        # return when no TTS engine is available.
        successCount = 0
        try:
            # Determine output directory
            if outputDir is None:
                outputDir = Path(f"./{bookPath.stem}_audio")
            else:
                outputDir = Path(outputDir)
            outputDir.mkdir(parents=True, exist_ok=True)
            print(f"Output directory: {outputDir}")
            # Initialize TTS engine
            tts = self._create_tts_engine()
            if not tts:
                return 1
            voiceModel = self.config.get_voice_model()
            totalChapters = book.get_total_chapters()
            print(f"Using voice: {voiceModel}")
            print(f"Chapters: {totalChapters}")
            print()
            # Export each chapter
            for chapterIdx in range(totalChapters):
                chapter = book.get_chapter(chapterIdx)
                if not chapter:
                    continue
                if self._export_chapter(chapter, chapterIdx, totalChapters, tts, outputDir):
                    successCount += 1
        finally:
            parser.cleanup()

        if successCount > 0:
            print(f"\nExport complete! {successCount} chapters saved to: {outputDir}")
            return 0
        print("\nExport failed! No chapters were successfully exported.")
        return 1

    def _create_parser(self, bookPath):
        """
        Create appropriate parser for book format
        Args:
            bookPath: Path to book file
        Returns:
            Parser instance or None if unsupported format
        """
        suffix = bookPath.suffix.lower()
        if suffix == '.epub':
            return EpubParser()
        if suffix == '.zip':
            return DaisyParser()
        if suffix == '.pdf':
            return PdfParser()
        if suffix == '.txt':
            return TxtParser()
        print(f"Error: Unsupported book format: {suffix}")
        print("Supported formats: .epub, .zip (DAISY), .pdf, .txt")
        return None

    def _create_tts_engine(self):
        """
        Create TTS engine for export
        Returns:
            TtsEngine instance or None if not available
        """
        readerEngine = self.config.get_reader_engine()
        if readerEngine == 'speechd':
            print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.")
            return None
        voiceModel = self.config.get_voice_model()
        try:
            return TtsEngine(voiceModel)
        except Exception as e:
            print(f"Error initializing TTS engine: {e}")
            return None

    def _export_chapter(self, chapter, chapterIdx, totalChapters, tts, outputDir):
        """
        Export a single chapter to WAV file
        Args:
            chapter: Chapter object
            chapterIdx: Chapter index (0-based)
            totalChapters: Total number of chapters
            tts: TtsEngine instance
            outputDir: Output directory path
        Returns:
            True if successful, False otherwise
        """
        chapterNum = chapterIdx + 1
        print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}")
        # Combine all paragraphs in chapter
        chapterText = "\n\n".join(chapter.paragraphs)
        # Generate audio; any failure is reported and turned into False so the
        # caller can continue with the remaining chapters
        try:
            wavData = tts.text_to_wav_data(chapterText)
            if not wavData:
                print(f"  Warning: No audio generated for chapter {chapterNum}")
                return False
            # Save to file, falling back to a generic name when the title
            # contains no usable characters
            sanitizedTitle = self._sanitize_filename(chapter.title)
            if not sanitizedTitle:
                sanitizedTitle = f"Chapter_{chapterNum}"
            outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav"
            with open(outputFile, 'wb') as f:
                f.write(wavData)
            print(f"  Saved: {outputFile.name}")
            return True
        except Exception as e:
            print(f"  Error generating audio for chapter {chapterNum}: {e}")
            return False

    def _sanitize_filename(self, title):
        """
        Sanitize chapter title for use as filename

        Keeps alphanumeric characters, spaces, hyphens and underscores, then
        limits the result to MAX_TITLE_BYTES bytes of UTF-8 so that a very
        long title cannot make the output file name exceed the filesystem's
        per-name limit.

        Args:
            title: Chapter title (None is treated as an empty title)
        Returns:
            Sanitized filename string (may be empty)
        """
        if not title:
            return ""
        cleaned = "".join(
            c for c in str(title) if c.isalnum() or c in (' ', '-', '_')
        ).strip()
        # Truncate on a byte budget; errors='ignore' drops a multi-byte
        # character that the cut would otherwise split in half
        encoded = cleaned.encode('utf-8')
        if len(encoded) > self.MAX_TITLE_BYTES:
            cleaned = encoded[:self.MAX_TITLE_BYTES].decode('utf-8', errors='ignore').strip()
        return cleaned