EPUB now uses spine order for book navigation, and headings are included in paragraph navigation.

This commit is contained in:
Storm Dragon
2025-10-19 16:02:19 -04:00
parent a934c06f6f
commit 16e01cb1f5
3 changed files with 272 additions and 220 deletions

View File

@@ -56,23 +56,18 @@ class EpubParser:
book.title = metadata.get('title', epubPath.stem)
book.author = metadata.get('creator', 'Unknown')
# Try to use TOC structure first
# Parse TOC for chapter titles
opfDir = opfPath.parent
tocChapters = self._parse_toc_structure(tempPath, opfDir, manifest)
tocMap = self._build_toc_map(tempPath, opfDir, manifest)
if tocChapters:
# Successfully parsed using TOC
for chapter in tocChapters:
book.add_chapter(chapter)
else:
# Fallback: Parse content files in spine order
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
if contentPath.exists():
chapters = self._parse_content_file(contentPath)
for chapter in chapters:
book.add_chapter(chapter)
# Parse content files in spine order (authoritative reading sequence)
for itemId in spine:
if itemId in manifest:
contentPath = opfDir / manifest[itemId]
if contentPath.exists():
chapters = self._parse_content_file(contentPath, tocMap)
for chapter in chapters:
book.add_chapter(chapter)
return book
@@ -153,9 +148,9 @@ class EpubParser:
return metadata, spine, manifest
def _parse_toc_structure(self, epubDir, opfDir, manifest):
def _build_toc_map(self, epubDir, opfDir, manifest):
"""
Parse TOC structure (NCX or nav.xhtml) to get chapters
Build a map of href -> chapter title from TOC
Args:
epubDir: Root EPUB directory
@@ -163,23 +158,23 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None if TOC not found
Dict mapping href (without anchor) to chapter title
"""
# Try EPUB 3 nav.xhtml first
navChapters = self._parse_nav_xhtml(epubDir, opfDir, manifest)
if navChapters:
return navChapters
tocMap = self._parse_nav_xhtml_map(epubDir, opfDir, manifest)
if tocMap:
return tocMap
# Try EPUB 2 NCX
ncxChapters = self._parse_ncx(epubDir, opfDir, manifest)
if ncxChapters:
return ncxChapters
tocMap = self._parse_ncx_map(epubDir, opfDir, manifest)
if tocMap:
return tocMap
return None
return {}
def _parse_nav_xhtml(self, epubDir, opfDir, manifest):
def _parse_nav_xhtml_map(self, epubDir, opfDir, manifest):
"""
Parse EPUB 3 nav.xhtml for TOC structure
Parse EPUB 3 nav.xhtml to build href -> title map
Args:
epubDir: Root EPUB directory
@@ -187,7 +182,7 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None
Dict mapping href to chapter title, or None
"""
# Find nav document in manifest
navPath = None
@@ -211,8 +206,8 @@ class EpubParser:
if not tocNav:
return None
# Extract chapters from nav list
chapters = []
# Extract href -> title mappings
tocMap = {}
for link in tocNav.find_all('a', href=True):
chapterTitle = link.get_text(strip=True)
href = link.get('href')
@@ -220,22 +215,20 @@ class EpubParser:
if not chapterTitle or not href:
continue
# Extract content from href location
paragraphs = self._extract_content_from_href(opfDir, href)
if paragraphs:
chapter = Chapter(chapterTitle)
chapter.paragraphs = paragraphs
chapters.append(chapter)
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
return chapters if chapters else None
return tocMap if tocMap else None
except Exception as e:
print(f"Error parsing nav.xhtml: {e}")
return None
def _parse_ncx(self, epubDir, opfDir, manifest):
def _parse_ncx_map(self, epubDir, opfDir, manifest):
"""
Parse EPUB 2 NCX file for TOC structure
Parse EPUB 2 NCX file to build href -> title map
Args:
epubDir: Root EPUB directory
@@ -243,7 +236,7 @@ class EpubParser:
manifest: Manifest dict from OPF
Returns:
List of Chapter objects or None
Dict mapping href to chapter title, or None
"""
# Find NCX file in manifest
ncxPath = None
@@ -266,13 +259,13 @@ class EpubParser:
with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), features='xml')
# Find all navPoints (top-level only)
# Find all navPoints (including nested)
navMap = soup.find('navMap')
if not navMap:
return None
chapters = []
for navPoint in navMap.find_all('navPoint', recursive=False):
tocMap = {}
for navPoint in navMap.find_all('navPoint'):
# Get chapter title
navLabel = navPoint.find('navLabel')
if navLabel:
@@ -287,78 +280,31 @@ class EpubParser:
continue
href = content.get('src')
# Strip anchor from href
hrefFile = href.split('#')[0]
if hrefFile:
tocMap[hrefFile] = chapterTitle
# Extract content from href location
paragraphs = self._extract_content_from_href(opfDir, href)
if paragraphs:
chapter = Chapter(chapterTitle)
chapter.paragraphs = paragraphs
chapters.append(chapter)
return chapters if chapters else None
return tocMap if tocMap else None
except Exception as e:
print(f"Error parsing NCX: {e}")
return None
def _extract_content_from_href(self, opfDir, href):
"""
Extract paragraphs from a specific href location
Args:
opfDir: Directory containing OPF file
href: Content href (may include #anchor)
Returns:
List of paragraph strings
"""
# Split href into file and anchor
parts = href.split('#')
filePath = opfDir / parts[0]
anchor = parts[1] if len(parts) > 1 else None
if not filePath.exists():
return []
try:
with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
# If anchor specified, find that element
if anchor:
section = soup.find(id=anchor)
if not section:
# Try to find by name attribute
section = soup.find(attrs={'name': anchor})
if not section:
# Fallback to entire body
section = soup.find('body') or soup
else:
section = soup.find('body') or soup
# Extract paragraphs from section
paragraphs = []
for p in section.find_all('p'):
text = p.get_text(strip=True)
if text:
paragraphs.append(text)
return paragraphs
except Exception as e:
print(f"Error extracting content from {href}: {e}")
return []
def _parse_content_file(self, contentPath):
def _parse_content_file(self, contentPath, tocMap=None):
"""
Parse XHTML/HTML content file
Args:
contentPath: Path to content file
tocMap: Optional dict mapping filename to TOC title
Returns:
List of Chapter objects
"""
if tocMap is None:
tocMap = {}
try:
with open(contentPath, 'r', encoding='utf-8', errors='ignore') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
@@ -368,6 +314,9 @@ class EpubParser:
chapters = []
# Check if this file has a TOC title
tocTitle = tocMap.get(contentPath.name)
# Look for main content sections
# Try h1, h2, section elements
sections = soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower() if x else False)
@@ -376,22 +325,34 @@ class EpubParser:
# Fallback: treat entire file as one chapter
sections = [soup.find('body') or soup]
for section in sections:
for sectionIndex, section in enumerate(sections):
# Find chapter title
title = None
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
# Priority 1: Use TOC title for the first section if available
if sectionIndex == 0 and tocTitle:
title = tocTitle
else:
# Priority 2: Look for heading in content
for hTag in ['h1', 'h2', 'h3']:
heading = section.find(hTag)
if heading:
title = heading.get_text(strip=True)
break
# Priority 3: Fallback to filename
if not title:
title = contentPath.stem
if tocTitle:
title = tocTitle
else:
title = contentPath.stem
# Extract paragraphs
# Extract paragraphs (including headings)
paragraphs = []
for p in section.find_all('p'):
text = p.get_text(strip=True)
# Include all headings and paragraphs in reading order
for element in section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)
@@ -401,12 +362,14 @@ class EpubParser:
chapter.paragraphs = paragraphs
chapters.append(chapter)
# If no chapters found, extract all paragraphs as one chapter
# If no chapters found, extract all content as one chapter
if not chapters:
title = contentPath.stem
title = tocTitle if tocTitle else contentPath.stem
paragraphs = []
for p in soup.find_all('p'):
text = p.get_text(strip=True)
# Include all headings and paragraphs in reading order
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
text = element.get_text(strip=True)
if text:
paragraphs.append(text)

192
src/wav_exporter.py Normal file
View File

@@ -0,0 +1,192 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WAV Exporter - Export text books to WAV audio files
Converts text books (DAISY, EPUB, PDF, TXT) to WAV files split by chapter
using piper-tts for speech synthesis.
"""
from pathlib import Path
from src.daisy_parser import DaisyParser
from src.epub_parser import EpubParser
from src.pdf_parser import PdfParser
from src.txt_parser import TxtParser
from src.tts_engine import TtsEngine
class WavExporter:
    """Export text books (DAISY, EPUB, PDF, TXT) to per-chapter WAV files."""

    def __init__(self, config):
        """
        Initialize WAV exporter

        Args:
            config: ConfigManager instance (provides reader_engine and voice_model)
        """
        self.config = config

    def export(self, bookPath, outputDir=None):
        """
        Export book to WAV files split by chapter

        Args:
            bookPath: Path to book file
            outputDir: Output directory (optional, defaults to ./{bookname}_audio)

        Returns:
            Exit code (0 for success, 1 for error)
        """
        print(f"Exporting book to WAV: {bookPath}")
        # Parse book using appropriate parser
        bookPath = Path(bookPath)
        parser = self._create_parser(bookPath)
        if not parser:
            return 1
        try:
            book = parser.parse(bookPath)
        except Exception as e:
            print(f"Error parsing book: {e}")
            return 1
        # From here on the parser may hold temp resources; guarantee cleanup
        # even if TTS setup fails or an unexpected error escapes the loop.
        try:
            # Determine output directory
            if outputDir is None:
                bookName = bookPath.stem
                outputDir = Path(f"./{bookName}_audio")
            else:
                outputDir = Path(outputDir)
            outputDir.mkdir(parents=True, exist_ok=True)
            print(f"Output directory: {outputDir}")
            # Initialize TTS engine
            tts = self._create_tts_engine()
            if not tts:
                return 1
            voiceModel = self.config.get_voice_model()
            print(f"Using voice: {voiceModel}")
            totalChapters = book.get_total_chapters()
            print(f"Chapters: {totalChapters}")
            print()
            # Export each chapter
            successCount = 0
            for chapterIdx in range(totalChapters):
                chapter = book.get_chapter(chapterIdx)
                if not chapter:
                    continue
                if self._export_chapter(chapter, chapterIdx, totalChapters, tts, outputDir):
                    successCount += 1
        finally:
            parser.cleanup()
        if successCount > 0:
            print(f"\nExport complete! {successCount} chapters saved to: {outputDir}")
            return 0
        else:
            print("\nExport failed! No chapters were successfully exported.")
            return 1

    def _create_parser(self, bookPath):
        """
        Create appropriate parser for book format

        Args:
            bookPath: Path to book file

        Returns:
            Parser instance or None if unsupported format
        """
        suffix = bookPath.suffix.lower()
        if suffix == '.epub':
            return EpubParser()
        elif suffix == '.zip':
            # DAISY books are distributed as zip archives
            return DaisyParser()
        elif suffix == '.pdf':
            return PdfParser()
        elif suffix == '.txt':
            return TxtParser()
        else:
            print(f"Error: Unsupported book format: {suffix}")
            print("Supported formats: .epub, .zip (DAISY), .pdf, .txt")
            return None

    def _create_tts_engine(self):
        """
        Create TTS engine for export

        Returns:
            TtsEngine instance or None if not available
        """
        readerEngine = self.config.get_reader_engine()
        # speechd cannot render to WAV data; only piper-tts is supported here
        if readerEngine == 'speechd':
            print("Error: WAV export requires piper-tts. Set reader_engine=piper in config.")
            return None
        voiceModel = self.config.get_voice_model()
        try:
            return TtsEngine(voiceModel)
        except Exception as e:
            print(f"Error initializing TTS engine: {e}")
            return None

    def _export_chapter(self, chapter, chapterIdx, totalChapters, tts, outputDir):
        """
        Export a single chapter to WAV file

        Args:
            chapter: Chapter object
            chapterIdx: Chapter index (0-based)
            totalChapters: Total number of chapters
            tts: TtsEngine instance
            outputDir: Output directory path

        Returns:
            True if successful, False otherwise
        """
        chapterNum = chapterIdx + 1
        print(f"Exporting Chapter {chapterNum}/{totalChapters}: {chapter.title}")
        # Combine all paragraphs in chapter
        chapterText = "\n\n".join(chapter.paragraphs)
        # Generate audio
        try:
            wavData = tts.text_to_wav_data(chapterText)
            if not wavData:
                print(f" Warning: No audio generated for chapter {chapterNum}")
                return False
            # Save to file; fall back to a numbered name if the title
            # sanitizes to an empty string
            sanitizedTitle = self._sanitize_filename(chapter.title)
            if not sanitizedTitle:
                sanitizedTitle = f"Chapter_{chapterNum}"
            outputFile = outputDir / f"{chapterNum:03d}_{sanitizedTitle}.wav"
            with open(outputFile, 'wb') as f:
                f.write(wavData)
            print(f" Saved: {outputFile.name}")
            return True
        except Exception as e:
            print(f" Error generating audio for chapter {chapterNum}: {e}")
            return False

    def _sanitize_filename(self, title):
        """
        Sanitize chapter title for use as filename

        Keeps only alphanumerics, spaces, hyphens, and underscores, then
        strips surrounding whitespace.

        Args:
            title: Chapter title

        Returns:
            Sanitized filename string (may be empty)
        """
        return "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()