Daisy support improved, now supports audio.

2025-10-23 16:06:04 -04:00
parent e23877f688
commit 105c83a941
2 changed files with 603 additions and 5 deletions
--- a/src/daisy_audio_parser.py
+++ b/src/daisy_audio_parser.py
@@ -0,0 +1,558 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 DAISY Audio Book Parser
 Handles parsing of DAISY 2.02 and DAISY 3 audio-only books.
 Extracts audio segments from SMIL files for playback.
 """
 import zipfile
 import tempfile
 import shutil
 from pathlib import Path
 from bs4 import BeautifulSoup
 import re
 import os
 from src.audio_parser import AudioBook, AudioChapter
 class DaisyAudioParser:
    """Parser for DAISY audio-only books"""
    def __init__(self):
        self.tempDir = None
        self.basePath = None
    def parse(self, daisyPath):
        """
        Parse a DAISY audio book (zip file)
        Args:
            daisyPath: Path to DAISY zip file
        Returns:
            Book object with audio file information
        """
        daisyPath = Path(daisyPath)
        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")
        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_audio_")
        self.basePath = Path(self.tempDir)
        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(self.basePath)
            # Find the actual DAISY directory (might be in a subdirectory)
            daisyDir = self._find_daisy_directory(self.basePath)
            if not daisyDir:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
            # Detect DAISY version and parse accordingly
            if (daisyDir / "ncc.html").exists():
                book = self._parse_daisy2_audio(daisyDir)
            elif (daisyDir / "navigation.ncx").exists() or list(daisyDir.glob("*.ncx")):
                book = self._parse_daisy3_audio(daisyDir)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
            # Store temp directory path for cleanup later
            book.tempDir = self.tempDir
            return book
        except Exception as e:
            # Ensure cleanup on error
            self.cleanup()
            raise e
    def _find_daisy_directory(self, basePath):
        """
        Find the actual DAISY directory within extracted files.
        Some DAISY books are packaged with a subdirectory inside the zip.
        Args:
            basePath: Path to extracted directory
        Returns:
            Path to directory containing ncc.html or navigation.ncx, or None if not found
        """
        # Check if ncc.html or .ncx is at root level
        if (basePath / "ncc.html").exists():
            return basePath
        if (basePath / "navigation.ncx").exists() or list(basePath.glob("*.ncx")):
            return basePath
        # Check subdirectories (only one level deep)
        for item in basePath.iterdir():
            if item.is_dir():
                if (item / "ncc.html").exists():
                    return item
                if (item / "navigation.ncx").exists() or list(item.glob("*.ncx")):
                    return item
        return None
    def _parse_daisy2_audio(self, basePath):
        """
        Build an AudioBook from a DAISY 2.02 directory (ncc.html + SMIL)
        Every <h1> in ncc.html that links to an existing SMIL file becomes one
        chapter; chapters whose SMIL yields no audio or no positive duration
        are left out.
        Args:
            basePath: Directory containing ncc.html
        Returns:
            AudioBook with one chapter (and one audio file) per usable heading
        """
        with open(basePath / "ncc.html", 'r', encoding='utf-8', errors='ignore') as nccFile:
            nccSoup = BeautifulSoup(nccFile.read(), 'html.parser')
        # Book title comes from the <title> element when there is one
        titleElement = nccSoup.find('title')
        if titleElement:
            bookTitle = titleElement.get_text().strip()
        else:
            bookTitle = "Unknown Title"
        chapterList = []
        elapsed = 0.0
        # Only top-level headings (h1) are treated as chapters
        for h1 in nccSoup.find_all('h1'):
            headingText = h1.get_text().strip()
            anchor = h1.find('a')
            if not anchor or not anchor.get('href'):
                continue
            # Drop the "#fragment" part so only the SMIL file name remains
            smilPath = basePath / anchor.get('href').split('#')[0]
            if not smilPath.exists():
                continue
            length, audioFile = self._process_smil_chapter(smilPath, basePath)
            if not (audioFile and length > 0):
                continue
            entry = AudioChapter(
                title=headingText,
                startTime=elapsed,
                duration=length
            )
            # Each chapter remembers the (possibly merged) file that plays it
            entry.audioPath = audioFile
            chapterList.append(entry)
            elapsed += length
        book = AudioBook(title=bookTitle, author="Unknown")
        book.totalDuration = elapsed
        for entry in chapterList:
            book.add_chapter(entry)
        # Playlist of chapter audio files, in chapter order
        book.audioFiles = [entry.audioPath for entry in chapterList]
        book.isMultiFile = True
        return book
    def _parse_daisy3_audio(self, basePath):
        """
        Build an AudioBook from a DAISY 3 directory (NCX + SMIL)
        Every navPoint directly under navMap that points at an existing SMIL
        file becomes one chapter; chapters whose SMIL yields no audio or no
        positive duration are left out.
        Args:
            basePath: Directory containing the .ncx file
        Returns:
            AudioBook with one chapter (and one audio file) per usable navPoint
        Raises:
            ValueError: If the NCX has no navMap element
        """
        # Use the first .ncx file found; fall back to the conventional name
        ncxPath = next(iter(basePath.glob("*.ncx")), basePath / "navigation.ncx")
        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as ncxFile:
            ncxSoup = BeautifulSoup(ncxFile.read(), features='xml')
        # Book title lives in <docTitle><text>
        docTitle = ncxSoup.find('docTitle')
        docTitleText = docTitle.find('text') if docTitle else None
        bookTitle = docTitleText.get_text().strip() if docTitleText else "Unknown Title"
        navMap = ncxSoup.find('navMap')
        if not navMap:
            raise ValueError("No navMap found in NCX")
        chapterList = []
        elapsed = 0.0
        # recursive=False keeps nested navPoints from becoming chapters of their own
        for point in navMap.find_all('navPoint', recursive=False):
            label = point.find('navLabel')
            labelText = label.find('text') if label else None
            pointTitle = labelText.get_text().strip() if labelText else "Untitled"
            target = point.find('content')
            if not target or not target.get('src'):
                continue
            # Drop the "#fragment" part so only the SMIL file name remains
            smilPath = basePath / target.get('src').split('#')[0]
            if not smilPath.exists():
                continue
            length, audioFile = self._process_smil_chapter(smilPath, basePath)
            if not (audioFile and length > 0):
                continue
            entry = AudioChapter(
                title=pointTitle,
                startTime=elapsed,
                duration=length
            )
            # Each chapter remembers the (possibly merged) file that plays it
            entry.audioPath = audioFile
            chapterList.append(entry)
            elapsed += length
        book = AudioBook(title=bookTitle, author="Unknown")
        book.totalDuration = elapsed
        for entry in chapterList:
            book.add_chapter(entry)
        # Playlist of chapter audio files, in chapter order
        book.audioFiles = [entry.audioPath for entry in chapterList]
        book.isMultiFile = True
        return book
    def _process_smil_chapter(self, smilPath, basePath):
        """
        Process a SMIL file and extract/merge audio segments for a chapter
        Every <audio> element whose ``src`` resolves to an existing file is
        collected as a segment. A segment's duration is its clip end (or the
        file's duration when no end is given) minus its clip begin (or 0 when
        no begin is given); a negative result counts as 0.
        Args:
            smilPath: Path to SMIL file
            basePath: Base directory for resolving relative paths
        Returns:
            Tuple of (duration, audio_file_path)
            - duration: Total duration of chapter in seconds
            - audio_file_path: Path to audio file (merged if needed), or None
              when no usable audio was found or ffmpeg processing failed
        """
        with open(smilPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')
        # Collect all audio segments
        audioSegments = []
        totalDuration = 0.0
        for audio in soup.find_all('audio'):
            src = audio.get('src')
            if not src:
                continue
            # Resolve audio file path; references to missing files are skipped
            audioPath = basePath / src
            if not audioPath.exists():
                continue
            resolvedPath = str(audioPath.resolve())
            # The attribute is looked up under both spellings:
            # "clip-begin"/"clip-end" and "clipBegin"/"clipEnd"
            clipBegin = audio.get('clip-begin') or audio.get('clipBegin')
            clipEnd = audio.get('clip-end') or audio.get('clipEnd')
            beginTime = self._parse_npt_time(clipBegin) if clipBegin else None
            endTime = self._parse_npt_time(clipEnd) if clipEnd else None
            # A missing begin means the clip starts at the top of the file
            startOffset = beginTime if beginTime is not None else 0.0
            if endTime is not None:
                segmentDuration = endTime - startOffset
            else:
                # No end given: the clip runs to the end of the file, so the
                # begin offset still has to be subtracted from the file length
                segmentDuration = self._get_audio_duration(resolvedPath) - startOffset
            # An inverted clip or an unknown file length must not shrink the chapter total
            totalDuration += max(segmentDuration, 0.0)
            audioSegments.append({
                'file': resolvedPath,
                'clip_begin': beginTime,
                'clip_end': endTime
            })
        if not audioSegments:
            return (0.0, None)
        if len(audioSegments) > 1:
            # Multiple segments - need to merge them
            return (totalDuration, self._merge_audio_segments(audioSegments))
        segment = audioSegments[0]
        if segment['clip_begin'] is None and segment['clip_end'] is None:
            # Whole file, no processing needed
            return (totalDuration, segment['file'])
        # Single segment with clip bounds - extract it
        outputFile = self._extract_audio_segment(
            segment['file'],
            segment['clip_begin'],
            segment['clip_end']
        )
        return (totalDuration, outputFile)
    def _get_audio_duration(self, audioPath):
        """Get duration of audio file using ffprobe"""
        try:
            import subprocess
            result = subprocess.run(
                ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                 '-of', 'default=noprint_wrappers=1:nokey=1', audioPath],
                capture_output=True,  # noqa: F821 - capture_output is a subprocess.run parameter
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                return float(result.stdout.strip())
        except:
            pass
        return 0.0
    def _extract_audio_segment(self, audioPath, clipBegin, clipEnd):
        """
        Extract a segment from an audio file using ffmpeg
        Args:
            audioPath: Path to source audio file
            clipBegin: Start time in seconds (or None)
            clipEnd: End time in seconds (or None)
        Returns:
            Path to extracted segment file
        """
        import subprocess
        import hashlib
        # Create unique filename for this segment
        segmentId = hashlib.md5(
            f"{audioPath}_{clipBegin}_{clipEnd}".encode()
        ).hexdigest()[:16]
        outputPath = os.path.join(
            self.tempDir,
            f"segment_{segmentId}.opus"
        )
        # Build ffmpeg command
        cmd = ['ffmpeg', '-y', '-loglevel', 'error']
        if clipBegin is not None:
            cmd.extend(['-ss', str(clipBegin)])
        cmd.extend(['-i', audioPath])
        if clipEnd is not None:
            duration = clipEnd - (clipBegin or 0.0)
            cmd.extend(['-t', str(duration)])
        # Use opus for efficient compression
        cmd.extend(['-c:a', 'libopus', '-b:a', '64k', outputPath])
        try:
            subprocess.run(cmd, check=True, timeout=300)
            return outputPath
        except Exception as e:
            print(f"Error extracting audio segment: {e}")
            return None
    def _merge_audio_segments(self, audioSegments):
        """
        Merge multiple audio segments into a single file using ffmpeg
        Args:
            audioSegments: List of segment dictionaries
        Returns:
            Path to merged audio file
        """
        import subprocess
        import hashlib
        # Create unique filename for merged output
        segmentIds = "_".join([
            f"{seg['file']}_{seg['clip_begin']}_{seg['clip_end']}"
            for seg in audioSegments
        ])
        mergedId = hashlib.md5(segmentIds.encode()).hexdigest()[:16]
        outputPath = os.path.join(
            self.tempDir,
            f"merged_{mergedId}.opus"
        )
        # Create concat file for ffmpeg
        concatFilePath = os.path.join(self.tempDir, f"concat_{mergedId}.txt")
        # First, extract all segments
        extractedFiles = []
        for segment in audioSegments:
            if segment['clip_begin'] is None and segment['clip_end'] is None:
                # Whole file
                extractedFiles.append(segment['file'])
            else:
                # Extract segment
                extracted = self._extract_audio_segment(
                    segment['file'],
                    segment['clip_begin'],
                    segment['clip_end']
                )
                if extracted:
                    extractedFiles.append(extracted)
        if not extractedFiles:
            return None
        # Create concat file
        with open(concatFilePath, 'w', encoding='utf-8') as f:
            for filepath in extractedFiles:
                # Escape single quotes in path
                escapedPath = filepath.replace("'", "'\\''")
                f.write(f"file '{escapedPath}'\n")
        # Merge using ffmpeg concat
        cmd = [
            'ffmpeg', '-y', '-loglevel', 'error',
            '-f', 'concat', '-safe', '0',
            '-i', concatFilePath,
            '-c:a', 'libopus', '-b:a', '64k',
            outputPath
        ]
        try:
            subprocess.run(cmd, check=True, timeout=600)
            return outputPath
        except Exception as e:
            print(f"Error merging audio segments: {e}")
            return None
    def _parse_npt_time(self, nptString):
        """
        Parse Normal Play Time (NPT) format to seconds
        Examples:
            "npt=123.456s" -> 123.456
            "123.456s" -> 123.456
            "npt=1:23.456" -> 83.456
        Args:
            nptString: NPT formatted time string
        Returns:
            Float seconds, or None if parse fails
        """
        if not nptString:
            return None
        try:
            # Remove "npt=" prefix if present
            timeStr = nptString.replace('npt=', '').strip()
            # Handle seconds format: "123.456s"
            if timeStr.endswith('s'):
                return float(timeStr[:-1])
            # Handle time format: "1:23.456" (minutes:seconds)
            if ':' in timeStr:
                parts = timeStr.split(':')
                if len(parts) == 2:
                    minutes = float(parts[0])
                    seconds = float(parts[1])
                    return minutes * 60 + seconds
                elif len(parts) == 3:
                    # Handle hours:minutes:seconds
                    hours = float(parts[0])
                    minutes = float(parts[1])
                    seconds = float(parts[2])
                    return hours * 3600 + minutes * 60 + seconds
            # Try parsing as plain number
            return float(timeStr)
        except (ValueError, AttributeError):
            return None
    def cleanup(self):
        """
        Clean up temporary files
        This is called when the book is closed and audio files are no longer needed.
        """
        if self.tempDir and Path(self.tempDir).exists():
            try:
                shutil.rmtree(self.tempDir)
            except:
                pass  # Ignore errors during cleanup
            self.tempDir = None
 def is_daisy_audio_book(daisyPath):
    """
    Detect if a DAISY book should be handled as an audio book
    The archive is only listed, never extracted. A book counts as audio when
    it contains both SMIL files and audio files, or when its first .opf
    package file mentions 'audioOnly' or 'audioNCX'. Note that the first rule
    also matches books that ship text alongside audio.
    Args:
        daisyPath: Path to DAISY zip file
    Returns:
        Boolean - True if the book is treated as audio, False otherwise
        (including when the path is missing, is not a zip file, or cannot
        be read)
    """
    daisyPath = Path(daisyPath)
    if not daisyPath.exists() or not zipfile.is_zipfile(daisyPath):
        return False
    audioExtensions = {'.mp3', '.wav', '.mp2', '.m4a', '.m4b', '.aac', '.ogg', '.opus'}
    try:
        with zipfile.ZipFile(daisyPath, 'r') as zipRef:
            fileList = zipRef.namelist()
            # Check for audio file extensions
            hasAudioFiles = any(
                Path(f).suffix.lower() in audioExtensions
                for f in fileList
            )
            # Check for SMIL files (strong indicator of audio content)
            hasSmilFiles = any(f.lower().endswith('.smil') for f in fileList)
            # SMIL plus audio is decisive on its own. A later "and no DTBook
            # text" test could never change the answer, so none is made.
            if hasSmilFiles and hasAudioFiles:
                return True
            # For DAISY 3, check package file metadata (first .opf only)
            opfFiles = [f for f in fileList if f.lower().endswith('.opf')]
            if opfFiles:
                with zipRef.open(opfFiles[0]) as opfFile:
                    content = opfFile.read().decode('utf-8', errors='ignore')
                # Look for the multimediaType metadata values
                return 'audioOnly' in content or 'audioNCX' in content
            return False
    except Exception:
        # Detection is best-effort: an unreadable archive is simply "not audio"
        return False
--- a/src/daisy_parser.py
+++ b/src/daisy_parser.py
@@ -14,6 +14,7 @@ from pathlib import Path
 from bs4 import BeautifulSoup
 import re
 from src.book import Book, Chapter
 from src.daisy_audio_parser import DaisyAudioParser, is_daisy_audio_book
 class DaisyParser:
@@ -30,13 +31,20 @@ class DaisyParser:
            daisyPath: Path to DAISY zip file
        Returns:
-            Book object
+            Book object (or AudioBook if audio-only)
        """
        daisyPath = Path(daisyPath)
        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")
        # Check if this is an audio-only DAISY book
        if is_daisy_audio_book(daisyPath):
            # Delegate to audio parser
            audioParser = DaisyAudioParser()
            return audioParser.parse(daisyPath)
        # Text-based DAISY book - continue with normal parsing
        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)
@@ -45,11 +53,16 @@ class DaisyParser:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)
            # Find the actual DAISY directory (might be in a subdirectory)
            daisyDir = self._find_daisy_directory(tempPath)
            if not daisyDir:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
            # Detect DAISY version and parse accordingly
-            if (tempPath / "ncc.html").exists():
+            if (daisyDir / "ncc.html").exists():
-                book = self._parse_daisy2(tempPath)
+                book = self._parse_daisy2(daisyDir)
-            elif (tempPath / "navigation.ncx").exists() or list(tempPath.glob("*.ncx")):
+            elif (daisyDir / "navigation.ncx").exists() or list(daisyDir.glob("*.ncx")):
-                book = self._parse_daisy3(tempPath)
+                book = self._parse_daisy3(daisyDir)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
@@ -62,6 +75,33 @@ class DaisyParser:
            self.cleanup()
            raise e
    def _find_daisy_directory(self, basePath):
        """
        Find the actual DAISY directory within extracted files.
        Some DAISY books are packaged with a subdirectory inside the zip.
        Args:
            basePath: Path to extracted directory
        Returns:
            Path to directory containing ncc.html or navigation.ncx, or None if not found
        """
        # Check if ncc.html or .ncx is at root level
        if (basePath / "ncc.html").exists():
            return basePath
        if (basePath / "navigation.ncx").exists() or list(basePath.glob("*.ncx")):
            return basePath
        # Check subdirectories (only one level deep)
        for item in basePath.iterdir():
            if item.is_dir():
                if (item / "ncc.html").exists():
                    return item
                if (item / "navigation.ncx").exists() or list(item.glob("*.ncx")):
                    return item
        return None
    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)"""
        nccPath = basePath / "ncc.html"