Initial commit.

src/daisy_parser.py (new file, 324 lines added)
@@ -0,0 +1,324 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DAISY Book Parser

Handles parsing of DAISY 2.02 and DAISY 3 book formats.
Extracts structure and content for text-to-speech playback.
"""

import re
import shutil
import tempfile
import zipfile
from pathlib import Path

from bs4 import BeautifulSoup

from src.book import Book, Chapter
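
# For reference, the Book/Chapter interface this parser relies on lives in
# src/book.py (not shown in this commit). A minimal sketch of what it is
# assumed to look like:
#
#     class Chapter:
#         def __init__(self, title):
#             self.title = title
#             self.paragraphs = []          # plain-text paragraphs
#
#     class Book:
#         def __init__(self, title):
#             self.title = title
#             self.chapters = []
#
#         def add_chapter(self, chapter):
#             self.chapters.append(chapter)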


class DaisyParser:
    """Parser for DAISY format books"""

    def __init__(self):
        self.tempDir = None

    def parse(self, daisyPath):
        """
        Parse a DAISY book (zip file).

        Args:
            daisyPath: Path to DAISY zip file

        Returns:
            Book object
        """
        daisyPath = Path(daisyPath)

        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")

        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)

        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)

            # Detect DAISY version and parse accordingly
            if (tempPath / "ncc.html").exists():
                return self._parse_daisy2(tempPath)
            elif (tempPath / "navigation.ncx").exists() or list(tempPath.glob("*.ncx")):
                return self._parse_daisy3(tempPath)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")

        except Exception:
            # Remove the temp dir, then re-raise with the original traceback
            self.cleanup()
            raise

    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)"""
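        # The NCC (navigation control center) this method expects is an HTML
        # file whose h1-h6 headings link to the content documents, roughly:
        #
        #     <h1 id="ch1"><a href="chapter1.html#ch1">Chapter One</a></h1>
        #
        # This is a simplifying assumption; real NCC anchors often point at
        # SMIL files instead, which this parser does not resolve.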
        nccPath = basePath / "ncc.html"

        with open(nccPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Get title
        titleTag = soup.find('title')
        bookTitle = titleTag.get_text().strip() if titleTag else "Unknown Title"

        # Find all headings (h1-h6), which represent navigation points
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        chapters = []
        for heading in headings:
            # Get chapter title
            chapterTitle = heading.get_text().strip()

            # Find linked content file
            link = heading.find('a')
            if not link or not link.get('href'):
                continue

            contentHref = link.get('href')
            contentPath = basePath / contentHref.split('#')[0]

            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3(self, basePath):
        """Parse DAISY 3 format (NCX based)"""
        # Find NCX file for title
        ncxFiles = list(basePath.glob("*.ncx"))
        if not ncxFiles:
            ncxFiles = [basePath / "navigation.ncx"]

        ncxPath = ncxFiles[0]

        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        # Get title
        titleTag = soup.find('docTitle')
        if titleTag:
            textTag = titleTag.find('text')
            bookTitle = textTag.get_text().strip() if textTag else "Unknown Title"
        else:
            bookTitle = "Unknown Title"

        # Find DTBook XML file (main content); defensively skip anything with
        # an .ncx suffix, although the *.xml glob should already exclude it
        dtbookFiles = list(basePath.glob("*.xml"))
        dtbookFiles = [f for f in dtbookFiles if not f.name.endswith('.ncx')]

        if dtbookFiles:
            # Try to parse DTBook using the NCX navigation structure first
            chapters = self._parse_dtbook_with_ncx(dtbookFiles[0], basePath, soup)

            if not chapters:
                # Fallback: parse the DTBook XML directly for content
                chapters = self._parse_dtbook(dtbookFiles[0])
        else:
            # Fallback for HTML-based DAISY 3 books
            chapters = self._parse_daisy3_html(basePath, soup)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3_html(self, basePath, ncxSoup):
        """Parse DAISY 3 with HTML content files (fallback)"""
        navPoints = ncxSoup.find_all('navPoint')

        chapters = []
        for navPoint in navPoints:
            # Get chapter title
            navLabel = navPoint.find('navLabel')
            if navLabel:
                textTag = navLabel.find('text')
                chapterTitle = textTag.get_text().strip() if textTag else "Untitled Chapter"
            else:
                chapterTitle = "Untitled Chapter"

            # Find content source
            content = navPoint.find('content')
            if not content or not content.get('src'):
                continue

            contentSrc = content.get('src')
            contentPath = basePath / contentSrc.split('#')[0]

            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

        return chapters

    def _parse_dtbook_with_ncx(self, dtbookPath, basePath, ncxSoup):
        """
        Parse DTBook using the NCX navigation structure.

        Args:
            dtbookPath: Path to DTBook XML file
            basePath: Base directory path
            ncxSoup: BeautifulSoup object of the parsed NCX

        Returns:
            List of Chapter objects, or None if parsing fails
        """
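        # The navMap walked below is assumed to look roughly like:
        #
        #     <navMap>
        #       <navPoint id="np1" playOrder="1">
        #         <navLabel><text>Chapter One</text></navLabel>
        #         <content src="book.xml#ch1"/>
        #       </navPoint>
        #     </navMap>
        #
        # Only top-level navPoints are treated as chapters; nested navPoints
        # (sub-sections) are ignored.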
        try:
            # Load DTBook content
            with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
                dtbookSoup = BeautifulSoup(f.read(), features='xml')

            # Find all top-level navPoints (chapters)
            navMap = ncxSoup.find('navMap')
            if not navMap:
                return None

            chapters = []
            for navPoint in navMap.find_all('navPoint', recursive=False):
                # Get chapter title
                navLabel = navPoint.find('navLabel')
                if navLabel:
                    textTag = navLabel.find('text')
                    chapterTitle = textTag.get_text().strip() if textTag else "Untitled"
                else:
                    chapterTitle = "Untitled"

                # Get content source
                content = navPoint.find('content')
                if not content or not content.get('src'):
                    continue

                contentSrc = content.get('src')

                # Extract fragment identifier (anchor)
                parts = contentSrc.split('#')
                anchor = parts[1] if len(parts) > 1 else None

                if not anchor:
                    continue

                # Find the element in the DTBook by ID
                section = dtbookSoup.find(id=anchor)
                if not section:
                    continue

                # Extract paragraphs from this section
                paragraphs = []
                for p in section.find_all('p'):
                    text = p.get_text().strip()
                    text = re.sub(r'\s+', ' ', text)
                    if text:
                        paragraphs.append(text)

                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

            return chapters if chapters else None

        except Exception as e:
            print(f"Error parsing DTBook with NCX: {e}")
            return None

    def _parse_dtbook(self, dtbookPath):
        """Parse DTBook XML format"""
        with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        chapters = []

        # Find all level1 elements (top-level sections)
        level1Elements = soup.find_all('level1')

        for level1 in level1Elements:
            # Get chapter title from a heading, falling back to the id
            chapterTitle = None

            # Try to find a heading
            for hTag in ['h1', 'h2', 'h3']:
                heading = level1.find(hTag)
                if heading:
                    chapterTitle = heading.get_text().strip()
                    break

            # Fall back to the id attribute
            if not chapterTitle:
                chapterTitle = level1.get('id', 'Untitled Chapter')

            # Extract paragraphs from this level1
            paragraphs = []
            for p in level1.find_all('p'):
                text = p.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text:
                    paragraphs.append(text)

            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        return chapters

    def _extract_paragraphs(self, htmlPath):
        """Extract paragraphs from HTML content file"""
        with open(htmlPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        paragraphs = []

        # Find all paragraph tags
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text)
            if text:
                paragraphs.append(text)

        # If there are no <p> tags, try divs
        if not paragraphs:
            for div in soup.find_all('div'):
                text = div.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text and len(text) > 10:  # Avoid tiny fragments
                    paragraphs.append(text)

        # Last resort: split the body text on blank lines
        if not paragraphs:
            body = soup.find('body')
            if body:
                text = body.get_text()
                # Split on runs of two or more newlines
                chunks = re.split(r'\n\n+', text)
                for chunk in chunks:
                    chunk = chunk.strip()
                    chunk = re.sub(r'\s+', ' ', chunk)
                    if chunk:
                        paragraphs.append(chunk)

        return paragraphs

    def cleanup(self):
        """Clean up temporary files"""
        if self.tempDir and Path(self.tempDir).exists():
            shutil.rmtree(self.tempDir)
        self.tempDir = None
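

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): parse a DAISY zip given on
    # the command line and print its structure. Assumes Book exposes .title
    # and .chapters, and Chapter exposes .title and .paragraphs, as sketched
    # near the imports above.
    import sys

    if len(sys.argv) != 2:
        print("usage: python -m src.daisy_parser BOOK.zip")
        sys.exit(1)

    parser = DaisyParser()
    try:
        book = parser.parse(sys.argv[1])
        print(f"Title: {book.title}")
        for chapter in book.chapters:
            print(f"  {chapter.title}: {len(chapter.paragraphs)} paragraph(s)")
    finally:
        parser.cleanup()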