#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DAISY Book Parser
Handles parsing of DAISY 2.02 and DAISY 3 book formats.
Extracts structure and content for text-to-speech playback.
"""
import re
import shutil
import tempfile
import zipfile
from pathlib import Path

from bs4 import BeautifulSoup

from src.book import Book, Chapter

class DaisyParser:
    """Parser for DAISY format books"""

    def __init__(self):
        self.tempDir = None

    def parse(self, daisyPath):
        """
        Parse a DAISY book (zip file)

        Args:
            daisyPath: Path to DAISY zip file

        Returns:
            Book object
        """
        daisyPath = Path(daisyPath)
        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")

        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)
        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)

            # Detect DAISY version and parse accordingly
            if (tempPath / "ncc.html").exists():
                return self._parse_daisy2(tempPath)
            elif (tempPath / "navigation.ncx").exists() or list(tempPath.glob("*.ncx")):
                return self._parse_daisy3(tempPath)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
        except Exception:
            self.cleanup()
            raise

    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)"""
        nccPath = basePath / "ncc.html"
        with open(nccPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Get title
        titleTag = soup.find('title')
        bookTitle = titleTag.get_text().strip() if titleTag else "Unknown Title"
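
        # For reference, a typical ncc.html navigation entry has this shape
        # (illustrative example only; depending on the book, the href may
        # target an HTML/XHTML content document or a SMIL file):
        #   <h1 id="cnt_0001"><a href="chapter1.html#par_0001">Chapter One</a></h1>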
        # Find all headings (h1-h6) which represent navigation points
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        chapters = []
        for heading in headings:
            # Get chapter title
            chapterTitle = heading.get_text().strip()

            # Find linked content file
            link = heading.find('a')
            if not link or not link.get('href'):
                continue

            contentHref = link.get('href')
            contentPath = basePath / contentHref.split('#')[0]
            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3(self, basePath):
        """Parse DAISY 3 format (NCX based)"""
        # Find NCX file for title
        ncxFiles = list(basePath.glob("*.ncx"))
        if not ncxFiles:
            ncxFiles = [basePath / "navigation.ncx"]
        ncxPath = ncxFiles[0]

        # Note: BeautifulSoup's 'xml' parser requires the lxml package
        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        # Get title
        titleTag = soup.find('docTitle')
        if titleTag:
            textTag = titleTag.find('text')
            bookTitle = textTag.get_text().strip() if textTag else "Unknown Title"
        else:
            bookTitle = "Unknown Title"

        # Find DTBook XML file (main content), skipping any NCX navigation
        # file that was saved with an .xml extension
        dtbookFiles = list(basePath.glob("*.xml"))
        dtbookFiles = [f for f in dtbookFiles if 'ncx' not in f.name.lower()]

        if dtbookFiles:
            # Try to parse DTBook using the NCX navigation structure first
            chapters = self._parse_dtbook_with_ncx(dtbookFiles[0], basePath, soup)
            if not chapters:
                # Fallback: parse the DTBook XML directly for content
                chapters = self._parse_dtbook(dtbookFiles[0])
        else:
            # Fallback for DAISY 3 books with HTML content files
            chapters = self._parse_daisy3_html(basePath, soup)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3_html(self, basePath, ncxSoup):
        """Parse DAISY 3 with HTML content files (fallback)"""
        navPoints = ncxSoup.find_all('navPoint')
        chapters = []
        for navPoint in navPoints:
            # Get chapter title
            navLabel = navPoint.find('navLabel')
            if navLabel:
                textTag = navLabel.find('text')
                chapterTitle = textTag.get_text().strip() if textTag else "Untitled Chapter"
            else:
                chapterTitle = "Untitled Chapter"

            # Find content source
            content = navPoint.find('content')
            if not content or not content.get('src'):
                continue

            contentSrc = content.get('src')
            contentPath = basePath / contentSrc.split('#')[0]
            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)
        return chapters

    def _parse_dtbook_with_ncx(self, dtbookPath, basePath, ncxSoup):
        """
        Parse DTBook using NCX navigation structure

        Args:
            dtbookPath: Path to DTBook XML file
            basePath: Base directory path
            ncxSoup: BeautifulSoup object of parsed NCX

        Returns:
            List of Chapter objects or None if parsing fails
        """
        try:
            # Load DTBook content
            with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
                dtbookSoup = BeautifulSoup(f.read(), features='xml')

            # Find all top-level navPoints (chapters)
            navMap = ncxSoup.find('navMap')
            if not navMap:
                return None

            chapters = []
            for navPoint in navMap.find_all('navPoint', recursive=False):
                # Get chapter title
                navLabel = navPoint.find('navLabel')
                if navLabel:
                    textTag = navLabel.find('text')
                    chapterTitle = textTag.get_text().strip() if textTag else "Untitled"
                else:
                    chapterTitle = "Untitled"

                # Get content source
                content = navPoint.find('content')
                if not content or not content.get('src'):
                    continue
                contentSrc = content.get('src')

                # Extract fragment identifier (anchor)
                parts = contentSrc.split('#')
                anchor = parts[1] if len(parts) > 1 else None
                if not anchor:
                    continue

                # Find the element in DTBook by ID
                section = dtbookSoup.find(id=anchor)
                if not section:
                    continue

                # Extract paragraphs from this section
                paragraphs = []
                for p in section.find_all('p'):
                    text = p.get_text().strip()
                    text = re.sub(r'\s+', ' ', text)
                    if text:
                        paragraphs.append(text)

                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

            return chapters if chapters else None
        except Exception as e:
            print(f"Error parsing DTBook with NCX: {e}")
            return None

    def _parse_dtbook(self, dtbookPath):
        """Parse DTBook XML format"""
        with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        chapters = []
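
        # For reference, a DTBook top-level section has this general shape
        # (illustrative example):
        #   <level1 id="ch_0001">
        #     <h1>Chapter One</h1>
        #     <p>First paragraph of the chapter...</p>
        #   </level1>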
        # Find all level1 elements (top-level sections)
        level1Elements = soup.find_all('level1')
        for level1 in level1Elements:
            # Get chapter title from a heading (h1-h3), falling back to the id
            chapterTitle = None

            # Try to find heading
            for hTag in ['h1', 'h2', 'h3']:
                heading = level1.find(hTag)
                if heading:
                    chapterTitle = heading.get_text().strip()
                    break

            # Fallback to id
            if not chapterTitle:
                chapterTitle = level1.get('id', 'Untitled Chapter')

            # Extract paragraphs from this level1
            paragraphs = []
            for p in level1.find_all('p'):
                text = p.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text:
                    paragraphs.append(text)

            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)
        return chapters

    def _extract_paragraphs(self, htmlPath):
        """Extract paragraphs from HTML content file"""
        with open(htmlPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        paragraphs = []

        # Find all paragraph tags
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text)
            if text:
                paragraphs.append(text)

        # If no <p> tags, try divs or just get all text
        if not paragraphs:
            # Try divs (note: nested divs can yield overlapping text)
            for div in soup.find_all('div'):
                text = div.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text and len(text) > 10:  # Avoid tiny fragments
                    paragraphs.append(text)

        # Last resort: split body text on blank lines
        if not paragraphs:
            body = soup.find('body')
            if body:
                text = body.get_text()
                # Split on runs of two or more newlines
                chunks = re.split(r'\n\n+', text)
                for chunk in chunks:
                    chunk = chunk.strip()
                    chunk = re.sub(r'\s+', ' ', chunk)
                    if chunk:
                        paragraphs.append(chunk)

        return paragraphs

    def cleanup(self):
        """Clean up temporary files"""
        if self.tempDir and Path(self.tempDir).exists():
            shutil.rmtree(self.tempDir)
            self.tempDir = None
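

if __name__ == "__main__":
    # Minimal usage sketch: parse a DAISY zip named on the command line,
    # print the resulting Book, then release the extracted temp directory.
    # Assumes nothing beyond the DaisyParser API defined above.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: daisy_parser.py <daisy-book.zip>")

    daisyParser = DaisyParser()
    try:
        book = daisyParser.parse(sys.argv[1])
        print(f"Parsed: {book}")
    finally:
        daisyParser.cleanup()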