#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DAISY Book Parser

Handles parsing of DAISY 2.02 and DAISY 3 book formats.
Extracts structure and content for text-to-speech playback.
"""

import zipfile
import tempfile
import shutil
import re
from pathlib import Path

from bs4 import BeautifulSoup

from src.book import Book, Chapter


class DaisyParser:
    """Parser for DAISY format books (2.02 and 3)."""

    def __init__(self):
        # Temp extraction directory; lives until cleanup() is called so
        # parsed content files remain reachable during parse().
        self.tempDir = None

    def parse(self, daisyPath):
        """
        Parse a DAISY book (zip file).

        Args:
            daisyPath: Path to DAISY zip file.

        Returns:
            Book object.

        Raises:
            FileNotFoundError: If the zip file does not exist.
            ValueError: If no DAISY navigation file (ncc.html / *.ncx) is found.
        """
        daisyPath = Path(daisyPath)
        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")

        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)

        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)

            # DAISY 2.02 is identified by an NCC file. Its name is commonly
            # uppercased (NCC.HTML) and zips often wrap the book in a top-level
            # folder, so search recursively and case-insensitively.
            nccPath = self._find_file(tempPath, 'ncc.html')
            if nccPath:
                return self._parse_daisy2(nccPath.parent)

            # DAISY 3 is identified by an NCX navigation file.
            ncxFiles = sorted(tempPath.rglob('*.ncx'))
            if ncxFiles:
                return self._parse_daisy3(ncxFiles[0].parent)

            raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
        except Exception:
            self.cleanup()
            raise  # bare raise preserves the original traceback

    @staticmethod
    def _find_file(root, lowerName):
        """Return the first file under *root* (recursive) whose name equals
        *lowerName* case-insensitively, or None. Sorted for determinism."""
        for p in sorted(root.rglob('*')):
            if p.is_file() and p.name.lower() == lowerName:
                return p
        return None

    @staticmethod
    def _clean_text(raw):
        """Collapse runs of whitespace to single spaces and strip the ends."""
        return re.sub(r'\s+', ' ', raw).strip()

    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)."""
        # Accept any capitalization of ncc.html; fall back to the canonical
        # name so a missing file still raises a clear FileNotFoundError.
        nccPath = self._find_file(basePath, 'ncc.html') or (basePath / 'ncc.html')

        with open(nccPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Get title
        titleTag = soup.find('title')
        bookTitle = titleTag.get_text().strip() if titleTag else "Unknown Title"

        # Headings h1-h6 in the NCC are the navigation points.
        chapters = []
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            chapterTitle = heading.get_text().strip()

            # Each heading links to the content file for that section.
            link = heading.find('a')
            if not link or not link.get('href'):
                continue

            # Drop any fragment identifier (#id); only the file part matters.
            contentPath = basePath / link.get('href').split('#')[0]
            if not contentPath.exists():
                continue

            paragraphs = self._extract_paragraphs(contentPath)
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3(self, basePath):
        """Parse DAISY 3 format (NCX based)."""
        # Find NCX file for title (sorted so the pick is deterministic).
        ncxFiles = sorted(basePath.glob("*.ncx"))
        ncxPath = ncxFiles[0] if ncxFiles else basePath / "navigation.ncx"

        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        # Get title from <docTitle><text>
        titleTag = soup.find('docTitle')
        if titleTag:
            textTag = titleTag.find('text')
            bookTitle = textTag.get_text().strip() if textTag else "Unknown Title"
        else:
            bookTitle = "Unknown Title"

        # Main content is a DTBook XML file. Exclude the NCX path itself;
        # the old suffix check (`not f.name.endswith('.ncx')`) could never
        # match anything returned by glob("*.xml") and was dead code.
        dtbookFiles = [f for f in sorted(basePath.glob("*.xml")) if f != ncxPath]

        if dtbookFiles:
            # Prefer NCX-guided chapter boundaries; fall back to the DTBook's
            # own level1 structure if the NCX anchors don't resolve.
            chapters = self._parse_dtbook_with_ncx(dtbookFiles[0], basePath, soup)
            if not chapters:
                chapters = self._parse_dtbook(dtbookFiles[0])
        else:
            # Fallback for HTML-content DAISY 3 books.
            chapters = self._parse_daisy3_html(basePath, soup)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3_html(self, basePath, ncxSoup):
        """Parse DAISY 3 with HTML content files (fallback).

        Args:
            basePath: Base directory of the extracted book.
            ncxSoup: BeautifulSoup object of the parsed NCX.

        Returns:
            List of Chapter objects.
        """
        chapters = []
        for navPoint in ncxSoup.find_all('navPoint'):
            chapterTitle = self._nav_label(navPoint, "Untitled Chapter")

            # Each navPoint points at its content file via <content src="...">.
            content = navPoint.find('content')
            if not content or not content.get('src'):
                continue

            contentPath = basePath / content.get('src').split('#')[0]
            if not contentPath.exists():
                continue

            paragraphs = self._extract_paragraphs(contentPath)
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        return chapters

    @staticmethod
    def _nav_label(navPoint, default):
        """Return the <navLabel><text> of *navPoint*, or *default*."""
        navLabel = navPoint.find('navLabel')
        if navLabel:
            textTag = navLabel.find('text')
            if textTag:
                return textTag.get_text().strip()
        return default

    def _parse_dtbook_with_ncx(self, dtbookPath, basePath, ncxSoup):
        """
        Parse DTBook using NCX navigation structure.

        Args:
            dtbookPath: Path to DTBook XML file.
            basePath: Base directory path.
            ncxSoup: BeautifulSoup object of parsed NCX.

        Returns:
            List of Chapter objects, or None if parsing fails or yields nothing.
        """
        try:
            # Load DTBook content
            with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
                dtbookSoup = BeautifulSoup(f.read(), features='xml')

            # Only top-level navPoints define chapter boundaries.
            navMap = ncxSoup.find('navMap')
            if not navMap:
                return None

            chapters = []
            for navPoint in navMap.find_all('navPoint', recursive=False):
                chapterTitle = self._nav_label(navPoint, "Untitled")

                content = navPoint.find('content')
                if not content or not content.get('src'):
                    continue

                # The fragment identifier is the id of the chapter's element
                # inside the DTBook document.
                parts = content.get('src').split('#')
                anchor = parts[1] if len(parts) > 1 else None
                if not anchor:
                    continue

                section = dtbookSoup.find(id=anchor)
                if not section:
                    continue

                # Collect non-empty, whitespace-normalized paragraphs.
                paragraphs = [
                    text for text in
                    (self._clean_text(p.get_text()) for p in section.find_all('p'))
                    if text
                ]
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

            return chapters if chapters else None
        except Exception as e:
            # Best-effort path: report and let the caller fall back to
            # structural DTBook parsing.
            print(f"Error parsing DTBook with NCX: {e}")
            return None

    def _parse_dtbook(self, dtbookPath):
        """Parse DTBook XML format directly, one chapter per <level1>."""
        with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        chapters = []
        for level1 in soup.find_all('level1'):
            # Chapter title: first available heading, else the element's id.
            chapterTitle = None
            for hTag in ['h1', 'h2', 'h3']:
                heading = level1.find(hTag)
                if heading:
                    chapterTitle = heading.get_text().strip()
                    break
            if not chapterTitle:
                chapterTitle = level1.get('id', 'Untitled Chapter')

            paragraphs = [
                text for text in
                (self._clean_text(p.get_text()) for p in level1.find_all('p'))
                if text
            ]
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        return chapters

    def _extract_paragraphs(self, htmlPath):
        """Extract paragraph strings from an HTML content file.

        Tries <p> tags first, then <div>s, then falls back to splitting the
        <body> text on blank lines. Returns a list of cleaned strings.
        """
        with open(htmlPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        paragraphs = [
            text for text in
            (self._clean_text(p.get_text()) for p in soup.find_all('p'))
            if text
        ]

        # If no <p> tags, try divs.
        if not paragraphs:
            for div in soup.find_all('div'):
                text = self._clean_text(div.get_text())
                if text and len(text) > 10:  # Avoid tiny fragments
                    paragraphs.append(text)

        # Last resort: split body text on blank lines.
        if not paragraphs:
            body = soup.find('body')
            if body:
                for chunk in re.split(r'\n\n+', body.get_text()):
                    chunk = self._clean_text(chunk)
                    if chunk:
                        paragraphs.append(chunk)

        return paragraphs

    def cleanup(self):
        """Remove the temporary extraction directory, if any."""
        if self.tempDir and Path(self.tempDir).exists():
            shutil.rmtree(self.tempDir)
        self.tempDir = None