#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DAISY Book Parser
Handles parsing of DAISY 2.02 and DAISY 3 book formats.
Extracts structure and content for text-to-speech playback.
"""
import re
import shutil
import tempfile
import zipfile
from pathlib import Path

from bs4 import BeautifulSoup

from src.book import Book, Chapter

class DaisyParser:
    """Parser for DAISY format books"""

    def __init__(self):
        self.tempDir = None

    def parse(self, daisyPath):
        """
        Parse a DAISY book (zip file)

        Args:
            daisyPath: Path to DAISY zip file

        Returns:
            Book object
        """
        daisyPath = Path(daisyPath)
        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")

        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)
        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)

            # Detect DAISY version and parse accordingly
            if (tempPath / "ncc.html").exists():
                return self._parse_daisy2(tempPath)
            elif (tempPath / "navigation.ncx").exists() or list(tempPath.glob("*.ncx")):
                return self._parse_daisy3(tempPath)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")
        except Exception:
            self.cleanup()
            raise

    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)"""
        nccPath = basePath / "ncc.html"
        with open(nccPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Get title
        titleTag = soup.find('title')
        bookTitle = titleTag.get_text().strip() if titleTag else "Unknown Title"
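
        # For reference, a typical ncc.html navigation entry has this shape
        # (illustrative example only; depending on the book, the href may
        # target an HTML/XHTML content document or a SMIL file):
        #   <h1 id="cnt_0001"><a href="chapter1.html#par_0001">Chapter One</a></h1>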
        # Find all headings (h1-h6) which represent navigation points
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        chapters = []
        for heading in headings:
            # Get chapter title
            chapterTitle = heading.get_text().strip()

            # Find linked content file
            link = heading.find('a')
            if not link or not link.get('href'):
                continue

            contentHref = link.get('href')
            contentPath = basePath / contentHref.split('#')[0]
            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3(self, basePath):
        """Parse DAISY 3 format (NCX based)"""
        # Find NCX file for title
        ncxFiles = list(basePath.glob("*.ncx"))
        if not ncxFiles:
            ncxFiles = [basePath / "navigation.ncx"]
        ncxPath = ncxFiles[0]

        # Note: BeautifulSoup's 'xml' parser requires the lxml package
        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        # Get title
        titleTag = soup.find('docTitle')
        if titleTag:
            textTag = titleTag.find('text')
            bookTitle = textTag.get_text().strip() if textTag else "Unknown Title"
        else:
            bookTitle = "Unknown Title"

        # Find DTBook XML file (main content), skipping any NCX navigation
        # file that was saved with an .xml extension
        dtbookFiles = list(basePath.glob("*.xml"))
        dtbookFiles = [f for f in dtbookFiles if 'ncx' not in f.name.lower()]

        if dtbookFiles:
            # Try to parse DTBook using the NCX navigation structure first
            chapters = self._parse_dtbook_with_ncx(dtbookFiles[0], basePath, soup)
            if not chapters:
                # Fallback: parse the DTBook XML directly for content
                chapters = self._parse_dtbook(dtbookFiles[0])
        else:
            # Fallback for DAISY 3 books with HTML content files
            chapters = self._parse_daisy3_html(basePath, soup)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3_html(self, basePath, ncxSoup):
        """Parse DAISY 3 with HTML content files (fallback)"""
        navPoints = ncxSoup.find_all('navPoint')
        chapters = []
        for navPoint in navPoints:
            # Get chapter title
            navLabel = navPoint.find('navLabel')
            if navLabel:
                textTag = navLabel.find('text')
                chapterTitle = textTag.get_text().strip() if textTag else "Untitled Chapter"
            else:
                chapterTitle = "Untitled Chapter"

            # Find content source
            content = navPoint.find('content')
            if not content or not content.get('src'):
                continue

            contentSrc = content.get('src')
            contentPath = basePath / contentSrc.split('#')[0]
            if contentPath.exists():
                paragraphs = self._extract_paragraphs(contentPath)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)
        return chapters

    def _parse_dtbook_with_ncx(self, dtbookPath, basePath, ncxSoup):
        """
        Parse DTBook using NCX navigation structure

        Args:
            dtbookPath: Path to DTBook XML file
            basePath: Base directory path
            ncxSoup: BeautifulSoup object of parsed NCX

        Returns:
            List of Chapter objects or None if parsing fails
        """
        try:
            # Load DTBook content
            with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
                dtbookSoup = BeautifulSoup(f.read(), features='xml')

            # Find all top-level navPoints (chapters)
            navMap = ncxSoup.find('navMap')
            if not navMap:
                return None

            chapters = []
            for navPoint in navMap.find_all('navPoint', recursive=False):
                # Get chapter title
                navLabel = navPoint.find('navLabel')
                if navLabel:
                    textTag = navLabel.find('text')
                    chapterTitle = textTag.get_text().strip() if textTag else "Untitled"
                else:
                    chapterTitle = "Untitled"

                # Get content source
                content = navPoint.find('content')
                if not content or not content.get('src'):
                    continue
                contentSrc = content.get('src')

                # Extract fragment identifier (anchor)
                parts = contentSrc.split('#')
                anchor = parts[1] if len(parts) > 1 else None
                if not anchor:
                    continue

                # Find the element in DTBook by ID
                section = dtbookSoup.find(id=anchor)
                if not section:
                    continue

                # Extract paragraphs from this section
                paragraphs = []
                for p in section.find_all('p'):
                    text = p.get_text().strip()
                    text = re.sub(r'\s+', ' ', text)
                    if text:
                        paragraphs.append(text)

                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

            return chapters if chapters else None
        except Exception as e:
            print(f"Error parsing DTBook with NCX: {e}")
            return None

    def _parse_dtbook(self, dtbookPath):
        """Parse DTBook XML format"""
        with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        chapters = []
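
        # For reference, a DTBook top-level section has this general shape
        # (illustrative example):
        #   <level1 id="ch_0001">
        #     <h1>Chapter One</h1>
        #     <p>First paragraph of the chapter...</p>
        #   </level1>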
        # Find all level1 elements (top-level sections)
        level1Elements = soup.find_all('level1')
        for level1 in level1Elements:
            # Get chapter title from a heading (h1-h3), falling back to the id
            chapterTitle = None

            # Try to find heading
            for hTag in ['h1', 'h2', 'h3']:
                heading = level1.find(hTag)
                if heading:
                    chapterTitle = heading.get_text().strip()
                    break

            # Fallback to id
            if not chapterTitle:
                chapterTitle = level1.get('id', 'Untitled Chapter')

            # Extract paragraphs from this level1
            paragraphs = []
            for p in level1.find_all('p'):
                text = p.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text:
                    paragraphs.append(text)

            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)
        return chapters

    def _extract_paragraphs(self, htmlPath):
        """Extract paragraphs from HTML content file"""
        with open(htmlPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        paragraphs = []

        # Find all paragraph tags
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text)
            if text:
                paragraphs.append(text)

        # If no <p> tags, try divs or just get all text
        if not paragraphs:
            # Try divs (note: nested divs can yield overlapping text)
            for div in soup.find_all('div'):
                text = div.get_text().strip()
                text = re.sub(r'\s+', ' ', text)
                if text and len(text) > 10:  # Avoid tiny fragments
                    paragraphs.append(text)

        # Last resort: split body text on blank lines
        if not paragraphs:
            body = soup.find('body')
            if body:
                text = body.get_text()
                # Split on runs of two or more newlines
                chunks = re.split(r'\n\n+', text)
                for chunk in chunks:
                    chunk = chunk.strip()
                    chunk = re.sub(r'\s+', ' ', chunk)
                    if chunk:
                        paragraphs.append(chunk)

        return paragraphs

    def cleanup(self):
        """Clean up temporary files"""
        if self.tempDir and Path(self.tempDir).exists():
            shutil.rmtree(self.tempDir)
            self.tempDir = None
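

if __name__ == "__main__":
    # Minimal usage sketch: parse a DAISY zip named on the command line,
    # print the resulting Book, then release the extracted temp directory.
    # Assumes nothing beyond the DaisyParser API defined above.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: daisy_parser.py <daisy-book.zip>")

    daisyParser = DaisyParser()
    try:
        book = daisyParser.parse(sys.argv[1])
        print(f"Parsed: {book}")
    finally:
        daisyParser.cleanup()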