#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DAISY Book Parser

Handles parsing of DAISY 2.02 and DAISY 3 book formats.
Extracts structure and content for text-to-speech playback.
"""

import re
import shutil
import tempfile
import zipfile
from pathlib import Path

from bs4 import BeautifulSoup

from src.book import Book, Chapter

class DaisyParser:
    """Parser for DAISY format books.

    Supports zipped DAISY 2.02 books (navigated via ncc.html) and DAISY 3
    books (navigated via an NCX file, with DTBook XML or HTML content files).

    The archive is unpacked to a temporary directory. parse() removes it
    automatically on failure; on success the caller is responsible for
    calling cleanup() once finished with the returned Book.
    """

    def __init__(self):
        # Path (str) of the temporary extraction directory,
        # or None when nothing has been extracted yet.
        self.tempDir = None

    def parse(self, daisyPath):
        """
        Parse a DAISY book (zip file)

        Args:
            daisyPath: Path to DAISY zip file

        Returns:
            Book object

        Raises:
            FileNotFoundError: daisyPath does not exist.
            ValueError: archive contains neither ncc.html nor an .ncx file.
        """
        daisyPath = Path(daisyPath)

        if not daisyPath.exists():
            raise FileNotFoundError(f"DAISY file not found: {daisyPath}")

        # Extract zip to temp directory
        self.tempDir = tempfile.mkdtemp(prefix="daisy_")
        tempPath = Path(self.tempDir)

        try:
            with zipfile.ZipFile(daisyPath, 'r') as zipRef:
                zipRef.extractall(tempPath)

            # Detect DAISY version: 2.02 navigates via ncc.html,
            # DAISY 3 via an NCX file.
            if (tempPath / "ncc.html").exists():
                return self._parse_daisy2(tempPath)
            elif (tempPath / "navigation.ncx").exists() or list(tempPath.glob("*.ncx")):
                return self._parse_daisy3(tempPath)
            else:
                raise ValueError("Unknown DAISY format: no ncc.html or navigation.ncx found")

        except Exception:
            self.cleanup()
            raise  # bare raise preserves the original traceback

    @staticmethod
    def _paragraphs_from(element):
        """Return non-empty, whitespace-normalized text of every <p>
        descendant of *element* (a BeautifulSoup tag), in document order."""
        paragraphs = []
        for p in element.find_all('p'):
            text = re.sub(r'\s+', ' ', p.get_text().strip())
            if text:
                paragraphs.append(text)
        return paragraphs

    @staticmethod
    def _nav_label(navPoint, default):
        """Text of a navPoint's <navLabel><text>, or *default* when absent."""
        navLabel = navPoint.find('navLabel')
        if navLabel:
            textTag = navLabel.find('text')
            if textTag:
                return textTag.get_text().strip()
        return default

    def _parse_daisy2(self, basePath):
        """Parse DAISY 2.02 format (NCC.html based)."""
        nccPath = basePath / "ncc.html"

        with open(nccPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Book title from the document <title>.
        titleTag = soup.find('title')
        bookTitle = titleTag.get_text().strip() if titleTag else "Unknown Title"

        book = Book(bookTitle)

        # Every h1-h6 in the NCC is a navigation point whose <a href>
        # links to the content file for that section.
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            chapterTitle = heading.get_text().strip()

            link = heading.find('a')
            if not link or not link.get('href'):
                continue

            # Drop any #fragment: only the file part locates the content.
            contentPath = basePath / link.get('href').split('#')[0]
            if not contentPath.exists():
                continue

            paragraphs = self._extract_paragraphs(contentPath)
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                book.add_chapter(chapter)

        return book

    def _parse_daisy3(self, basePath):
        """Parse DAISY 3 format (NCX based)."""
        # Locate the NCX navigation file (any *.ncx, else the canonical name).
        ncxFiles = list(basePath.glob("*.ncx")) or [basePath / "navigation.ncx"]
        ncxPath = ncxFiles[0]

        with open(ncxPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        # Book title lives in <docTitle><text>.
        titleTag = soup.find('docTitle')
        if titleTag:
            textTag = titleTag.find('text')
            bookTitle = textTag.get_text().strip() if textTag else "Unknown Title"
        else:
            bookTitle = "Unknown Title"

        # Find DTBook XML file (main content)
        dtbookFiles = list(basePath.glob("*.xml"))
        # NOTE(review): glob("*.xml") can never match a *.ncx name, so this
        # filter is a no-op; kept in case the original intent covered a
        # different naming scheme — confirm against real book samples.
        dtbookFiles = [f for f in dtbookFiles if not f.name.endswith('.ncx')]

        if dtbookFiles:
            # Prefer the NCX navigation structure for chapter boundaries.
            chapters = self._parse_dtbook_with_ncx(dtbookFiles[0], basePath, soup)
            if not chapters:
                # Fallback: parse the DTBook XML directly for content.
                chapters = self._parse_dtbook(dtbookFiles[0])
        else:
            # No DTBook XML: HTML-content DAISY 3, navigate purely by NCX.
            chapters = self._parse_daisy3_html(basePath, soup)

        book = Book(bookTitle)
        for chapter in chapters:
            book.add_chapter(chapter)
        return book

    def _parse_daisy3_html(self, basePath, ncxSoup):
        """Parse DAISY 3 with HTML content files (fallback).

        Walks every <navPoint> in the NCX and extracts paragraphs from the
        content file each one points at.
        """
        chapters = []
        for navPoint in ncxSoup.find_all('navPoint'):
            chapterTitle = self._nav_label(navPoint, "Untitled Chapter")

            content = navPoint.find('content')
            if not content or not content.get('src'):
                continue

            contentPath = basePath / content.get('src').split('#')[0]
            if not contentPath.exists():
                continue

            paragraphs = self._extract_paragraphs(contentPath)
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        return chapters

    def _parse_dtbook_with_ncx(self, dtbookPath, basePath, ncxSoup):
        """
        Parse DTBook using NCX navigation structure

        Args:
            dtbookPath: Path to DTBook XML file
            basePath: Base directory path
            ncxSoup: BeautifulSoup object of parsed NCX

        Returns:
            List of Chapter objects, or None if the NCX/DTBook pair cannot
            be matched up (callers then fall back to _parse_dtbook).
        """
        try:
            # Load DTBook content
            with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
                dtbookSoup = BeautifulSoup(f.read(), features='xml')

            navMap = ncxSoup.find('navMap')
            if not navMap:
                return None

            chapters = []
            # Only top-level navPoints: each one is a chapter.
            for navPoint in navMap.find_all('navPoint', recursive=False):
                chapterTitle = self._nav_label(navPoint, "Untitled")

                content = navPoint.find('content')
                if not content or not content.get('src'):
                    continue

                # The #fragment names the DTBook element the chapter starts at.
                parts = content.get('src').split('#')
                anchor = parts[1] if len(parts) > 1 else None
                if not anchor:
                    continue

                section = dtbookSoup.find(id=anchor)
                if not section:
                    continue

                paragraphs = self._paragraphs_from(section)
                if paragraphs:
                    chapter = Chapter(chapterTitle)
                    chapter.paragraphs = paragraphs
                    chapters.append(chapter)

            return chapters if chapters else None

        except Exception as e:
            # Best effort: any failure triggers the plain-DTBook fallback.
            print(f"Error parsing DTBook with NCX: {e}")
            return None

    def _parse_dtbook(self, dtbookPath):
        """Parse DTBook XML format: one chapter per <level1> section."""
        with open(dtbookPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), features='xml')

        chapters = []
        for level1 in soup.find_all('level1'):
            # Chapter title: first h1/h2/h3 heading, else the element's id.
            chapterTitle = None
            for hTag in ('h1', 'h2', 'h3'):
                heading = level1.find(hTag)
                if heading:
                    chapterTitle = heading.get_text().strip()
                    break
            if not chapterTitle:
                chapterTitle = level1.get('id', 'Untitled Chapter')

            paragraphs = self._paragraphs_from(level1)
            if paragraphs:
                chapter = Chapter(chapterTitle)
                chapter.paragraphs = paragraphs
                chapters.append(chapter)

        return chapters

    def _extract_paragraphs(self, htmlPath):
        """Extract plain-text paragraphs from an HTML content file.

        Tries <p> tags first, then <div> text, then splitting the whole
        <body> text on blank lines; returns the first non-empty result.
        """
        with open(htmlPath, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        paragraphs = self._paragraphs_from(soup)

        # No <p> tags: fall back to divs, skipping tiny fragments.
        # NOTE(review): nested divs repeat their parent's text here — this
        # matches the original behavior; confirm it is intended.
        if not paragraphs:
            for div in soup.find_all('div'):
                text = re.sub(r'\s+', ' ', div.get_text().strip())
                if text and len(text) > 10:  # Avoid tiny fragments
                    paragraphs.append(text)

        # Last resort: split body text by double newlines.
        if not paragraphs:
            body = soup.find('body')
            if body:
                for chunk in re.split(r'\n\n+', body.get_text()):
                    chunk = re.sub(r'\s+', ' ', chunk.strip())
                    if chunk:
                        paragraphs.append(chunk)

        return paragraphs

    def cleanup(self):
        """Clean up temporary files"""
        if self.tempDir and Path(self.tempDir).exists():
            shutil.rmtree(self.tempDir)
        # Reset even when the directory was already gone, so repeated
        # cleanup() calls are harmless and no stale path is retained.
        self.tempDir = None