170 lines
5.4 KiB
Python
170 lines
5.4 KiB
Python
"""TXT parser for BookStorm - extracts text from plain text files."""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from src.book import Book, Chapter
|
|
|
|
|
|
# Heading line such as "Chapter 1", "CHAPTER IV:" or "chapter 12 - The Storm".
# Only digits and Roman-numeral letters are recognised after "Chapter";
# spelled-out numbers ("Chapter One") do not match.
# Separators are restricted to spaces/tabs rather than \s: \s also matches
# newlines, which let a bare "Chapter 1" line run on into the next line and
# swallow the first line of the chapter body into the title.
# NOTE: because of IGNORECASE the Roman-numeral class also accepts lowercase
# letters, so a line beginning "Chapter d..." is treated as a heading.
_CHAPTER_HEADING_RE = re.compile(
    r'^(Chapter[ \t]+[IVXLCDM\d]+[:\- \t]*.*)$',
    re.MULTILINE | re.IGNORECASE
)

# Markdown-style heading: "# Title", "## Title" or "### Title" on one line.
_MARKDOWN_HEADING_RE = re.compile(r'^(#{1,3}[ \t]+.+)$', re.MULTILINE)

# Leading "#" markers (and following whitespace) removed from markdown titles.
_MARKDOWN_MARKER_RE = re.compile(r'^#+\s*')

# One or more form feed characters (page break).
_PAGE_BREAK_RE = re.compile(r'\f+')

# A run of at least four newlines, i.e. three or more blank lines.
_MULTI_BLANK_RE = re.compile(r'\n\s*\n\s*\n\s*\n+')

# Title given to non-blank text that precedes the first detected heading.
_FRONT_MATTER_TITLE = "Front Matter"

# A first line at least this long is assumed to be body text, not a title.
_MAX_TITLE_LENGTH = 100


def _chapters_from_headings(text, matches, strip_markers=False):
    """Cut text into (title, body) pairs at each heading match."""
    chapters = []

    # Keep whatever comes before the first heading instead of dropping it.
    front_matter = text[:matches[0].start()].strip()
    if front_matter:
        chapters.append((_FRONT_MATTER_TITLE, front_matter))

    for index, match in enumerate(matches):
        title = match.group(1).strip()
        if strip_markers:
            title = _MARKDOWN_MARKER_RE.sub('', title)
        # The body runs from the end of this heading to the next heading
        # (or to the end of the text for the last one).
        if index + 1 < len(matches):
            body_end = matches[index + 1].start()
        else:
            body_end = len(text)
        chapters.append((title, text[match.end():body_end].strip()))
    return chapters


def _numbered_sections(parts, guess_titles=False):
    """Turn non-blank parts into (title, body) pairs, numbered from 1."""
    sections = []
    for part in parts:
        body = part.strip()
        if not body:
            continue
        # Number by emitted position so skipped blank parts leave no gaps.
        title = f"Section {len(sections) + 1}"
        if guess_titles:
            first_line, newline, remainder = body.partition('\n')
            # A short first line followed by more text is taken as the title.
            if newline and len(first_line) < _MAX_TITLE_LENGTH:
                title = first_line.strip()
                body = remainder.strip()
        sections.append((title, body))
    return sections


def detect_chapter_breaks(text):
    """Detect chapter breaks in text using various heuristics.

    Strategies are tried in order and the first one that applies wins:

    1. "Chapter <number>" heading lines (at least two required).
    2. Markdown headings with one to three "#" (at least two required).
    3. Form feed characters, giving sections titled "Section N".
    4. Runs of three or more blank lines (at least three parts required);
       a short first line of a part is used as its title, otherwise the
       part is titled "Section N".
    5. Otherwise the whole text is returned as one chapter, "Full Text".

    For strategies 1 and 2, non-blank text before the first heading is
    returned as a leading "Front Matter" chapter. "Section N" titles are
    numbered consecutively from 1 over the sections actually returned.

    Args:
        text: Full text of the document.

    Returns:
        list of tuples: [(chapterTitle, chapterText), ...]
    """
    chapter_matches = list(_CHAPTER_HEADING_RE.finditer(text))
    if len(chapter_matches) >= 2:
        return _chapters_from_headings(text, chapter_matches)

    markdown_matches = list(_MARKDOWN_HEADING_RE.finditer(text))
    if len(markdown_matches) >= 2:
        return _chapters_from_headings(
            text, markdown_matches, strip_markers=True
        )

    page_parts = _PAGE_BREAK_RE.split(text)
    if len(page_parts) >= 2:
        chapters = _numbered_sections(page_parts)
        # All parts may be blank; fall through to the next strategy then.
        if chapters:
            return chapters

    blank_parts = _MULTI_BLANK_RE.split(text)
    if len(blank_parts) >= 3:
        chapters = _numbered_sections(blank_parts, guess_titles=True)
        if chapters:
            return chapters

    # No clear chapter breaks found, treat as single chapter
    return [("Full Text", text.strip())]
|
|
|
|
|
|
def split_into_paragraphs(text):
    """Split text into a list of non-empty, whitespace-trimmed paragraphs.

    The text is first split on blank lines. If that produces fewer than
    three paragraphs, the text is split on every newline instead, so each
    non-blank line becomes its own paragraph.
    """
    def _trimmed_non_blank(pieces):
        # Trim every piece and drop the ones that are only whitespace.
        return [piece.strip() for piece in pieces if piece.strip()]

    blocks = _trimmed_non_blank(re.split(r'\n\s*\n', text))
    if len(blocks) >= 3:
        return blocks

    # Too few blank-line separated blocks: treat each line as a paragraph.
    return _trimmed_non_blank(text.split('\n'))
|
|
|
|
|
|
class TxtParser:
    """Parser for plain text files"""

    def __init__(self):
        pass

    def parse(self, txtPath):
        """Parse a TXT file and return Book object.

        The file is decoded as UTF-8 (a leading byte order mark is
        discarded); if that fails it is re-read as latin-1. The book title
        is the file name without its extension. Chapters come from
        detect_chapter_breaks() and each chapter's paragraphs from
        split_into_paragraphs().

        Args:
            txtPath: Path to the TXT file

        Returns:
            Book object with detected chapters or single chapter

        Raises:
            FileNotFoundError: If txtPath does not exist.
        """
        txtPath = Path(txtPath)

        if not txtPath.exists():
            raise FileNotFoundError(f"TXT file not found: {txtPath}")

        try:
            # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM.
            # With plain 'utf-8' the BOM stays in the text as '\ufeff', which
            # stops a heading on the first line from being detected and
            # leaks the BOM into the first paragraph.
            with open(txtPath, 'r', encoding='utf-8-sig') as f:
                text = f.read()
        except UnicodeDecodeError:
            # Try with latin-1 as fallback (it can decode any byte sequence)
            with open(txtPath, 'r', encoding='latin-1') as f:
                text = f.read()

        # Use filename as book title
        book = Book(title=txtPath.stem)

        for chapterTitle, chapterText in detect_chapter_breaks(text):
            chapter = Chapter(title=chapterTitle)

            paragraphs = split_into_paragraphs(chapterText)
            if not paragraphs:
                # Give a chapter with no text a placeholder paragraph.
                chapter.add_paragraph("(Empty chapter)")
            else:
                for paragraph in paragraphs:
                    chapter.add_paragraph(paragraph)

            book.add_chapter(chapter)

        return book

    def cleanup(self):
        """Cleanup any resources (no-op for TXT parser)"""
        pass
|