Initial commit.
This commit is contained in:
169
src/txt_parser.py
Normal file
169
src/txt_parser.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""TXT parser for BookStorm - extracts text from plain text files."""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from src.book import Book, Chapter
|
||||
|
||||
|
||||
def detect_chapter_breaks(text):
    """Detect chapter breaks in text using various heuristics.

    The strategies are tried in order and the first one that produces
    chapters wins:

    1. "Chapter <number>" heading lines (at least two). The number may be
       digits, a Roman numeral or a spelled-out number ("Chapter One").
    2. Markdown-style headings, ``#`` to ``###`` (at least two).
    3. Form feed (page break) characters.
    4. Runs of three or more blank lines (at least three sections).

    For strategies 1 and 2, text that precedes the first heading is not
    part of any returned chapter.

    Args:
        text: Full text of the document.

    Returns:
        list of tuples: [(chapterTitle, chapterText), ...]. When no
        strategy applies, a single ("Full Text", text.strip()) entry.
    """
    numberWord = (
        r'(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve'
        r'|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen'
        r'|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred)'
    )

    # Pattern 1: Common chapter headings (case insensitive)
    # Matches: "Chapter 1", "Chapter One", "CHAPTER IV:", "Chapter Twenty-One"
    # Only spaces/tabs are allowed inside the heading: "\s" also matches
    # newlines, which would let a heading swallow the following text line.
    # The "\b" keeps ordinary words that merely start with Roman-numeral
    # letters (e.g. "Chapter divisions ...") from counting as headings.
    chapterPattern = re.compile(
        r'^(Chapter[ \t]+(?:\d+|[IVXLCDM]+|'
        + numberWord + r'(?:[- ]' + numberWord + r')*'
        + r')\b.*)$',
        re.MULTILINE | re.IGNORECASE
    )

    # Pattern 2: Markdown-style headings
    # Matches: "# Chapter Title", "## Section Title"
    # As above, the separator after the hashes must stay on the same line.
    markdownPattern = re.compile(
        r'^(#{1,3}[ \t]+.+)$',
        re.MULTILINE
    )

    # Pattern 3: Page breaks (form feed character)
    pageBreakPattern = re.compile(r'\f+')

    # Pattern 4: Multiple blank lines (3+)
    multiBlankPattern = re.compile(r'\n\s*\n\s*\n\s*\n+')

    # Try chapter headings first
    chapterMatches = list(chapterPattern.finditer(text))
    if len(chapterMatches) >= 2:
        return _chapters_from_matches(text, chapterMatches)

    # Try markdown headings; the leading "#" symbols are not part of the title
    markdownMatches = list(markdownPattern.finditer(text))
    if len(markdownMatches) >= 2:
        return _chapters_from_matches(
            text,
            markdownMatches,
            lambda title: re.sub(r'^#+\s*', '', title),
        )

    # Try page breaks; section numbers follow the position in the split,
    # so skipped empty parts leave gaps in the numbering.
    pageBreakParts = pageBreakPattern.split(text)
    if len(pageBreakParts) >= 2:
        chapters = [
            (f"Section {i + 1}", part.strip())
            for i, part in enumerate(pageBreakParts)
            if part.strip()
        ]
        if chapters:
            return chapters

    # Try multiple blank lines as separators
    multiBlankParts = multiBlankPattern.split(text)
    if len(multiBlankParts) >= 3:  # At least 3 sections
        chapters = []
        for i, part in enumerate(multiBlankParts):
            part = part.strip()
            if not part:
                continue
            # A short first line followed by more text is taken as the title
            firstLine, newline, rest = part.partition('\n')
            if newline and len(firstLine) < 100:
                chapters.append((firstLine.strip(), rest.strip()))
            else:
                chapters.append((f"Section {i + 1}", part))
        if chapters:
            return chapters

    # No clear chapter breaks found, treat as single chapter
    return [("Full Text", text.strip())]


def _chapters_from_matches(text, matches, cleanTitle=None):
    """Build (title, body) tuples from heading matches found in text.

    Each body runs from the end of its heading line up to the start of the
    next heading, or to the end of text for the last heading.

    Args:
        text: The text the matches were found in.
        matches: Heading regex matches in document order; group 1 is the
            heading line.
        cleanTitle: Optional callable applied to each stripped title.

    Returns:
        list of tuples: [(title, body), ...], one per match.
    """
    chapters = []
    for i, match in enumerate(matches):
        title = match.group(1).strip()
        if cleanTitle is not None:
            title = cleanTitle(title)
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chapters.append((title, text[match.end():end].strip()))
    return chapters
|
||||
|
||||
|
||||
def split_into_paragraphs(text):
    """Break text into a list of non-empty, stripped paragraphs.

    Paragraphs are first separated on blank lines. When that produces
    fewer than three paragraphs, every non-blank line of the text is
    returned as its own paragraph instead.
    """
    stripped = (block.strip() for block in re.split(r'\n\s*\n', text))
    byBlankLine = [block for block in stripped if block]

    # Enough blank-line separated paragraphs: use them as they are
    if len(byBlankLine) >= 3:
        return byBlankLine

    # Too few paragraphs: treat each non-blank line as a paragraph
    byLine = []
    for line in text.split('\n'):
        line = line.strip()
        if line:
            byLine.append(line)
    return byLine
|
||||
|
||||
|
||||
class TxtParser:
    """Parser for plain text files"""

    def __init__(self):
        pass

    def parse(self, txtPath):
        """Parse a TXT file and return Book object.

        The book title is the file name without its extension. The text is
        split into chapters with detect_chapter_breaks() and each chapter
        into paragraphs with split_into_paragraphs(). A chapter without
        any paragraph gets the single placeholder "(Empty chapter)".

        Args:
            txtPath: Path to the TXT file

        Returns:
            Book object with detected chapters or single chapter

        Raises:
            FileNotFoundError: If txtPath does not exist.
        """
        txtPath = Path(txtPath)

        if not txtPath.exists():
            raise FileNotFoundError(f"TXT file not found: {txtPath}")

        # Read as UTF-8 first. 'utf-8-sig' decodes plain UTF-8 as well and
        # drops a leading byte order mark; with plain 'utf-8' the mark stays
        # in the text and stops a heading on the first line from matching.
        try:
            text = txtPath.read_text(encoding='utf-8-sig')
        except UnicodeDecodeError:
            # Try with latin-1 as fallback (it accepts any byte sequence)
            text = txtPath.read_text(encoding='latin-1')

        # Use filename as book title
        book = Book(title=txtPath.stem)

        # Detect chapters and process each of them
        for chapterTitle, chapterText in detect_chapter_breaks(text):
            chapter = Chapter(title=chapterTitle)

            # Split into paragraphs; "or" supplies the placeholder when
            # the chapter has no paragraphs at all
            paragraphs = split_into_paragraphs(chapterText) or ["(Empty chapter)"]
            for paragraph in paragraphs:
                chapter.add_paragraph(paragraph)

            book.add_chapter(chapter)

        return book

    def cleanup(self):
        """Cleanup any resources (no-op for TXT parser)"""
        pass
|
||||
Reference in New Issue
Block a user