"""TXT parser for BookStorm - extracts text from plain text files."""

import re
from pathlib import Path

from src.book import Book, Chapter

# Heading such as "Chapter 1", "CHAPTER IV:" or "chapter 12 - The Return"
# (case insensitive; the number is digits or Roman-numeral letters).
# Only spaces/tabs may separate "Chapter" from its number: `\s` also matches
# newlines, which let the heading run onto the following line and swallow the
# first line of the chapter body into the title.
_CHAPTER_HEADING_RE = re.compile(
    r'^(Chapter[ \t]+[IVXLCDM\d]+.*)$',
    re.MULTILINE | re.IGNORECASE,
)

# Markdown-style heading: "# Chapter Title", "## Section Title" (1-3 hashes).
# Spaces/tabs only after the hashes, for the same reason as above.
_MARKDOWN_HEADING_RE = re.compile(r'^(#{1,3}[ \t]+.+)$', re.MULTILINE)

# One or more form feed characters (page break).
_PAGE_BREAK_RE = re.compile(r'\f+')

# Three or more blank (whitespace-only) lines in a row.
_MULTI_BLANK_RE = re.compile(r'\n\s*\n\s*\n\s*\n+')


def _split_at_headings(text, matches, cleanTitle=None):
    """Cut text into (title, body) pairs, one per heading match.

    Each body runs from the end of its heading line to the start of the next
    heading (or to the end of the text for the last heading).
    """
    chapters = []
    for i, match in enumerate(matches):
        title = match.group(1).strip()
        if cleanTitle is not None:
            title = cleanTitle(title)
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chapters.append((title, text[match.end():end].strip()))
    return chapters


def detect_chapter_breaks(text):
    """Detect chapter breaks in text using various heuristics.

    The strategies are tried in order and the first one that applies wins:

    1. "Chapter N" headings (at least two of them).
    2. Markdown headings (at least two of them).
    3. Form feed page breaks (at least one, with some non-empty part).
    4. Runs of three or more blank lines (at least three parts).
    5. Otherwise the whole text is returned as a single chapter.

    Note: for strategies 1 and 2, any text that precedes the first heading
    is not part of any returned chapter.

    Args:
        text: Full text of the document.

    Returns:
        list of tuples: [(chapterTitle, chapterText), ...]
    """
    # Try chapter headings first
    chapterMatches = list(_CHAPTER_HEADING_RE.finditer(text))
    if len(chapterMatches) >= 2:
        return _split_at_headings(text, chapterMatches)

    # Try markdown headings; the leading '#' markers are not part of the title
    markdownMatches = list(_MARKDOWN_HEADING_RE.finditer(text))
    if len(markdownMatches) >= 2:
        return _split_at_headings(
            text,
            markdownMatches,
            cleanTitle=lambda title: title.lstrip('#').lstrip(),
        )

    # Try page breaks
    pageBreakParts = _PAGE_BREAK_RE.split(text)
    if len(pageBreakParts) >= 2:
        # Number only the non-empty parts so titles stay consecutive even
        # when the text starts/ends with a form feed.
        sections = [part.strip() for part in pageBreakParts if part.strip()]
        if sections:
            return [
                (f"Section {number}", section)
                for number, section in enumerate(sections, start=1)
            ]

    # Try multiple blank lines as separators
    multiBlankParts = _MULTI_BLANK_RE.split(text)
    if len(multiBlankParts) >= 3:  # At least 3 sections
        chapters = []
        sections = [part.strip() for part in multiBlankParts if part.strip()]
        for number, section in enumerate(sections, start=1):
            firstLine, newline, rest = section.partition('\n')
            if newline and len(firstLine) < 100:
                # A short first line followed by more text might be a title
                chapters.append((firstLine.strip(), rest.strip()))
            else:
                chapters.append((f"Section {number}", section))
        if chapters:
            return chapters

    # No clear chapter breaks found, treat as single chapter
    return [("Full Text", text.strip())]


def split_into_paragraphs(text):
    """Split text into paragraphs.

    Paragraphs are separated by blank lines. If that yields fewer than three
    paragraphs, every non-empty line is treated as its own paragraph instead.

    Args:
        text: Text of a single chapter.

    Returns:
        list of non-empty paragraph strings with surrounding whitespace
        removed.
    """
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]

    # Very few paragraphs: fall back to single newlines
    if len(paragraphs) < 3:
        paragraphs = [line.strip() for line in text.split('\n') if line.strip()]

    return paragraphs


class TxtParser:
    """Parser for plain text files"""

    def __init__(self):
        """Create a parser; it holds no state."""
        pass

    def parse(self, txtPath):
        """Parse a TXT file and return Book object.

        The file is decoded as UTF-8 (a leading byte order mark, if present,
        is dropped); if that fails it is re-read as latin-1. The book title
        is the file name without its extension.

        Args:
            txtPath: Path to the TXT file

        Returns:
            Book object with detected chapters or single chapter

        Raises:
            FileNotFoundError: If txtPath does not exist.
        """
        txtPath = Path(txtPath)

        if not txtPath.exists():
            raise FileNotFoundError(f"TXT file not found: {txtPath}")

        # 'utf-8-sig' reads UTF-8 with or without a BOM. With plain 'utf-8'
        # a BOM stays in the text as U+FEFF and keeps a heading on the very
        # first line from matching at line start.
        try:
            text = txtPath.read_text(encoding='utf-8-sig')
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this cannot fail
            text = txtPath.read_text(encoding='latin-1')

        # Use filename as book title
        book = Book(title=txtPath.stem)

        for chapterTitle, chapterText in detect_chapter_breaks(text):
            chapter = Chapter(title=chapterTitle)

            paragraphs = split_into_paragraphs(chapterText)
            if not paragraphs:
                # Keep the chapter visible even when it has no text
                chapter.add_paragraph("(Empty chapter)")
            else:
                for paragraph in paragraphs:
                    chapter.add_paragraph(paragraph)

            book.add_chapter(chapter)

        return book

    def cleanup(self):
        """Cleanup any resources (no-op for TXT parser)"""
        pass