Initial commit.

2025-10-04 02:55:01 -04:00
commit 1d19ed377c
16 changed files with 4401 additions and 0 deletions
--- a/src/txt_parser.py
+++ b/src/txt_parser.py
@@ -0,0 +1,169 @@
+"""TXT parser for BookStorm - extracts text from plain text files."""
+
+import re
+from pathlib import Path
+from src.book import Book, Chapter
+
+
+# Spelled-out numbers accepted after "Chapter" (e.g. "Chapter Twenty-One").
+_NUMBER_WORD = (
+    r'(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve'
+    r'|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty'
+    r'|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred)'
+)
+
+# Chapter headings: "Chapter 1", "CHAPTER IV:", "Chapter One - Title".
+# [ \t] (not \s) keeps a heading on one line, so the first line of the body
+# is never swallowed into the title; \b rejects prose such as
+# "Chapter in which ..." that merely starts with a Roman-numeral letter.
+_CHAPTER_HEADING = re.compile(
+    r'^(Chapter[ \t]+(?:\d+|[IVXLCDM]+|%s(?:[- ]%s)*)\b.*)$'
+    % (_NUMBER_WORD, _NUMBER_WORD),
+    re.MULTILINE | re.IGNORECASE,
+)
+
+# Markdown headings, levels 1-3: "# Chapter Title", "## Section Title".
+_MARKDOWN_HEADING = re.compile(r'^(#{1,3}[ \t]+.+)$', re.MULTILINE)
+_MARKDOWN_PREFIX = re.compile(r'^#+\s*')
+
+_PAGE_BREAK = re.compile(r'\f+')  # one or more form feed characters
+_BLANK_RUN = re.compile(r'\n\s*\n\s*\n\s*\n+')  # three or more blank lines
+
+
+def _split_at_headings(text, matches, titlePrefix=None):
+    """Cut text at each heading match and return (title, body) tuples.
+
+    titlePrefix, when given, is a compiled pattern removed from each title.
+    """
+    chapters = []
+    # Keep text that precedes the first heading instead of dropping it.
+    frontMatter = text[:matches[0].start()].strip()
+    if frontMatter:
+        chapters.append(("Front Matter", frontMatter))
+    for i, match in enumerate(matches):
+        title = match.group(1).strip()
+        if titlePrefix is not None:
+            title = titlePrefix.sub('', title)
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+        chapters.append((title, text[match.end():end].strip()))
+    return chapters
+
+
+def detect_chapter_breaks(text):
+    """Detect chapter breaks in text using various heuristics.
+
+    Strategies are tried in order and the first one that applies wins:
+    "Chapter N" headings (two or more), Markdown headings (two or more),
+    form-feed page breaks (two or more non-empty pages), then runs of three
+    or more blank lines (three or more non-empty sections).
+
+    Args:
+        text: Full text of the document.
+
+    Returns:
+        list of tuples: [(chapterTitle, chapterText), ...]. Text before the
+        first heading is returned as a leading "Front Matter" entry. If no
+        strategy applies, the result is [("Full Text", text.strip())].
+    """
+    headingStyles = ((_CHAPTER_HEADING, None), (_MARKDOWN_HEADING, _MARKDOWN_PREFIX))
+    for pattern, titlePrefix in headingStyles:
+        matches = list(pattern.finditer(text))
+        if len(matches) >= 2:
+            return _split_at_headings(text, matches, titlePrefix)
+
+    # Blank pieces are discarded first so section numbers stay consecutive
+    # and a stray leading/trailing form feed is not mistaken for a break.
+    pages = [part.strip() for part in _PAGE_BREAK.split(text) if part.strip()]
+    if len(pages) >= 2:
+        return [(f"Section {i}", page) for i, page in enumerate(pages, 1)]
+
+    sections = [part.strip() for part in _BLANK_RUN.split(text) if part.strip()]
+    if len(sections) >= 3:
+        chapters = []
+        for i, section in enumerate(sections, 1):
+            # A short first line followed by more text is taken as the title.
+            firstLine, newline, rest = section.partition('\n')
+            if newline and len(firstLine) < 100:
+                chapters.append((firstLine.strip(), rest.strip()))
+            else:
+                chapters.append((f"Section {i}", section))
+        return chapters
+
+    # No clear chapter breaks found, treat as single chapter
+    return [("Full Text", text.strip())]
+
+
+def split_into_paragraphs(text):
+    """Break text into a list of stripped, non-empty paragraphs.
+
+    Blank lines are treated as paragraph separators. When that produces
+    fewer than three paragraphs, every non-blank line becomes its own
+    paragraph instead.
+    """
+    def nonBlank(pieces):
+        # Trim each piece and discard those that are only whitespace
+        trimmed = (piece.strip() for piece in pieces)
+        return [piece for piece in trimmed if piece]
+
+    byBlankLine = nonBlank(re.split(r'\n\s*\n', text))
+    if len(byBlankLine) >= 3:
+        return byBlankLine
+    return nonBlank(text.split('\n'))
+
+
+class TxtParser:
+    """Parser for plain text files"""
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _read_text(txtPath):
+        """Read the file as text, trying UTF-8 first and then latin-1.
+
+        'utf-8-sig' is used rather than 'utf-8' so that a leading byte order
+        mark is dropped; otherwise it would stay glued to the first line and
+        stop a heading on that line from matching at line start.
+        """
+        try:
+            with open(txtPath, 'r', encoding='utf-8-sig') as f:
+                return f.read()
+        except UnicodeDecodeError:
+            # latin-1 maps every byte to a character, so this cannot fail
+            # to decode (though non-latin-1 input may come out garbled).
+            with open(txtPath, 'r', encoding='latin-1') as f:
+                return f.read()
+
+    def parse(self, txtPath):
+        """Parse a TXT file and return Book object.
+
+        Args:
+            txtPath: Path to the TXT file
+
+        Returns:
+            Book object with detected chapters or single chapter
+
+        Raises:
+            FileNotFoundError: If txtPath does not exist.
+        """
+        txtPath = Path(txtPath)
+        if not txtPath.exists():
+            raise FileNotFoundError(f"TXT file not found: {txtPath}")
+
+        text = self._read_text(txtPath)
+        # Use filename as book title
+        book = Book(title=txtPath.stem)
+
+        for chapterTitle, chapterText in detect_chapter_breaks(text):
+            chapter = Chapter(title=chapterTitle)
+            # A chapter with no text still gets a placeholder paragraph
+            paragraphs = split_into_paragraphs(chapterText) or ["(Empty chapter)"]
+            for paragraph in paragraphs:
+                chapter.add_paragraph(paragraph)
+            book.add_chapter(chapter)
+
+        return book
+
+    def cleanup(self):
+        """Cleanup any resources (no-op for TXT parser)"""
+        pass