From 1fed5922c3846ff54a105fec5ada9a38129e801c Mon Sep 17 00:00:00 2001 From: Storm Dragon Date: Thu, 21 Aug 2025 23:14:47 -0400 Subject: [PATCH] Ocr initial implementation complete. --- src/cthulhu/plugins/OCR/README.md | 210 ++++++++++++++++++++++++++++++ src/cthulhu/plugins/OCR/plugin.py | 67 +--------- src/cthulhu/plugins/meson.build | 2 +- src/cthulhu/settings.py | 2 +- 4 files changed, 213 insertions(+), 68 deletions(-) create mode 100644 src/cthulhu/plugins/OCR/README.md diff --git a/src/cthulhu/plugins/OCR/README.md b/src/cthulhu/plugins/OCR/README.md new file mode 100644 index 0000000..a3f5b25 --- /dev/null +++ b/src/cthulhu/plugins/OCR/README.md @@ -0,0 +1,210 @@ +# OCR Plugin for Cthulhu Screen Reader + +A powerful OCR (Optical Character Recognition) plugin that enables Cthulhu users to extract text from visual content including windows, desktop areas, and clipboard images. Originally based on the ocrdesktop project by Chrys, this plugin integrates seamlessly with Cthulhu's accessibility framework. 
+ +## Features + +- **Window OCR**: Extract text from the currently active window +- **Desktop OCR**: Extract text from the entire desktop screen +- **Clipboard OCR**: Extract text from images copied to the clipboard +- **Voice Announcements**: Clear audio feedback about OCR operations +- **Multi-threading**: Non-blocking OCR processing with progress tracking +- **Text Cleanup**: Automatic post-processing to improve OCR text quality + +## Keybindings + +| Key Combination | Action | Description | +|----------------|--------|-------------| +| `Cthulhu+Control+W` | OCR Active Window | Performs OCR on the currently focused window | +| `Cthulhu+Control+D` | OCR Desktop | Performs OCR on the entire desktop screen | +| `Cthulhu+Control+Shift+C` | OCR Clipboard | Performs OCR on image data from clipboard | + +## Dependencies + +### Required Dependencies +- **python3-pillow** (PIL) - Image processing library +- **python-pytesseract** - Python wrapper for Tesseract OCR +- **tesseract** - OCR engine (with language packs) +- **GTK3/GDK/Wnck** - For screenshot capture (usually pre-installed) + +### Installation Commands + +#### Arch Linux +```bash +sudo pacman -S python-pillow python-pytesseract tesseract tesseract-data-eng +``` + +#### Ubuntu/Debian +```bash +sudo apt install python3-pil python3-pytesseract tesseract-ocr tesseract-ocr-eng +``` + +#### Fedora +```bash +sudo dnf install python3-pillow python3-pytesseract tesseract tesseract-langpack-eng +``` + +### Additional Language Support +To add support for other languages, install additional Tesseract language packs: + +```bash +# Examples for different distributions: +# Arch: sudo pacman -S tesseract-data-fra tesseract-data-deu tesseract-data-spa +# Ubuntu: sudo apt install tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa +# Fedora: sudo dnf install tesseract-langpack-fra tesseract-langpack-deu tesseract-langpack-spa +``` + +## Usage + +1. **Enable the Plugin**: The OCR plugin is enabled by default in Cthulhu. 
If disabled, you can enable it through: + - Cthulhu Preferences → Plugins → Check "OCR" + - Or ensure `'OCR'` is in the `activePlugins` list in settings.py + +2. **Basic OCR Workflow**: + - Navigate to content you want to OCR + - Press the appropriate key combination + - Listen for "Performing OCR on [window/desktop/clipboard]" + - Wait for processing to complete + - OCR results will be announced via speech + +3. **Best Practices**: + - Ensure good contrast between text and background for better results + - Use window OCR for focused content (faster processing) + - Use desktop OCR for content spanning multiple windows + - Use clipboard OCR for images from web browsers or image viewers + +## Configuration + +### OCR Settings +The plugin uses the following default settings (configurable in plugin.py): + +```python +self._languageCode = 'eng' # Tesseract language code +self._scaleFactor = 3 # Image scaling for better OCR +self._grayscaleImg = False # Convert to grayscale +self._invertImg = False # Invert image colors +self._blackWhiteImg = False # Convert to black/white +self._blackWhiteImgValue = 200 # B/W threshold value +``` + +### Changing OCR Language +To change the default OCR language, modify `self._languageCode` in the plugin's `__init__` method: + +```python +# Examples: +self._languageCode = 'fra' # French +self._languageCode = 'deu' # German +self._languageCode = 'spa' # Spanish +``` + +## Troubleshooting + +### Common Issues + +#### "No text found in OCR scan" +- **Cause**: Poor image quality, unsupported language, or no text in captured area +- **Solutions**: + - Try different OCR mode (window vs desktop) + - Ensure text has good contrast + - Check if correct language pack is installed + - Verify text is actually visible in the captured area + +#### "Missing dependencies" message +- **Cause**: Required Python packages or Tesseract not installed +- **Solution**: Install missing packages using commands above + +#### OCR taking too long +- **Cause**: Large 
desktop screenshots or complex images +- **Solutions**: + - Use window OCR instead of desktop OCR when possible + - Close unnecessary windows before desktop OCR + - Consider lowering `_scaleFactor` (a lower value is faster, but may reduce recognition accuracy) + +#### No speech output +- **Cause**: Cthulhu speech settings or audio issues +- **Solutions**: + - Check Cthulhu speech settings + - Test other Cthulhu speech functions + - Verify audio system is working + +### Debug Information +OCR plugin debug messages are logged to Cthulhu's debug output. To enable debug logging: + +```bash +cthulhu --debug > ocr_debug.log 2>&1 +``` + +Look for messages starting with "OCRDesktop:" in the log file. The prefix is the plugin's class name (`OCRDesktop`), not its directory name (`OCR`). + +## Technical Details + +### Architecture +- **Base Class**: Extends `cthulhu.plugin.Plugin` +- **Threading**: Uses Python threading for non-blocking OCR processing +- **Image Processing**: PIL/Pillow for image manipulation and enhancement +- **OCR Engine**: Tesseract via pytesseract wrapper +- **Integration**: Uses Cthulhu's speech system for output + +### Image Processing Pipeline +1. **Capture**: Screenshot via GDK pixbuf system +2. **Scale**: Enlarge image by scale factor (default 3x) +3. **Transform**: Apply filters (grayscale, invert, etc.) if enabled +4. **OCR**: Process with Tesseract OCR engine +5. **Cleanup**: Remove extra whitespace and format text +6. 
**Present**: Announce results via Cthulhu speech + +### Text Post-Processing +The plugin automatically cleans OCR output by: +- Removing multiple consecutive spaces +- Eliminating empty lines +- Trimming leading/trailing whitespace +- Removing trailing newlines + +## Development + +### Plugin Structure +``` +src/cthulhu/plugins/OCR/ +├── __init__.py # Package import +├── plugin.py # Main plugin implementation +├── plugin.info # Plugin metadata +├── meson.build # Build system integration +└── README.md # This documentation +``` + +### Key Methods +- `_ocrActiveWindow()`: Captures and OCRs the active window +- `_ocrDesktop()`: Captures and OCRs the entire desktop +- `_ocrClipboard()`: OCRs an image from the clipboard +- `_performOCR()`: Core OCR processing logic +- `_presentOCRResult()`: Announces results via speech + +### Extending the Plugin +To add new OCR modes or features: + +1. Add a new keybinding in `_registerKeybindings()` +2. Create a handler method following the pattern `_ocrNewMode()`; like the existing handlers, it should set `_is_processing`, call `_announceOCRStart()`, and reset `_is_processing` in a `finally` block +3. Implement the image capture logic for the new mode +4. Use the existing `_performOCR()` and `_presentOCRResult()` methods + +## Credits + +- **Original ocrdesktop**: Created by Chrys (chrys87@users.noreply.github.com) +- **Cthulhu Integration**: Adapted by Storm Dragon for Cthulhu plugin system +- **Cthulhu Screen Reader**: https://git.stormux.org/storm/cthulhu +- **Tesseract OCR**: https://github.com/tesseract-ocr/tesseract + +## License + +This plugin is distributed under the GNU Lesser General Public License (LGPL) version 2.1 or later, consistent with the Cthulhu screen reader project. 
+ +## Support + +For issues, questions, or contributions: +- **Cthulhu Repository**: https://git.stormux.org/storm/cthulhu +- **Community**: IRC #stormux on irc.stormux.org +- **Email**: storm_dragon@stormux.org + +--- + +*Part of the Cthulhu Screen Reader project - Making the desktop accessible for everyone.* \ No newline at end of file diff --git a/src/cthulhu/plugins/OCR/plugin.py b/src/cthulhu/plugins/OCR/plugin.py index 3279401..3db6c63 100644 --- a/src/cthulhu/plugins/OCR/plugin.py +++ b/src/cthulhu/plugins/OCR/plugin.py @@ -23,13 +23,7 @@ from mimetypes import MimeTypes from cthulhu.plugin import Plugin, cthulhu_hookimpl from cthulhu import debug -# Import Cthulhu's sound system -try: - from cthulhu import sound - from cthulhu.sound_generator import Tone - SOUND_AVAILABLE = True -except ImportError: - SOUND_AVAILABLE = False +# Note: Removed complex beep system - simple announcements work perfectly! # PIL try: @@ -115,9 +109,6 @@ class OCRDesktop(Plugin): # Progress feedback self._is_processing = False - self._beep_thread = None - self._stop_beeping = False - self._player = None # Color analysis self._kdtDB = None @@ -127,14 +118,6 @@ class OCRDesktop(Plugin): # Set locale for tesseract locale.setlocale(locale.LC_ALL, 'C') - # Initialize sound player for progress beeps - if SOUND_AVAILABLE: - try: - self._player = sound.getPlayer() - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Sound player initialized", True) - except Exception as e: - debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Failed to initialize sound: {e}", True) - # Check dependencies self._checkDependencies() @@ -225,45 +208,6 @@ class OCRDesktop(Plugin): except Exception as e: debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error registering keybindings: {e}", True) - def _startProgressBeeps(self): - """Start playing progress beeps during OCR processing.""" - if not self._player: - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Cannot start beeps - no sound player", True) - return 
- - if self._beep_thread and self._beep_thread.is_alive(): - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Beeps already running", True) - return - - self._stop_beeping = False - self._beep_thread = threading.Thread(target=self._beepLoop, daemon=True) - self._beep_thread.start() - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Started progress beeps", True) - - def _stopProgressBeeps(self): - """Stop playing progress beeps.""" - self._stop_beeping = True - if self._beep_thread: - self._beep_thread.join(timeout=1.0) - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Stopped progress beeps", True) - - def _beepLoop(self): - """Loop that plays short system bell beeps every 0.5 seconds.""" - while not self._stop_beeping: - try: - # Just use the system bell - we know this works as short beeps - print("\a") - debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: System bell beep", True) - - except Exception as e: - debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: System bell error: {e}", True) - break - - # Wait 0.5 seconds before next beep - for _ in range(50): # Check every 0.01 seconds for quick stopping - if self._stop_beeping: - return - time.sleep(0.01) def _announceOCRStart(self, ocr_type): """Announce the start of OCR operation.""" @@ -288,19 +232,16 @@ class OCRDesktop(Plugin): self._is_processing = True self._announceOCRStart("window") - self._startProgressBeeps() try: if self._screenShotWindow(): self._performOCR() self._presentOCRResult() finally: - self._stopProgressBeeps() self._is_processing = False return True except Exception as e: - self._stopProgressBeeps() self._is_processing = False debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR window: {e}", True) return False @@ -316,19 +257,16 @@ class OCRDesktop(Plugin): self._is_processing = True self._announceOCRStart("desktop") - self._startProgressBeeps() try: if self._screenShotDesktop(): self._performOCR() self._presentOCRResult() finally: - self._stopProgressBeeps() 
self._is_processing = False return True except Exception as e: - self._stopProgressBeeps() self._is_processing = False debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR desktop: {e}", True) return False @@ -344,19 +282,16 @@ class OCRDesktop(Plugin): self._is_processing = True self._announceOCRStart("clipboard") - self._startProgressBeeps() try: if self._readClipboard(): self._performOCR() self._presentOCRResult() finally: - self._stopProgressBeeps() self._is_processing = False return True except Exception as e: - self._stopProgressBeeps() self._is_processing = False debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR clipboard: {e}", True) return False diff --git a/src/cthulhu/plugins/meson.build b/src/cthulhu/plugins/meson.build index 7f60ae3..f536f81 100644 --- a/src/cthulhu/plugins/meson.build +++ b/src/cthulhu/plugins/meson.build @@ -5,7 +5,7 @@ subdir('Clipboard') subdir('DisplayVersion') subdir('HelloCthulhu') subdir('IndentationAudio') -subdir('OCRDesktop') +subdir('OCR') subdir('PluginManager') subdir('SimplePluginSystem') subdir('hello_world') diff --git a/src/cthulhu/settings.py b/src/cthulhu/settings.py index 51ef075..c3c89aa 100644 --- a/src/cthulhu/settings.py +++ b/src/cthulhu/settings.py @@ -431,7 +431,7 @@ presentChatRoomLast = False presentLiveRegionFromInactiveTab = False # Plugins -activePlugins = ['AIAssistant', 'DisplayVersion', 'OCRDesktop', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] +activePlugins = ['AIAssistant', 'DisplayVersion', 'OCR', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] # AI Assistant settings (disabled by default for opt-in behavior) aiAssistantEnabled = True