diff --git a/distro-packages/Arch-Linux/PKGBUILD b/distro-packages/Arch-Linux/PKGBUILD
index 1186f51..b0a7540 100644
--- a/distro-packages/Arch-Linux/PKGBUILD
+++ b/distro-packages/Arch-Linux/PKGBUILD
@@ -31,7 +31,6 @@ depends=(
python-dasbus
# AI Assistant dependencies (for screenshots, HTTP requests, and actions)
- python-pillow
python-requests
python-pyautogui
@@ -58,6 +57,12 @@ optdepends=(
'openai-codex: ChatGPT AI provider support'
'gemini-cli: Gemini AI provider support'
'ollama: Local AI model support'
+
+ # OCR plugin dependencies (optional)
+ 'python-pillow: Image processing for OCR and AI Assistant'
+ 'python-pytesseract: Python wrapper for Tesseract OCR engine'
+ 'tesseract: OCR engine for text recognition'
+ 'tesseract-data-eng: English language data for Tesseract'
)
makedepends=(
git
diff --git a/src/cthulhu/cthulhu-setup.ui b/src/cthulhu/cthulhu-setup.ui
index e275007..bac0ecf 100644
--- a/src/cthulhu/cthulhu-setup.ui
+++ b/src/cthulhu/cthulhu-setup.ui
@@ -7,6 +7,20 @@
1
10
+
+
True
diff --git a/src/cthulhu/cthulhuVersion.py b/src/cthulhu/cthulhuVersion.py
index e493402..565763c 100644
--- a/src/cthulhu/cthulhuVersion.py
+++ b/src/cthulhu/cthulhuVersion.py
@@ -23,5 +23,5 @@
# Fork of Orca Screen Reader (GNOME)
# Original source: https://gitlab.gnome.org/GNOME/orca
-version = "2025.08.21"
+version = "2025.08.22"
codeName = "testing"
diff --git a/src/cthulhu/cthulhu_gui_prefs.py b/src/cthulhu/cthulhu_gui_prefs.py
index 07cafc9..8c7ecf4 100644
--- a/src/cthulhu/cthulhu_gui_prefs.py
+++ b/src/cthulhu/cthulhu_gui_prefs.py
@@ -1821,6 +1821,10 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
# AI Assistant settings
#
self._initAIState()
+
+ # OCR Plugin settings
+ #
+ self._initOCRState()
def __initProfileCombo(self):
"""Adding available profiles and setting active as the active one"""
@@ -1945,6 +1949,47 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
self.aiApiKeyEntry.set_placeholder_text("No API key needed - uses local Ollama")
else:
self.aiApiKeyEntry.set_placeholder_text("Path to API key file")
+
+    def _initOCRState(self):
+        """Populate the OCR Plugin tab from the active preferences.
+
+        Every widget is looked up by its builder id and kept on the instance
+        under an attribute of the same name. Its value is then taken from
+        self.prefsDict, falling back to the same-named default in the
+        ``settings`` module when the key is absent.
+        """
+        prefs = self.prefsDict
+
+        # (widget id, preference key, setter to call on the widget)
+        bindings = (
+            ("ocrLanguageEntry", "ocrLanguageCode", "set_text"),
+            ("ocrScaleSpinButton", "ocrScaleFactor", "set_value"),
+            ("ocrGrayscaleCheckButton", "ocrGrayscaleImg", "set_active"),
+            ("ocrInvertCheckButton", "ocrInvertImg", "set_active"),
+            ("ocrBlackWhiteCheckButton", "ocrBlackWhiteImg", "set_active"),
+            ("ocrBlackWhiteValueSpinButton", "ocrBlackWhiteImgValue", "set_value"),
+            ("ocrColorCalculationCheckButton", "ocrColorCalculation", "set_active"),
+            ("ocrCopyToClipboardCheckButton", "ocrCopyToClipboard", "set_active"),
+        )
+
+        # First pass: store a reference to every widget.
+        for widgetId, _prefKey, _setterName in bindings:
+            setattr(self, widgetId, self.get_widget(widgetId))
+
+        # Second pass: push the stored (or default) value into each widget.
+        for widgetId, prefKey, setterName in bindings:
+            value = prefs.get(prefKey, getattr(settings, prefKey))
+            applyValue = getattr(getattr(self, widgetId), setterName)
+            applyValue(value)
def _updateCthulhuModifier(self):
combobox = self.get_widget("cthulhuModifierComboBox")
@@ -3835,4 +3880,37 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
if 0 <= activeIndex < len(qualities):
self.prefsDict["aiScreenshotQuality"] = qualities[activeIndex]
+ # OCR Plugin Settings Handlers
+    def ocrLanguageChanged(self, widget):
+        """Store the language entry's text as the "ocrLanguageCode" preference."""
+        languageCode = widget.get_text()
+        self.prefsDict["ocrLanguageCode"] = languageCode
+
+    def ocrScaleChanged(self, widget):
+        """Store the scale spin button's value, truncated to int, as "ocrScaleFactor"."""
+        scaleFactor = int(widget.get_value())
+        self.prefsDict["ocrScaleFactor"] = scaleFactor
+
+    def ocrGrayscaleToggled(self, widget):
+        """Store the grayscale checkbox state as the "ocrGrayscaleImg" preference."""
+        isActive = widget.get_active()
+        self.prefsDict["ocrGrayscaleImg"] = isActive
+
+    def ocrInvertToggled(self, widget):
+        """Store the invert checkbox state as the "ocrInvertImg" preference."""
+        isActive = widget.get_active()
+        self.prefsDict["ocrInvertImg"] = isActive
+
+    def ocrBlackWhiteToggled(self, widget):
+        """Store the black/white checkbox state as the "ocrBlackWhiteImg" preference."""
+        isActive = widget.get_active()
+        self.prefsDict["ocrBlackWhiteImg"] = isActive
+
+    def ocrBlackWhiteValueChanged(self, widget):
+        """Store the threshold spin button's value, truncated to int, as "ocrBlackWhiteImgValue"."""
+        threshold = int(widget.get_value())
+        self.prefsDict["ocrBlackWhiteImgValue"] = threshold
+
+    def ocrColorCalculationToggled(self, widget):
+        """Store the color-calculation checkbox state as the "ocrColorCalculation" preference."""
+        isActive = widget.get_active()
+        self.prefsDict["ocrColorCalculation"] = isActive
+
+    def ocrCopyToClipboardToggled(self, widget):
+        """Store the copy-to-clipboard checkbox state as the "ocrCopyToClipboard" preference."""
+        isActive = widget.get_active()
+        self.prefsDict["ocrCopyToClipboard"] = isActive
+
diff --git a/src/cthulhu/plugins/OCR/README.md b/src/cthulhu/plugins/OCR/README.md
index a3f5b25..36d75a6 100644
--- a/src/cthulhu/plugins/OCR/README.md
+++ b/src/cthulhu/plugins/OCR/README.md
@@ -1,15 +1,32 @@
# OCR Plugin for Cthulhu Screen Reader
-A powerful OCR (Optical Character Recognition) plugin that enables Cthulhu users to extract text from visual content including windows, desktop areas, and clipboard images. Originally based on the ocrdesktop project by Chrys, this plugin integrates seamlessly with Cthulhu's accessibility framework.
+A powerful OCR (Optical Character Recognition) plugin that enables Cthulhu users to extract text from visual content and interact with it through precise coordinate mapping. Originally based on the ocrdesktop project by Chrys, this plugin has been enhanced with interactive features and comprehensive settings integration.
## Features
+### 🔍 **OCR Operations**
- **Window OCR**: Extract text from the currently active window
- **Desktop OCR**: Extract text from the entire desktop screen
- **Clipboard OCR**: Extract text from images copied to the clipboard
-- **Voice Announcements**: Clear audio feedback about OCR operations
-- **Multi-threading**: Non-blocking OCR processing with progress tracking
-- **Text Cleanup**: Automatic post-processing to improve OCR text quality
+- **Interactive OCR**: OCR with coordinate mapping for clicking and navigation
+
+### 🎯 **Interactive Coordinate Mapping**
+- **Precise Clicking**: Click any text found in OCR results using exact coordinates
+- **Dual View Modes**: Toggle between text view and interactive coordinate table
+- **Safety Confirmation**: Preview click coordinates before executing
+- **Real-time Navigation**: Browse OCR results and click immediately
+
+### ⚙️ **Comprehensive Settings**
+- **Language Configuration**: Support for all Tesseract language packs
+- **Image Processing**: Grayscale, invert, black/white, and scaling options
+- **Clipboard Integration**: Automatic copying of OCR results to clipboard
+- **Quality Tuning**: Adjustable parameters for optimal OCR accuracy
+
+### 🖥️ **Accessibility Integration**
+- **Voice Announcements**: Clear audio feedback for all operations
+- **Keyboard Navigation**: Full keyboard control of interactive features
+- **Settings GUI**: Integrated settings tab in Cthulhu preferences
+- **Non-blocking Processing**: Multi-threaded operation with progress tracking
## Keybindings
@@ -18,6 +35,15 @@ A powerful OCR (Optical Character Recognition) plugin that enables Cthulhu users
| `Cthulhu+Control+W` | OCR Active Window | Performs OCR on the currently focused window |
| `Cthulhu+Control+D` | OCR Desktop | Performs OCR on the entire desktop screen |
| `Cthulhu+Control+Shift+C` | OCR Clipboard | Performs OCR on image data from clipboard |
+| `Cthulhu+Control+F` | **Interactive OCR** | **Opens OCR results window with coordinate mapping** |
+
+### Interactive OCR Window Controls
+| Key | Action | Description |
+|-----|--------|-------------|
+| `Alt+V` | Toggle View | Switch between text view and coordinate table |
+| `Enter` | Click Selected | Click the text at the selected coordinates |
+| `Escape` | Close Window | Close the OCR results window |
+| `Arrow Keys` | Navigate | Move through OCR results in table view |
## Dependencies
@@ -67,34 +93,68 @@ To add support for other languages, install additional Tesseract language packs:
- Wait for processing to complete
- OCR results will be announced via speech
-3. **Best Practices**:
+3. **Interactive OCR Workflow**:
+ - Press `Cthulhu+Control+F` to open OCR results window
+ - Wait for "Performing OCR on window for interactive results"
+ - Use `Alt+V` to toggle between text and coordinate table views
+ - Navigate with arrow keys in table view to find desired text
+ - Press `Enter` to click on the selected text location
+ - Confirm the click action in the safety dialog
+
+4. **Best Practices**:
- Ensure good contrast between text and background for better results
- Use window OCR for focused content (faster processing)
- Use desktop OCR for content spanning multiple windows
- Use clipboard OCR for images from web browsers or image viewers
+ - Enable "Copy Results to Clipboard" for easy text retrieval
+ - Adjust scale factor for small or blurry text (try 5-7)
## Configuration
-### OCR Settings
-The plugin uses the following default settings (configurable in plugin.py):
+### OCR Settings GUI
+Access comprehensive OCR settings through Cthulhu Preferences:
-```python
-self._languageCode = 'eng' # Tesseract language code
-self._scaleFactor = 3 # Image scaling for better OCR
-self._grayscaleImg = False # Convert to grayscale
-self._invertImg = False # Invert image colors
-self._blackWhiteImg = False # Convert to black/white
-self._blackWhiteImgValue = 200 # B/W threshold value
-```
+1. **Open Cthulhu Preferences**: `~/.local/bin/cthulhu -s`
+2. **Navigate to OCR Tab**: Use keyboard navigation to find the OCR settings tab
+3. **Configure Settings**: Adjust all OCR parameters through the accessible interface
-### Changing OCR Language
-To change the default OCR language, modify `self._languageCode` in the plugin's `__init__` method:
+### Available Settings
-```python
-# Examples:
-self._languageCode = 'fra' # French
-self._languageCode = 'deu' # German
-self._languageCode = 'spa' # Spanish
+#### **Language Configuration**
+- **Language Code**: Tesseract language pack to use (default: 'eng')
+ - Examples: 'fra' (French), 'deu' (German), 'spa' (Spanish)
+ - Use '+' for multiple languages: 'eng+fra' for English and French
+
+#### **Image Processing**
+- **Scale Factor**: Image scaling multiplier (1-10, default: 3)
+ - Higher values improve OCR accuracy for small text
+ - Lower values process faster but may miss details
+- **Grayscale Image**: Convert to grayscale for better text recognition
+- **Invert Image**: Invert colors (useful for white text on dark backgrounds)
+- **Black and White Image**: Convert to pure black/white with threshold
+- **Black/White Threshold**: Threshold value for black/white conversion (0-255, default: 200)
+
+#### **Advanced Features**
+- **Analyze Colors**: Extract color information from OCR regions (requires scipy/webcolors). Color analysis is not implemented yet, so the Color column currently always shows 'unknown'.
+- **Copy Results to Clipboard**: Automatically copy all OCR results to system clipboard
+
+### Configuration File
+Settings are automatically stored in Cthulhu's configuration system:
+- **Global Settings**: `~/.local/share/cthulhu/user-settings.conf`
+- **Profile Settings**: `~/.local/share/cthulhu/app-settings/[profile]/`
+
+### Example Configuration Values
+```json
+{
+ "ocrLanguageCode": "eng",
+ "ocrScaleFactor": 3,
+ "ocrGrayscaleImg": false,
+ "ocrInvertImg": false,
+ "ocrBlackWhiteImg": false,
+ "ocrBlackWhiteImgValue": 200,
+ "ocrColorCalculation": false,
+ "ocrCopyToClipboard": true
+}
```
## Troubleshooting
@@ -127,6 +187,28 @@ self._languageCode = 'spa' # Spanish
- Test other Cthulhu speech functions
- Verify audio system is working
+#### Interactive OCR window doesn't open
+- **Cause**: GTK dependencies missing or display issues
+- **Solutions**:
+ - Ensure GTK 3 and its Python bindings (PyGObject) are installed
+ - Check display/Wayland/X11 compatibility
+ - Verify Cthulhu GUI components are working
+
+#### Click coordinates are inaccurate
+- **Cause**: Window movement, scaling, or coordinate calculation errors
+- **Solutions**:
+ - Ensure window hasn't moved since OCR capture
+ - Try recapturing with `Cthulhu+Control+F`
+ - Check display scaling settings
+ - Verify no window decoration changes occurred
+
+#### Clipboard copy not working
+- **Cause**: Clipboard setting disabled or GTK clipboard issues
+- **Solutions**:
+ - Enable "Copy Results to Clipboard" in OCR settings
+ - Test clipboard functionality with other applications
+ - Check GTK clipboard permissions
+
### Debug Information
OCR plugin debug messages are logged to Cthulhu's debug output. To enable debug logging:
@@ -176,8 +258,18 @@ src/cthulhu/plugins/OCR/
- `_ocrActiveWindow()`: Captures and OCRs active window
- `_ocrDesktop()`: Captures and OCRs entire desktop
- `_ocrClipboard()`: OCRs image from clipboard
-- `_performOCR()`: Core OCR processing logic
-- `_presentOCRResult()`: Announces results via speech
+- `_showOCRResultsWindow()`: **NEW** - Interactive OCR with coordinate mapping
+- `_performOCR()`: Core OCR processing with coordinate extraction
+- `_presentOCRResult()`: Announces results via speech and clipboard
+- `_createOCRResultsWindow()`: **NEW** - Creates interactive GTK results window
+- `_clickSelectedText()`: **NEW** - Executes click at OCR coordinates
+
+### Interactive Features Architecture
+- **Coordinate Mapping**: Uses `pytesseract.image_to_data()` to extract word positions
+- **Screen Transformation**: Converts OCR coordinates to actual screen coordinates
+- **GTK Interface**: Accessible results window with text and table views
+- **Click Safety**: Confirmation dialogs before executing click actions
+- **Settings Integration**: Full integration with Cthulhu's preferences system
### Extending the Plugin
To add new OCR modes or features:
@@ -186,11 +278,30 @@ To add new OCR modes or features:
2. Create handler method following pattern `_ocrNewMode()`
3. Implement image capture logic for new mode
4. Use existing `_performOCR()` and `_presentOCRResult()` methods
+5. For interactive features, extend `_createOCRResultsWindow()` functionality
+
+## Version History
+
+### Version 2.0 (Enhanced Interactive Features)
+- **Interactive OCR Window**: `Cthulhu+Control+F` for coordinate mapping
+- **Precise Clicking**: Click any text found in OCR results
+- **Settings Integration**: Full GUI settings tab in Cthulhu preferences
+- **Clipboard Integration**: Automatic copying with toggle setting
+- **Dual View Modes**: Text view and coordinate table with Alt+V toggle
+- **Safety Features**: Click confirmation dialogs
+- **Enhanced Processing**: Coordinate extraction with quality metrics
+
+### Version 1.0 (Original Implementation)
+- Basic OCR for window, desktop, and clipboard
+- Text extraction and speech output
+- Multi-threading support
+- Text cleanup and formatting
## Credits
- **Original ocrdesktop**: Created by Chrys (chrys87@users.noreply.github.com)
- **Cthulhu Integration**: Adapted by Storm Dragon for Cthulhu plugin system
+- **Interactive Features**: Enhanced coordinate mapping and GUI integration
- **Cthulhu Screen Reader**: https://git.stormux.org/storm/cthulhu
- **Tesseract OCR**: https://github.com/tesseract-ocr/tesseract
diff --git a/src/cthulhu/plugins/OCR/plugin.py b/src/cthulhu/plugins/OCR/plugin.py
index 3db6c63..bc90ebd 100644
--- a/src/cthulhu/plugins/OCR/plugin.py
+++ b/src/cthulhu/plugins/OCR/plugin.py
@@ -22,6 +22,7 @@ from mimetypes import MimeTypes
from cthulhu.plugin import Plugin, cthulhu_hookimpl
from cthulhu import debug
+from cthulhu import settings_manager
# Note: Removed complex beep system - simple announcements work perfectly!
@@ -88,8 +89,12 @@ class OCRDesktop(Plugin):
self._kb_binding_window = None
self._kb_binding_desktop = None
self._kb_binding_clipboard = None
+ self._kb_binding_results_window = None
- # OCR settings
+ # Settings manager
+ self._settings_manager = settings_manager.getManager()
+
+ # OCR settings (will be loaded from settings)
self._languageCode = 'eng'
self._scaleFactor = 3
self._grayscaleImg = False
@@ -98,15 +103,24 @@ class OCRDesktop(Plugin):
self._blackWhiteImgValue = 200
self._colorCalculation = False
self._colorCalculationMax = 3
+ self._copyToClipboard = False
# Internal state
self._img = []
self._modifiedImg = []
self._OCRText = ''
+ self._OCRWords = {}
+ self._OCRWordList = []
self._offsetXpos = 0
self._offsetYpos = 0
self._activated = False
+ # OCR Results Window
+ self._results_window = None
+ self._results_tree = None
+ self._results_textview = None
+ self._current_view_mode = 0 # 0 = text, 1 = tree
+
# Progress feedback
self._is_processing = False
@@ -118,6 +132,9 @@ class OCRDesktop(Plugin):
# Set locale for tesseract
locale.setlocale(locale.LC_ALL, 'C')
+ # Load OCR settings from configuration
+ self._loadOCRSettings()
+
# Check dependencies
self._checkDependencies()
@@ -138,6 +155,23 @@ class OCRDesktop(Plugin):
return False
return True
+    def _loadOCRSettings(self):
+        """Load OCR settings from Cthulhu configuration.
+
+        Each value is read through self._settings_manager.getSetting(). A
+        falsy result falls back to the built-in default, with one exception:
+        the black/white threshold falls back to 200 only when the setting is
+        None, because 0 is a legitimate threshold and must not be replaced.
+
+        All values are read before any attribute is assigned, so an exception
+        while reading leaves the previously loaded settings untouched. Such an
+        exception is logged and not propagated.
+        """
+        try:
+            getSetting = self._settings_manager.getSetting
+            threshold = getSetting('ocrBlackWhiteImgValue')
+            loaded = {
+                '_languageCode': getSetting('ocrLanguageCode') or 'eng',
+                # A scale factor of 0 is unusable (it is later used as a
+                # divisor), so falling back on any falsy value is intended.
+                '_scaleFactor': getSetting('ocrScaleFactor') or 3,
+                '_grayscaleImg': getSetting('ocrGrayscaleImg') or False,
+                '_invertImg': getSetting('ocrInvertImg') or False,
+                '_blackWhiteImg': getSetting('ocrBlackWhiteImg') or False,
+                '_blackWhiteImgValue': 200 if threshold is None else threshold,
+                '_colorCalculation': getSetting('ocrColorCalculation') or False,
+                '_colorCalculationMax': getSetting('ocrColorCalculationMax') or 3,
+                '_copyToClipboard': getSetting('ocrCopyToClipboard') or False,
+            }
+        except Exception as e:
+            debug.printMessage(debug.LEVEL_INFO, f"OCR settings load error: {e}, using defaults", True)
+            return
+
+        # Apply only after every read succeeded.
+        for attribute, value in loaded.items():
+            setattr(self, attribute, value)
+
+        debug.printMessage(debug.LEVEL_INFO, f"OCR settings loaded: lang={self._languageCode}, scale={self._scaleFactor}, clipboard={self._copyToClipboard}", True)
+
@cthulhu_hookimpl
def activate(self, plugin=None):
"""Activate the plugin."""
@@ -179,6 +213,19 @@ class OCRDesktop(Plugin):
self._activated = False
debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin deactivated", True)
+    def refresh_settings(self):
+        """Re-read the OCR settings after the configuration has changed.
+
+        Delegates to _loadOCRSettings(); any exception is logged and not
+        propagated.
+        """
+        try:
+            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Refreshing settings", True)
+            self._loadOCRSettings()
+            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Settings refreshed successfully", True)
+        except Exception as error:
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error refreshing settings: {error}", True)
+
def _registerKeybindings(self):
"""Register plugin keybindings."""
try:
@@ -203,6 +250,13 @@ class OCRDesktop(Plugin):
'kb:cthulhu+control+shift+c'
)
+ # OCR results window
+ self._kb_binding_results_window = self.registerGestureByString(
+ self._showOCRResultsWindow,
+ "Show OCR results window for current window",
+ 'kb:cthulhu+control+f'
+ )
+
debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Keybindings registered", True)
except Exception as e:
@@ -409,18 +463,27 @@ class OCRDesktop(Plugin):
return modifiedImg
def _performOCR(self):
- """Perform OCR on captured images."""
+ """Perform OCR on captured images with coordinate data extraction."""
if not PYTESSERACT_AVAILABLE:
debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Tesseract not available", True)
return
debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Starting OCR", True)
self._OCRText = ''
+ self._OCRWords = {}
+ self._OCRWordList = []
for img in self._img:
modifiedImg = self._transformImg(img)
try:
- # Simple text extraction
+ # Extract coordinate data using image_to_data
+ from pytesseract import Output
+ OCRWords = pytesseract.image_to_data(modifiedImg, output_type=Output.DICT,
+ lang=self._languageCode, config='--psm 4')
+ self._appendToOCRWords(OCRWords)
+ self._processOCRWords(OCRWords, modifiedImg)
+
+ # Also extract simple text for speech output
text = pytesseract.image_to_string(modifiedImg, lang=self._languageCode, config='--psm 4')
self._OCRText += text + '\n'
except Exception as e:
@@ -433,31 +496,36 @@ class OCRDesktop(Plugin):
def _cleanOCRText(self):
"""Clean up OCR text output."""
# Remove multiple spaces
- regexSpace = re.compile('[^\S\r\n]{2,}')
+ regexSpace = re.compile(r'[^\S\r\n]{2,}')
self._OCRText = regexSpace.sub(' ', self._OCRText)
# Remove empty lines
- regexSpace = re.compile('\n\s*\n')
+ regexSpace = re.compile(r'\n\s*\n')
self._OCRText = regexSpace.sub('\n', self._OCRText)
# Remove trailing spaces
- regexSpace = re.compile('\s*\n')
+ regexSpace = re.compile(r'\s*\n')
self._OCRText = regexSpace.sub('\n', self._OCRText)
# Remove leading spaces
- regexSpace = re.compile('^\s')
+ regexSpace = re.compile(r'^\s')
self._OCRText = regexSpace.sub('', self._OCRText)
# Remove trailing newlines
self._OCRText = self._OCRText.strip()
def _presentOCRResult(self):
- """Present OCR result to user via speech."""
+ """Present OCR result to user via speech and optionally copy to clipboard."""
try:
if not self._OCRText.strip():
message = "No text found in OCR scan"
else:
message = f"OCR result: {self._OCRText}"
+
+ # Copy to clipboard if enabled
+ if self._copyToClipboard:
+ self._copyTextToClipboard(self._OCRText)
+ message += " (copied to clipboard)"
if self.app:
state = self.app.getDynamicApiManager().getAPI('CthulhuState')
@@ -467,4 +535,335 @@ class OCRDesktop(Plugin):
debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Presented result: {len(self._OCRText)} characters", True)
except Exception as e:
- debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error presenting result: {e}", True)
\ No newline at end of file
+ debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error presenting result: {e}", True)
+
+    def _appendToOCRWords(self, OCRWords):
+        """Merge one OCR result dictionary into self._OCRWords.
+
+        For a key not seen before, the value is stored; list values are
+        copied so that later merges never mutate the caller's dictionary.
+        For a key already present, list values are appended to the stored
+        list and non-list values are ignored.
+        """
+        for key, values in OCRWords.items():
+            if key not in self._OCRWords:
+                self._OCRWords[key] = list(values) if isinstance(values, list) else values
+            elif isinstance(values, list):
+                self._OCRWords[key].extend(values)
+
+    def _processOCRWords(self, OCRWords, img):
+        """Append one entry per recognised word to self._OCRWordList.
+
+        OCRWords is a dictionary of parallel lists (the code reads the keys
+        'level', 'text', 'height', 'width', 'left', 'top' and 'conf'). Boxes
+        whose text is empty or whitespace-only are skipped. Each entry is
+        [text, font size, color, 'text', center x, center y, confidence],
+        with the coordinates still in the OCR image's coordinate space.
+
+        Returns False when OCRWords contains no boxes at all, True otherwise.
+        """
+        if not OCRWords['level']:
+            return False
+
+        for i, text in enumerate(OCRWords['text']):
+            if not text or text.isspace():
+                continue
+
+            height = OCRWords['height'][i]
+            self._OCRWordList.append([
+                text,
+                # NOTE(review): the divisor 3 equals the default scale factor;
+                # presumably it should follow self._scaleFactor - confirm.
+                round(height / 3 * 0.78, 0),                            # Calculated fontsize
+                self._getColorString(OCRWords, i, img),                 # Color info
+                'text',                                                 # Object type
+                int(OCRWords['width'][i] / 2 + OCRWords['left'][i]),    # X coordinate (center)
+                int(height / 2 + OCRWords['top'][i]),                   # Y coordinate (center)
+                int(float(OCRWords['conf'][i])),                        # Confidence
+            ])
+
+        return True
+
+    def _getColorString(self, box, index, img):
+        """Return the color name for an OCR box; currently always 'unknown'.
+
+        The guard mirrors the conditions under which color analysis could
+        run (setting enabled, scipy and webcolors available), but the
+        analysis itself has not been ported from ocrdesktop, so every path
+        yields 'unknown'.
+        """
+        analysisUnavailable = (
+            not self._colorCalculation
+            or not SCIPY_AVAILABLE
+            or not WEBCOLORS_AVAILABLE
+        )
+        if analysisUnavailable:
+            return 'unknown'
+
+        # Placeholder until the ocrdesktop color analysis is implemented.
+        return 'unknown'
+
+ def _showOCRResultsWindow(self, script=None, inputEvent=None):
+ """Show OCR results window for current window with coordinate mapping.
+
+ Handler for the 'kb:cthulhu+control+f' gesture. The script and
+ inputEvent arguments are not used by the body.
+
+ Returns True when the request was handled or ignored because another
+ OCR run is in progress, and False when an exception was caught.
+ """
+ try:
+ debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: OCR results window requested", True)
+
+ # Only one OCR run at a time; a second request is dropped.
+ if self._is_processing:
+ debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Already processing, ignoring request", True)
+ return True
+
+ self._is_processing = True
+ self._announceOCRStart("window for interactive results")
+
+ # Capture, recognise and present; the busy flag is cleared in the
+ # finally clause whether or not these steps succeed.
+ try:
+ if self._screenShotWindow():
+ self._performOCR()
+ self._createOCRResultsWindow()
+ finally:
+ self._is_processing = False
+
+ return True
+ except Exception as e:
+ # Clear the busy flag here as well so a failure cannot block later requests.
+ self._is_processing = False
+ debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR results window: {e}", True)
+ return False
+
+    def _createOCRResultsWindow(self):
+        """Create and show the OCR results window with coordinate mapping.
+
+        The window stacks three children in a vertical box: the menu bar, a
+        scrolled read-only text view holding self._OCRText, and a scrolled
+        tree view built from self._OCRWordList. Only the text view is
+        visible at first. _toggleResultsView() depends on this child order.
+
+        A results window that is still open is destroyed first. Without
+        that, its destroy handler would later clear the references that
+        belong to the new window.
+
+        Does nothing except log when GTK is unavailable. Exceptions are
+        logged and not propagated.
+        """
+        if not GTK_AVAILABLE:
+            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for results window", True)
+            return
+
+        try:
+            # Close a previous results window before replacing the references.
+            if self._results_window is not None:
+                self._results_window.destroy()
+            self._current_view_mode = 0
+
+            # Create main window
+            self._results_window = Gtk.Window(title="OCR Results - Cthulhu")
+            self._results_window.set_default_size(800, 600)
+            self._results_window.set_modal(True)
+
+            # Main container (Gtk.Box replaces the deprecated Gtk.VBox)
+            vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL)
+
+            # Child 0: menu bar
+            menubar = self._createResultsMenuBar()
+            vbox.pack_start(menubar, False, False, 0)
+
+            # Child 1: read-only text view with the plain OCR text
+            scrolled_text = Gtk.ScrolledWindow()
+            self._results_textview = Gtk.TextView()
+            self._results_textview.set_editable(False)
+            text_buffer = self._results_textview.get_buffer()
+            text_buffer.set_text(self._OCRText)
+            scrolled_text.add(self._results_textview)
+
+            # Child 2: tree view with per-word coordinate data
+            scrolled_tree = Gtk.ScrolledWindow()
+            self._results_tree = self._createResultsTreeView()
+            scrolled_tree.add(self._results_tree)
+
+            vbox.pack_start(scrolled_text, True, True, 0)
+            vbox.pack_start(scrolled_tree, True, True, 0)
+
+            self._results_window.add(vbox)
+            self._results_window.connect("destroy", self._onResultsWindowDestroy)
+            self._results_window.connect("key-press-event", self._onResultsKeyPress)
+
+            # show_all() reveals every child, so the tree is hidden afterwards
+            # to start in text view.
+            self._results_window.show_all()
+            scrolled_tree.hide()
+
+            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Results window created", True)
+
+        except Exception as e:
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error creating results window: {e}", True)
+
+    def _createResultsMenuBar(self):
+        """Build the results window menu bar.
+
+        The bar holds a "View" menu with the view toggle and an "Actions"
+        menu with the click action, in that order.
+        """
+        # (top-level label, item label, "activate" callback)
+        entries = (
+            ("View", "Toggle View (Alt+V)", self._toggleResultsView),
+            ("Actions", "Click Selected (Enter)", self._clickSelectedText),
+        )
+
+        menubar = Gtk.MenuBar()
+        for menuLabel, itemLabel, callback in entries:
+            submenu = Gtk.Menu()
+            topItem = Gtk.MenuItem(label=menuLabel)
+            topItem.set_submenu(submenu)
+
+            actionItem = Gtk.MenuItem(label=itemLabel)
+            actionItem.connect("activate", callback)
+            submenu.append(actionItem)
+
+            menubar.append(topItem)
+
+        return menubar
+
+    def _createResultsTreeView(self):
+        """Build the tree view listing every OCR word with screen coordinates.
+
+        Model columns: 0 text, 1 font size as string, 2 font size as int,
+        3 color, 4 type, 5 screen x, 6 screen y, 7 confidence. Column 1 is
+        stored but not shown; the "Font Size" view column displays and
+        sorts on column 2.
+        """
+        store = Gtk.ListStore(str, str, int, str, str, int, int, int)
+
+        tree = Gtk.TreeView(model=store)
+        tree.set_search_column(0)
+
+        # (header title, model column shown and sorted on)
+        viewColumns = (
+            ("Text", 0),
+            ("Font Size", 2),
+            ("Color", 3),
+            ("Type", 4),
+            ("X Position", 5),
+            ("Y Position", 6),
+            ("Confidence", 7),
+        )
+        for title, modelIndex in viewColumns:
+            viewColumn = Gtk.TreeViewColumn(title, Gtk.CellRendererText(), text=modelIndex)
+            viewColumn.set_sort_column_id(modelIndex)
+            tree.append_column(viewColumn)
+
+        for text, fontSize, color, kind, centerX, centerY, confidence in self._OCRWordList:
+            # Undo the OCR upscaling and add the capture offset to get back
+            # to screen coordinates.
+            screenX = int(centerX / self._scaleFactor + self._offsetXpos)
+            screenY = int(centerY / self._scaleFactor + self._offsetYpos)
+            store.append([
+                text,
+                str(fontSize),
+                int(fontSize),
+                color,
+                kind,
+                screenX,
+                screenY,
+                confidence,
+            ])
+
+        tree.connect("row-activated", self._onTreeRowActivated)
+
+        return tree
+
+    def _toggleResultsView(self, widget):
+        """Switch the results window between the text view and the tree view.
+
+        Relies on the container's child order: index 1 is the scrolled text
+        view and index 2 the scrolled tree view (index 0 is the menu bar).
+        The newly shown view receives keyboard focus. Does nothing when no
+        results window exists.
+        """
+        if not self._results_window:
+            return
+
+        children = self._results_window.get_child().get_children()
+        scrolled_text, scrolled_tree = children[1], children[2]
+
+        if self._current_view_mode == 0:
+            # Text is showing: switch to the tree.
+            outgoing, incoming = scrolled_text, scrolled_tree
+            new_mode, focus_target = 1, self._results_tree
+        else:
+            # Tree is showing: switch back to the text.
+            outgoing, incoming = scrolled_tree, scrolled_text
+            new_mode, focus_target = 0, self._results_textview
+
+        outgoing.hide()
+        incoming.show()
+        self._current_view_mode = new_mode
+        focus_target.grab_focus()
+
+    def _onTreeRowActivated(self, tree, path, column):
+        """"row-activated" handler of the results tree: click the selected text."""
+        self._clickSelectedText(widget=None)
+
+    def _clickSelectedText(self, widget):
+        """Click at the screen coordinates of the row selected in the tree view.
+
+        Asks for confirmation first. On "Yes" the results window is hidden,
+        a left click ("b1c") is generated through AT-SPI at the stored
+        coordinates, and the window is destroyed. If anything fails the
+        error is logged and the window is shown again.
+
+        The widget argument is unused; it only exists so the method can
+        serve as a signal callback. Returns nothing.
+        """
+        if not self._results_tree:
+            return
+
+        selection = self._results_tree.get_selection()
+        if not selection:
+            return
+
+        model, tree_iter = selection.get_selected()
+        if not tree_iter:
+            return
+
+        # Model columns: 0 = text, 5 = screen x, 6 = screen y
+        x_coord = model.get_value(tree_iter, 5)
+        y_coord = model.get_value(tree_iter, 6)
+        text = model.get_value(tree_iter, 0)
+
+        # Keyword arguments: PyGObject deprecates the positional form.
+        dialog = Gtk.MessageDialog(
+            transient_for=self._results_window,
+            modal=True,
+            message_type=Gtk.MessageType.QUESTION,
+            buttons=Gtk.ButtonsType.YES_NO,
+            text=f"Click at coordinates ({x_coord}, {y_coord}) for text '{text}'?"
+        )
+
+        response = dialog.run()
+        dialog.destroy()
+
+        if response != Gtk.ResponseType.YES:
+            return
+
+        try:
+            # Hide window before clicking
+            self._results_window.hide()
+
+            # Process pending GTK events so the window really is off screen;
+            # the blocking sleep below would otherwise delay that and the
+            # click could land on the results window itself.
+            while Gtk.events_pending():
+                Gtk.main_iteration()
+
+            # Perform click using AT-SPI
+            import pyatspi
+            time.sleep(0.5)  # Brief delay
+            pyatspi.Registry.generateMouseEvent(x_coord, y_coord, "b1c")
+
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Clicked at ({x_coord}, {y_coord})", True)
+
+            # Destroy window after successful click
+            if self._results_window is not None:
+                self._results_window.destroy()
+            self._results_window = None
+
+        except Exception as e:
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error clicking: {e}", True)
+            # Show window again on error, if it still exists
+            if self._results_window is not None:
+                self._results_window.show()
+
+ def _onResultsKeyPress(self, widget, event):
+ """Handle key presses in results window.
+
+ Connected to the results window's "key-press-event" signal. Returns
+ True when the key was consumed here and False to let GTK continue
+ with its default handling.
+ """
+ keyval = event.keyval
+ state = event.state
+
+ # Alt+V to toggle view
+ # (MOD1_MASK is the modifier bit tested for Alt; both cases of V are accepted)
+ if (keyval == Gdk.KEY_v or keyval == Gdk.KEY_V) and (state & Gdk.ModifierType.MOD1_MASK):
+ self._toggleResultsView(None)
+ return True
+
+ # Enter to click selected
+ # (main and keypad Enter; the click is only triggered in tree view)
+ if keyval == Gdk.KEY_Return or keyval == Gdk.KEY_KP_Enter:
+ if self._current_view_mode == 1: # Tree view
+ self._clickSelectedText(None)
+ return True
+
+ # Escape to close
+ if keyval == Gdk.KEY_Escape:
+ self._results_window.destroy()
+ return True
+
+ return False
+
+    def _onResultsWindowDestroy(self, widget):
+        """"destroy" handler of the results window: drop all window state.
+
+        Clears the window, tree and text view references and resets the
+        view mode to text (0).
+        """
+        self._results_window = self._results_tree = self._results_textview = None
+        self._current_view_mode = 0
+        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Results window destroyed", True)
+
+    def _copyTextToClipboard(self, text):
+        """Place text on the system clipboard.
+
+        Returns True on success, False when GTK is unavailable or the
+        clipboard operation raised. Failures are logged, not propagated.
+        """
+        if not GTK_AVAILABLE:
+            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for clipboard", True)
+            return False
+
+        try:
+            target = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
+            target.set_text(text, -1)
+            target.store()
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Copied {len(text)} characters to clipboard", True)
+        except Exception as error:
+            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error copying to clipboard: {error}", True)
+            copied = False
+        else:
+            copied = True
+
+        return copied
\ No newline at end of file
diff --git a/src/cthulhu/settings.py b/src/cthulhu/settings.py
index c3c89aa..fd92e25 100644
--- a/src/cthulhu/settings.py
+++ b/src/cthulhu/settings.py
@@ -157,7 +157,16 @@ userCustomizableSettings = [
"aiConfirmationRequired",
"aiActionTimeout",
"aiScreenshotQuality",
- "aiMaxContextLength"
+ "aiMaxContextLength",
+ "ocrLanguageCode",
+ "ocrScaleFactor",
+ "ocrGrayscaleImg",
+ "ocrInvertImg",
+ "ocrBlackWhiteImg",
+ "ocrBlackWhiteImgValue",
+ "ocrColorCalculation",
+ "ocrColorCalculationMax",
+ "ocrCopyToClipboard"
]
GENERAL_KEYBOARD_LAYOUT_DESKTOP = 1
@@ -443,3 +452,14 @@ aiConfirmationRequired = True
aiActionTimeout = 30
aiScreenshotQuality = AI_SCREENSHOT_QUALITY_MEDIUM
aiMaxContextLength = 4000
+
+# OCR Plugin settings
+ocrLanguageCode = 'eng'
+ocrScaleFactor = 3
+ocrGrayscaleImg = False
+ocrInvertImg = False
+ocrBlackWhiteImg = False
+ocrBlackWhiteImgValue = 200
+ocrColorCalculation = False
+ocrColorCalculationMax = 3
+ocrCopyToClipboard = False