From 41c91ffc66810ada1d283a2451a701888ed86ca5 Mon Sep 17 00:00:00 2001 From: Storm Dragon Date: Thu, 21 Aug 2025 21:59:29 -0400 Subject: [PATCH] Initial work on ocr integration. Is broken currently. --- src/cthulhu/cthulhuVersion.py | 2 +- src/cthulhu/plugins/OCR/__init__.py | 23 ++ src/cthulhu/plugins/OCR/meson.build | 14 + src/cthulhu/plugins/OCR/plugin.info | 8 + src/cthulhu/plugins/OCR/plugin.py | 535 ++++++++++++++++++++++++++++ src/cthulhu/plugins/meson.build | 1 + src/cthulhu/settings.py | 2 +- 7 files changed, 583 insertions(+), 2 deletions(-) create mode 100644 src/cthulhu/plugins/OCR/__init__.py create mode 100644 src/cthulhu/plugins/OCR/meson.build create mode 100644 src/cthulhu/plugins/OCR/plugin.info create mode 100644 src/cthulhu/plugins/OCR/plugin.py diff --git a/src/cthulhu/cthulhuVersion.py b/src/cthulhu/cthulhuVersion.py index 86477e4..e493402 100644 --- a/src/cthulhu/cthulhuVersion.py +++ b/src/cthulhu/cthulhuVersion.py @@ -23,5 +23,5 @@ # Fork of Orca Screen Reader (GNOME) # Original source: https://gitlab.gnome.org/GNOME/orca -version = "2025.08.19" +version = "2025.08.21" codeName = "testing" diff --git a/src/cthulhu/plugins/OCR/__init__.py b/src/cthulhu/plugins/OCR/__init__.py new file mode 100644 index 0000000..55b30a0 --- /dev/null +++ b/src/cthulhu/plugins/OCR/__init__.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Stormux +# Copyright (c) 2022 Chrys (original ocrdesktop) +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., Franklin Street, Fifth Floor, +# Boston MA 02110-1301 USA. + +"""OCRDesktop plugin package.""" + +from .plugin import OCRDesktop \ No newline at end of file diff --git a/src/cthulhu/plugins/OCR/meson.build b/src/cthulhu/plugins/OCR/meson.build new file mode 100644 index 0000000..09fe8ec --- /dev/null +++ b/src/cthulhu/plugins/OCR/meson.build @@ -0,0 +1,14 @@ +ocrdesktop_python_sources = files([ + '__init__.py', + 'plugin.py' +]) + +python3.install_sources( + ocrdesktop_python_sources, + subdir: 'cthulhu/plugins/OCRDesktop' +) + +install_data( + 'plugin.info', + install_dir: python3.get_install_dir() / 'cthulhu' / 'plugins' / 'OCRDesktop' +) \ No newline at end of file diff --git a/src/cthulhu/plugins/OCR/plugin.info b/src/cthulhu/plugins/OCR/plugin.info new file mode 100644 index 0000000..4066b71 --- /dev/null +++ b/src/cthulhu/plugins/OCR/plugin.info @@ -0,0 +1,8 @@ +name = OCR Desktop +version = 4.0.0 +description = OCR accessibility tool for reading inaccessible windows and dialogs using Tesseract OCR +authors = Storm Dragon +website = https://github.com/chrys87/ocrdesktop +copyright = Copyright 2022 Chrys, Copyright 2025 Stormux +builtin = false +hidden = false \ No newline at end of file diff --git a/src/cthulhu/plugins/OCR/plugin.py b/src/cthulhu/plugins/OCR/plugin.py new file mode 100644 index 0000000..3279401 --- /dev/null +++ b/src/cthulhu/plugins/OCR/plugin.py @@ -0,0 +1,535 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Stormux +# Copyright (c) 2022 Chrys (original ocrdesktop) +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
"""OCRDesktop plugin for Cthulhu screen reader."""

import logging
import os
import sys
import locale
import time
import re
import tempfile
import threading
from mimetypes import MimeTypes

from cthulhu.plugin import Plugin, cthulhu_hookimpl
from cthulhu import debug

# Import Cthulhu's sound system
try:
    from cthulhu import sound
    from cthulhu.sound_generator import Tone
    SOUND_AVAILABLE = True
except ImportError:
    SOUND_AVAILABLE = False

# PIL
try:
    from PIL import Image
    from PIL import ImageOps
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False

# pytesseract
try:
    import pytesseract
    from pytesseract import Output
    PYTESSERACT_AVAILABLE = True
except ImportError:
    PYTESSERACT_AVAILABLE = False

# pdf2image
try:
    from pdf2image import convert_from_path
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False

# scipy
try:
    from scipy.spatial import KDTree
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False

# webcolors
try:
    from webcolors import CSS3_HEX_TO_NAMES
    from webcolors import hex_to_rgb
    WEBCOLORS_AVAILABLE = True
except ImportError:
    WEBCOLORS_AVAILABLE = False

# GTK/GDK/Wnck
try:
    import gi
    gi.require_version("Gtk", "3.0")
    gi.require_version("Gdk", "3.0")
    gi.require_version("Wnck", "3.0")
    from gi.repository import Gtk, Gdk, Wnck
    GTK_AVAILABLE = True
except (ImportError, ValueError):
    # gi.require_version() signals a missing namespace/version with
    # ValueError, not ImportError; catching both keeps this module importable
    # when e.g. the Wnck typelib is not installed.
    GTK_AVAILABLE = False

logger = logging.getLogger(__name__)

# Patterns used by OCRDesktop._cleanOCRText, compiled once.  Raw strings avoid
# the invalid "\S" / "\s" escape sequences of plain string literals.
_RE_SPACE_RUN = re.compile(r'[^\S\r\n]{2,}')   # 2+ whitespace chars other than CR/LF
_RE_EMPTY_LINES = re.compile(r'\n\s*\n')       # blank / whitespace-only lines
_RE_TRAILING_SPACE = re.compile(r'\s*\n')      # whitespace in front of a newline
_RE_LEADING_SPACE = re.compile(r'^\s')         # one whitespace char at text start


class OCRDesktop(Plugin):
    """OCR Desktop accessibility plugin for reading inaccessible windows.

    Captures the active window, the whole desktop, or an image from the
    clipboard, runs Tesseract (through pytesseract) on the capture and
    presents the recognized text through the active Cthulhu script.
    """

    def __init__(self, *args, **kwargs):
        """Initialize plugin state, the sound player and the dependency log."""
        super().__init__(*args, **kwargs)
        self._log("Plugin initialized")

        # Keybinding storage (values returned by registerGestureByString)
        self._kb_binding_window = None
        self._kb_binding_desktop = None
        self._kb_binding_clipboard = None

        # OCR settings
        self._languageCode = 'eng'        # passed to pytesseract as lang=
        self._scaleFactor = 3             # capture is enlarged by this factor
        self._grayscaleImg = False
        self._invertImg = False
        self._blackWhiteImg = False
        self._blackWhiteImgValue = 200    # threshold for the black/white filter
        self._colorCalculation = False
        self._colorCalculationMax = 3

        # Internal state
        self._img = []                    # captured PIL images awaiting OCR
        self._modifiedImg = []
        self._OCRText = ''
        self._offsetXpos = 0              # screen position of the captured window
        self._offsetYpos = 0
        self._activated = False

        # Progress feedback
        self._is_processing = False       # True while an OCR request is running
        self._beep_thread = None
        self._stop_beeping = False        # polled by _beepLoop
        self._player = None

        # Color analysis
        self._kdtDB = None
        self.colorNames = []
        self.colorCache = {}

        # Set locale for tesseract
        # NOTE(review): setlocale() changes the locale of the whole host
        # process, not just this plugin - confirm this is still required.
        locale.setlocale(locale.LC_ALL, 'C')

        # Initialize sound player for progress beeps
        if SOUND_AVAILABLE:
            try:
                self._player = sound.getPlayer()
                self._log("Sound player initialized")
            except Exception as e:
                self._log(f"Failed to initialize sound: {e}")

        # Check dependencies (only logs what is missing at this point)
        self._checkDependencies()

    @staticmethod
    def _log(message):
        """Write *message* to the Cthulhu debug log with the plugin prefix."""
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: {message}", True)

    def _checkDependencies(self):
        """Check if required dependencies are available.

        Returns:
            True when Pillow, pytesseract and GTK/GDK/Wnck were all imported,
            False (after logging the missing ones) otherwise.
        """
        required = (
            (PIL_AVAILABLE, "python3-pillow"),
            (PYTESSERACT_AVAILABLE, "python-pytesseract"),
            (GTK_AVAILABLE, "GTK3/GDK/Wnck"),
        )
        missing_deps = [name for available, name in required if not available]

        if missing_deps:
            self._log(f"Missing dependencies: {', '.join(missing_deps)}")
            return False
        return True

    @cthulhu_hookimpl
    def activate(self, plugin=None):
        """Activate the plugin and register its keybindings.

        Does nothing when the hook is addressed to another plugin or when the
        plugin is already active.  The plugin is only marked as activated once
        the dependencies are present and the keybindings were registered.
        """
        if plugin is not None and plugin is not self:
            return

        if self._activated:
            self._log("Already activated")
            return

        try:
            self._log("Plugin activation starting")

            if not self.app:
                self._log("ERROR - No app reference")
                return

            if not self._checkDependencies():
                self._log("Cannot activate - missing dependencies")
                return

            # Register keybindings; do not claim success if that failed.
            if not self._registerKeybindings():
                self._log("Cannot activate - keybinding registration failed")
                return

            self._activated = True
            self._log("Plugin activated successfully")

        except Exception as e:
            self._log(f"Error activating: {e}")
            import traceback
            self._log(f"{traceback.format_exc()}")

    @cthulhu_hookimpl
    def deactivate(self, plugin=None):
        """Deactivate the plugin.

        NOTE(review): the gestures registered in _registerKeybindings are not
        released here; confirm whether the plugin base class handles that.
        """
        if plugin is not None and plugin is not self:
            return

        self._activated = False
        self._log("Plugin deactivated")

    def _registerKeybindings(self):
        """Register plugin keybindings.

        Returns:
            True when all three gestures were registered, False when
            registerGestureByString raised (the error is logged).
        """
        bindings = (
            ("_kb_binding_window", self._ocrActiveWindow,
             "OCR read active window", 'kb:cthulhu+control+w'),
            ("_kb_binding_desktop", self._ocrDesktop,
             "OCR read entire desktop", 'kb:cthulhu+control+d'),
            ("_kb_binding_clipboard", self._ocrClipboard,
             "OCR read image from clipboard", 'kb:cthulhu+control+shift+c'),
        )

        try:
            for attribute, handler, description, gesture in bindings:
                binding = self.registerGestureByString(handler, description, gesture)
                setattr(self, attribute, binding)
        except Exception as e:
            self._log(f"Error registering keybindings: {e}")
            return False

        self._log("Keybindings registered")
        return True

    def _startProgressBeeps(self):
        """Start playing progress beeps during OCR processing.

        No beeps are started without a sound player or while a previous beep
        thread is still alive.
        """
        if not self._player:
            self._log("Cannot start beeps - no sound player")
            return

        if self._beep_thread and self._beep_thread.is_alive():
            self._log("Beeps already running")
            return

        self._stop_beeping = False
        self._beep_thread = threading.Thread(target=self._beepLoop, daemon=True)
        self._beep_thread.start()
        self._log("Started progress beeps")

    def _stopProgressBeeps(self):
        """Stop playing progress beeps and wait up to one second for the thread."""
        self._stop_beeping = True
        if self._beep_thread:
            self._beep_thread.join(timeout=1.0)
        self._log("Stopped progress beeps")

    def _beepLoop(self):
        """Loop that plays short system bell beeps every 0.5 seconds."""
        while not self._stop_beeping:
            try:
                # Just use the system bell.  Write the bare BEL character and
                # flush so it is emitted immediately and without the newline
                # that print() would add.
                sys.stdout.write("\a")
                sys.stdout.flush()
                self._log("System bell beep")
            except Exception as e:
                self._log(f"System bell error: {e}")
                break

            # Wait 0.5 seconds before next beep
            for _ in range(50):  # Check every 0.01 seconds for quick stopping
                if self._stop_beeping:
                    return
                time.sleep(0.01)

    def _presentMessage(self, message):
        """Present *message* through the active script, if there is one."""
        if not self.app:
            return
        state = self.app.getDynamicApiManager().getAPI('CthulhuState')
        if state and state.activeScript:
            state.activeScript.presentMessage(message, resetStyles=False)

    def _announceOCRStart(self, ocr_type):
        """Announce the start of OCR operation."""
        try:
            message = f"Performing OCR on {ocr_type}"
            self._presentMessage(message)
            self._log(message)
        except Exception as e:
            self._log(f"Error announcing OCR start: {e}")

    def _runOCR(self, ocrType, requestName, capture):
        """Shared body of the three OCR gesture handlers.

        Args:
            ocrType: word used in the spoken announcement and the error log
                ("window", "desktop" or "clipboard").
            requestName: word used in the "requested" log line.
            capture: callable that fills self._img and returns True on success.

        Returns:
            True when the request was handled or ignored because another one
            is still running, False when an exception occurred.
        """
        try:
            self._log(f"OCR {requestName} requested")

            if self._is_processing:
                self._log("Already processing, ignoring request")
                return True

            self._is_processing = True
            try:
                self._announceOCRStart(ocrType)
                self._startProgressBeeps()
                if capture():
                    self._performOCR()
                    self._presentOCRResult()
            finally:
                # Single cleanup point for both the success and error paths.
                self._stopProgressBeeps()
                self._is_processing = False

            return True
        except Exception as e:
            self._log(f"Error in OCR {ocrType}: {e}")
            return False

    def _ocrActiveWindow(self, script=None, inputEvent=None):
        """OCR the active window."""
        return self._runOCR("window", "active window", self._screenShotWindow)

    def _ocrDesktop(self, script=None, inputEvent=None):
        """OCR the entire desktop."""
        return self._runOCR("desktop", "desktop", self._screenShotDesktop)

    def _ocrClipboard(self, script=None, inputEvent=None):
        """OCR image from clipboard."""
        return self._runOCR("clipboard", "clipboard", self._readClipboard)

    def _screenShotWindow(self):
        """Take screenshot of active window.

        Stores the capture in self._img and the window origin in
        self._offsetXpos / self._offsetYpos.  Returns True on success.
        """
        if not GTK_AVAILABLE:
            self._log("GTK not available for screenshots")
            return False

        try:
            time.sleep(0.3)  # Brief delay
            gdkCurrDesktop = Gdk.get_default_root_window()
            currWnckScreen = Wnck.Screen.get_default()
            if gdkCurrDesktop is None or currWnckScreen is None:
                # Both getters may return None when no usable screen exists.
                self._log("No default screen available for window screenshot")
                return False

            currWnckScreen.force_update()
            currWnckWindow = currWnckScreen.get_active_window()

            if not currWnckWindow:
                self._log("No active window found")
                return False

            self._offsetXpos, self._offsetYpos, wnckWidth, wnckHeight = \
                currWnckWindow.get_geometry()
            pixBuff = Gdk.pixbuf_get_from_window(
                gdkCurrDesktop, self._offsetXpos, self._offsetYpos,
                wnckWidth, wnckHeight)

            if not pixBuff:
                self._log("Failed to capture window screenshot")
                return False

            self._img = [self._pixbuf2image(pixBuff)]
            self._log("Window screenshot captured")
            return True

        except Exception as e:
            self._log(f"Error taking window screenshot: {e}")
            return False

    def _screenShotDesktop(self):
        """Take screenshot of entire desktop and store it in self._img."""
        if not GTK_AVAILABLE:
            self._log("GTK not available for screenshots")
            return False

        try:
            time.sleep(0.3)  # Brief delay
            currDesktop = Gdk.get_default_root_window()
            if currDesktop is None:
                self._log("No default screen available for desktop screenshot")
                return False

            pixBuff = Gdk.pixbuf_get_from_window(
                currDesktop, 0, 0,
                currDesktop.get_width(), currDesktop.get_height())

            if not pixBuff:
                self._log("Failed to capture desktop screenshot")
                return False

            self._img = [self._pixbuf2image(pixBuff)]
            self._log("Desktop screenshot captured")
            return True

        except Exception as e:
            self._log(f"Error taking desktop screenshot: {e}")
            return False

    def _readClipboard(self):
        """Read image from clipboard and store it in self._img."""
        if not GTK_AVAILABLE:
            self._log("GTK not available for clipboard")
            return False

        try:
            clipboardObj = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
            pixBuff = clipboardObj.wait_for_image()

            if not pixBuff:
                self._log("No image found in clipboard")
                return False

            self._img = [self._pixbuf2image(pixBuff)]
            self._log("Image read from clipboard")
            return True

        except Exception as e:
            self._log(f"Error reading clipboard: {e}")
            return False

    def _pixbuf2image(self, pix):
        """Convert GdkPixbuf to PIL Image.

        The raw pixel bytes are decoded as RGBA when the pixbuf has an alpha
        channel and as RGB otherwise, using the pixbuf's rowstride.
        """
        data = pix.get_pixels()
        w = pix.props.width
        h = pix.props.height
        stride = pix.props.rowstride
        mode = "RGBA" if pix.props.has_alpha else "RGB"
        return Image.frombytes(mode, (w, h), data, "raw", mode, stride)

    def _scaleImg(self, img):
        """Scale image by self._scaleFactor for better OCR results."""
        width_screen, height_screen = img.size
        newSize = (width_screen * self._scaleFactor,
                   height_screen * self._scaleFactor)
        # Image.Resampling only exists in newer Pillow releases; older ones
        # expose BICUBIC directly on the Image module.
        resampling = getattr(Image, "Resampling", Image)
        return img.resize(newSize, resampling.BICUBIC)

    def _transformImg(self, img):
        """Transform image with various filters for better OCR.

        The image is always scaled; inversion, grayscale conversion and
        black/white thresholding are applied according to the OCR settings.
        """
        modifiedImg = self._scaleImg(img)

        if self._invertImg:
            # ImageOps.invert rejects images with an alpha channel (possible
            # for clipboard images), so drop it first.
            if modifiedImg.mode not in ("L", "RGB"):
                modifiedImg = modifiedImg.convert("RGB")
            modifiedImg = ImageOps.invert(modifiedImg)
        if self._grayscaleImg:
            modifiedImg = ImageOps.grayscale(modifiedImg)
        if self._blackWhiteImg:
            # The lookup table below has 256 entries, i.e. one band, so the
            # image has to be single-band before it is thresholded.
            if modifiedImg.mode != "L":
                modifiedImg = ImageOps.grayscale(modifiedImg)
            lut = [255 if v > self._blackWhiteImgValue else 0 for v in range(256)]
            modifiedImg = modifiedImg.point(lut)

        return modifiedImg

    def _performOCR(self):
        """Perform OCR on captured images and store the text in self._OCRText."""
        # Reset first so an early return can never leave a previous result
        # behind for _presentOCRResult to announce.
        self._OCRText = ''

        if not PYTESSERACT_AVAILABLE:
            self._log("Tesseract not available")
            return

        self._log("Starting OCR")
        pieces = []

        for img in self._img:
            modifiedImg = self._transformImg(img)
            try:
                # Simple text extraction
                text = pytesseract.image_to_string(
                    modifiedImg, lang=self._languageCode, config='--psm 4')
            except Exception as e:
                self._log(f"OCR error: {e}")
            else:
                pieces.append(text + '\n')

        self._OCRText = ''.join(pieces)

        # Clean up text
        self._cleanOCRText()
        self._log("OCR completed")

    def _cleanOCRText(self):
        """Clean up OCR text output (self._OCRText is rewritten in place)."""
        text = self._OCRText
        text = _RE_SPACE_RUN.sub(' ', text)        # Remove multiple spaces
        text = _RE_EMPTY_LINES.sub('\n', text)     # Remove empty lines
        text = _RE_TRAILING_SPACE.sub('\n', text)  # Remove trailing spaces
        text = _RE_LEADING_SPACE.sub('', text)     # Remove leading spaces
        # Remove leading/trailing whitespace including newlines
        self._OCRText = text.strip()

    def _presentOCRResult(self):
        """Present OCR result to user via speech."""
        try:
            if not self._OCRText.strip():
                message = "No text found in OCR scan"
            else:
                message = f"OCR result: {self._OCRText}"

            self._presentMessage(message)

            self._log(f"Presented result: {len(self._OCRText)} characters")

        except Exception as e:
            self._log(f"Error presenting result: {e}")


# ---------------------------------------------------------------------------
# Remainder of the patch text that shared these lines with plugin.py.  It is
# not Python, so it is preserved verbatim as a comment:
#
# \ No newline at end of file
# diff --git a/src/cthulhu/plugins/meson.build b/src/cthulhu/plugins/meson.build
# index 3389098..7f60ae3 100644
# --- a/src/cthulhu/plugins/meson.build
# +++ b/src/cthulhu/plugins/meson.build
# @@ -5,6 +5,7 @@ subdir('Clipboard')
#  subdir('DisplayVersion')
#  subdir('HelloCthulhu')
#  subdir('IndentationAudio')
# +subdir('OCRDesktop')
#  subdir('PluginManager')
#  subdir('SimplePluginSystem')
#  subdir('hello_world')
#
# NOTE(review): this patch creates its files under src/cthulhu/plugins/OCR/,
# but the hunk above adds subdir('OCRDesktop'); one of the two names has to
# change for the build to find the new directory.
#
# diff --git
a/src/cthulhu/settings.py b/src/cthulhu/settings.py index 1a8ea72..51ef075 100644 --- a/src/cthulhu/settings.py +++ b/src/cthulhu/settings.py @@ -431,7 +431,7 @@ presentChatRoomLast = False presentLiveRegionFromInactiveTab = False # Plugins -activePlugins = ['AIAssistant', 'DisplayVersion', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] +activePlugins = ['AIAssistant', 'DisplayVersion', 'OCRDesktop', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] # AI Assistant settings (disabled by default for opt-in behavior) aiAssistantEnabled = True