Initial work on OCR integration. Currently broken.

2025-08-21 21:59:29 -04:00
parent 03f13140fe
commit 41c91ffc66
7 changed files with 583 additions and 2 deletions
--- a/src/cthulhu/cthulhuVersion.py
+++ b/src/cthulhu/cthulhuVersion.py
@@ -23,5 +23,5 @@
 # Fork of Orca Screen Reader (GNOME)
 # Original source: https://gitlab.gnome.org/GNOME/orca
-version = "2025.08.19"
+version = "2025.08.21"
 codeName = "testing"
--- a/src/cthulhu/plugins/OCR/init.py
+++ b/src/cthulhu/plugins/OCR/init.py
@@ -0,0 +1,23 @@
 #!/usr/bin/env python3
 #
 # Copyright (c) 2025 Stormux
 # Copyright (c) 2022 Chrys (original ocrdesktop)
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 #
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the
 # Free Software Foundation, Inc., Franklin Street, Fifth Floor,
 # Boston MA  02110-1301 USA.
 """OCRDesktop plugin package."""
 from .plugin import OCRDesktop
--- a/src/cthulhu/plugins/OCR/meson.build
+++ b/src/cthulhu/plugins/OCR/meson.build
@@ -0,0 +1,14 @@
 # Python modules that make up the OCRDesktop plugin package.
 ocrdesktop_python_sources = files([
  '__init__.py',
  'plugin.py'
 ])
 # Install the modules under the 'cthulhu/plugins/OCRDesktop' subdir.
 python3.install_sources(
  ocrdesktop_python_sources,
  subdir: 'cthulhu/plugins/OCRDesktop'
 )
 # Install the plugin metadata file into the same 'OCRDesktop' plugin
 # directory as the modules above.
 install_data(
  'plugin.info',
  install_dir: python3.get_install_dir() / 'cthulhu' / 'plugins' / 'OCRDesktop'
 )
--- a/src/cthulhu/plugins/OCR/plugin.info
+++ b/src/cthulhu/plugins/OCR/plugin.info
@@ -0,0 +1,8 @@
 name = OCR Desktop
 version = 4.0.0
 description = OCR accessibility tool for reading inaccessible windows and dialogs using Tesseract OCR
 authors = Storm Dragon <storm_dragon@stormux.org>
 website = https://github.com/chrys87/ocrdesktop
 copyright = Copyright 2022 Chrys, Copyright 2025 Stormux
 builtin = false
 hidden = false
--- a/src/cthulhu/plugins/OCR/plugin.py
+++ b/src/cthulhu/plugins/OCR/plugin.py
@@ -0,0 +1,535 @@
 #!/usr/bin/env python3
 #
 # Copyright (c) 2025 Stormux
 # Copyright (c) 2022 Chrys (original ocrdesktop)
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 """OCRDesktop plugin for Cthulhu screen reader."""
 import logging
 import os
 import sys
 import locale
 import time
 import re
 import tempfile
 import threading
 from mimetypes import MimeTypes
 from cthulhu.plugin import Plugin, cthulhu_hookimpl
 from cthulhu import debug
 # Import Cthulhu's sound system
 try:
    from cthulhu import sound
    from cthulhu.sound_generator import Tone
    SOUND_AVAILABLE = True
 except ImportError:
    SOUND_AVAILABLE = False
 # PIL
 try:
    from PIL import Image
    from PIL import ImageOps
    PIL_AVAILABLE = True
 except ImportError:
    PIL_AVAILABLE = False
 # pytesseract
 try:
    import pytesseract
    from pytesseract import Output
    PYTESSERACT_AVAILABLE = True
 except ImportError:
    PYTESSERACT_AVAILABLE = False
 # pdf2image
 try:
    from pdf2image import convert_from_path
    PDF2IMAGE_AVAILABLE = True
 except ImportError:
    PDF2IMAGE_AVAILABLE = False
 # scipy
 try:
    from scipy.spatial import KDTree
    SCIPY_AVAILABLE = True
 except ImportError:
    SCIPY_AVAILABLE = False
 # webcolors
 try:
    from webcolors import CSS3_HEX_TO_NAMES
    from webcolors import hex_to_rgb
    WEBCOLORS_AVAILABLE = True
 except ImportError:
    WEBCOLORS_AVAILABLE = False
 # GTK/GDK/Wnck
 # gi.require_version() raises ValueError (not ImportError) when the requested
 # namespace or version is not installed, so both exception types must be
 # treated as "GTK stack unavailable" to keep this module importable.
 try:
    import gi
    gi.require_version("Gtk", "3.0")
    gi.require_version("Gdk", "3.0")
    gi.require_version("Wnck", "3.0")
    from gi.repository import Gtk, Gdk, Wnck
    GTK_AVAILABLE = True
 except (ImportError, ValueError):
    GTK_AVAILABLE = False
 logger = logging.getLogger(__name__)
 class OCRDesktop(Plugin):
    """OCR Desktop accessibility plugin for reading inaccessible windows."""
    def __init__(self, *args, **kwargs):
        """Create the plugin with default OCR settings and empty runtime state.

        All positional and keyword arguments are forwarded unchanged to the
        base class constructor.
        """
        super().__init__(*args, **kwargs)
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin initialized", True)

        # Values returned by registerGestureByString(), one per gesture.
        self._kb_binding_window = None
        self._kb_binding_desktop = None
        self._kb_binding_clipboard = None

        # Recognition language and image pre-processing options.
        self._languageCode = 'eng'
        self._scaleFactor = 3
        self._invertImg = False
        self._grayscaleImg = False
        self._blackWhiteImg = False
        self._blackWhiteImgValue = 200
        self._colorCalculation = False
        self._colorCalculationMax = 3

        # Captured images, recognised text and capture offsets.
        self._img = []
        self._modifiedImg = []
        self._OCRText = ''
        self._offsetXpos = 0
        self._offsetYpos = 0
        self._activated = False

        # Progress feedback while a scan is running.
        self._is_processing = False
        self._stop_beeping = False
        self._beep_thread = None
        self._player = None

        # Color analysis caches.
        self._kdtDB = None
        self.colorNames = []
        self.colorCache = {}

        # Switch every locale category to "C" (done for tesseract).
        locale.setlocale(locale.LC_ALL, 'C')

        # The sound player is optional; a failure here is only logged.
        if SOUND_AVAILABLE:
            try:
                self._player = sound.getPlayer()
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Sound player initialized", True)
            except Exception as err:
                debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Failed to initialize sound: {err}", True)

        # Logs any missing dependency; the result is not used here.
        self._checkDependencies()
    def _checkDependencies(self):
        """Report whether PIL, pytesseract and the GTK stack were all imported.

        Returns:
            True when nothing is missing; otherwise False after logging the
            names of the missing packages.
        """
        required = (
            (PIL_AVAILABLE, "python3-pillow"),
            (PYTESSERACT_AVAILABLE, "python-pytesseract"),
            (GTK_AVAILABLE, "GTK3/GDK/Wnck"),
        )
        absent = [package for available, package in required if not available]
        if not absent:
            return True
        debug.printMessage(debug.LEVEL_INFO,
            f"OCRDesktop: Missing dependencies: {', '.join(absent)}", True)
        return False
    @cthulhu_hookimpl
    def activate(self, plugin=None):
        """Register the OCR gestures and mark the plugin as active.

        Calls addressed to another plugin instance are ignored, as are
        repeated activations. Activation is abandoned (and logged) when there
        is no app reference or a required dependency is missing. Exceptions
        are logged together with their traceback instead of being raised.
        """
        if not (plugin is None or plugin is self):
            return
        if self._activated:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Already activated", True)
            return
        try:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin activation starting", True)
            blocker = None
            if not self.app:
                blocker = "OCRDesktop: ERROR - No app reference"
            elif not self._checkDependencies():
                blocker = "OCRDesktop: Cannot activate - missing dependencies"
            if blocker is not None:
                debug.printMessage(debug.LEVEL_INFO, blocker, True)
                return
            self._registerKeybindings()
            self._activated = True
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin activated successfully", True)
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error activating: {err}", True)
            import traceback
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: {traceback.format_exc()}", True)
    @cthulhu_hookimpl
    def deactivate(self, plugin=None):
        """Mark the plugin as inactive; calls meant for another plugin are ignored."""
        if plugin is None or plugin is self:
            self._activated = False
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin deactivated", True)
    def _registerKeybindings(self):
        """Register the window, desktop and clipboard OCR gestures.

        Each value returned by registerGestureByString() is stored on the
        matching ``_kb_binding_*`` attribute. A failure is logged and stops
        the registration of the remaining gestures.
        """
        try:
            gestures = (
                # (attribute, handler, description, gesture string)
                ("_kb_binding_window", self._ocrActiveWindow,
                 "OCR read active window", 'kb:cthulhu+control+w'),
                ("_kb_binding_desktop", self._ocrDesktop,
                 "OCR read entire desktop", 'kb:cthulhu+control+d'),
                ("_kb_binding_clipboard", self._ocrClipboard,
                 "OCR read image from clipboard", 'kb:cthulhu+control+shift+c'),
            )
            for attribute, handler, description, gesture in gestures:
                binding = self.registerGestureByString(handler, description, gesture)
                setattr(self, attribute, binding)
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Keybindings registered", True)
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error registering keybindings: {err}", True)
    def _startProgressBeeps(self):
        """Launch the daemon thread that beeps while OCR is running.

        Nothing is started when no sound player was obtained or when a beep
        thread is still alive.
        """
        if not self._player:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Cannot start beeps - no sound player", True)
            return
        running = self._beep_thread
        if running and running.is_alive():
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Beeps already running", True)
            return
        # Clear the stop flag before the worker starts polling it.
        self._stop_beeping = False
        worker = threading.Thread(target=self._beepLoop, daemon=True)
        self._beep_thread = worker
        worker.start()
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Started progress beeps", True)
    def _stopProgressBeeps(self):
        """Ask the beep thread to stop and wait up to one second for it to finish."""
        self._stop_beeping = True
        worker = self._beep_thread
        if worker:
            worker.join(timeout=1.0)
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Stopped progress beeps", True)
    def _beepLoop(self):
        """Thread body: emit a terminal bell, pause about 0.5 s, repeat until stopped.

        The pause is split into 50 naps of 0.01 s so that a stop request is
        noticed quickly. Any error while beeping ends the loop.
        """
        naps_per_pause = 50
        nap_seconds = 0.01
        while not self._stop_beeping:
            try:
                # Terminal bell character written to stdout.
                print("\a")
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: System bell beep", True)
            except Exception as err:
                debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: System bell error: {err}", True)
                return
            naps_left = naps_per_pause
            while naps_left:
                if self._stop_beeping:
                    return
                time.sleep(nap_seconds)
                naps_left -= 1
    def _announceOCRStart(self, ocr_type):
        """Present and log a "Performing OCR on <ocr_type>" message.

        The message goes to the active script, when one can be reached through
        the app's 'CthulhuState' API, and is always written to the debug log.
        Errors are logged rather than raised.
        """
        try:
            message = f"Performing OCR on {ocr_type}"
            app = self.app
            state = app.getDynamicApiManager().getAPI('CthulhuState') if app else None
            if state and state.activeScript:
                state.activeScript.presentMessage(message, resetStyles=False)
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: {message}", True)
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error announcing OCR start: {err}", True)
    def _ocrActiveWindow(self, script=None, inputEvent=None):
        """Gesture handler: OCR the active window and present the text.

        Args:
            script: Passed by the gesture dispatcher; unused.
            inputEvent: Passed by the gesture dispatcher; unused.

        Returns:
            True when the request was handled (or ignored because a scan is
            already running), False when an error occurred.
        """
        return self._runOCRJob("active window", "window", self._screenShotWindow)

    def _ocrDesktop(self, script=None, inputEvent=None):
        """Gesture handler: OCR the entire desktop and present the text.

        Args:
            script: Passed by the gesture dispatcher; unused.
            inputEvent: Passed by the gesture dispatcher; unused.

        Returns:
            True when the request was handled (or ignored because a scan is
            already running), False when an error occurred.
        """
        return self._runOCRJob("desktop", "desktop", self._screenShotDesktop)

    def _ocrClipboard(self, script=None, inputEvent=None):
        """Gesture handler: OCR the image on the clipboard and present the text.

        Args:
            script: Passed by the gesture dispatcher; unused.
            inputEvent: Passed by the gesture dispatcher; unused.

        Returns:
            True when the request was handled (or ignored because a scan is
            already running), False when an error occurred.
        """
        return self._runOCRJob("clipboard", "clipboard", self._readClipboard)

    def _runOCRJob(self, requestLabel, ocrType, capture):
        """Run one capture -> OCR -> present cycle for a gesture handler.

        Args:
            requestLabel: Text used in the "OCR ... requested" log line.
            ocrType: Text used in the spoken start announcement and in the
                error log line.
            capture: Zero-argument callable that fills ``self._img`` and
                returns a truthy value on success.

        Returns:
            True when the cycle completed or was skipped because another scan
            is in progress, False when an exception was caught.
        """
        # Only the call that set the busy flag may clear it again; a request
        # that was rejected (or failed before claiming the flag) must not
        # reset the state of a scan that is still running.
        ownsJob = False
        try:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: OCR {requestLabel} requested", True)
            if self._is_processing:
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Already processing, ignoring request", True)
                return True
            self._is_processing = True
            ownsJob = True
            self._announceOCRStart(ocrType)
            self._startProgressBeeps()
            if capture():
                self._performOCR()
                self._presentOCRResult()
            return True
        except Exception as e:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR {ocrType}: {e}", True)
            return False
        finally:
            # Single cleanup point for both the success and the error path.
            if ownsJob:
                self._stopProgressBeeps()
                self._is_processing = False
    def _screenShotWindow(self):
        """Capture the active window into ``self._img``.

        The window geometry is read from Wnck, its x/y position is stored in
        ``_offsetXpos`` / ``_offsetYpos`` and that rectangle is grabbed from
        the default root window.

        Returns:
            True when an image was captured, False otherwise (GTK missing, no
            active window, empty capture, or an exception, all of them logged).
        """
        if not GTK_AVAILABLE:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for screenshots", True)
            return False
        try:
            time.sleep(0.3)  # Brief delay
            rootWindow = Gdk.get_default_root_window()
            screen = Wnck.Screen.get_default()
            screen.force_update()
            activeWindow = screen.get_active_window()
            if not activeWindow:
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: No active window found", True)
                return False
            xpos, ypos, width, height = activeWindow.get_geometry()
            self._offsetXpos, self._offsetYpos = xpos, ypos
            capture = Gdk.pixbuf_get_from_window(rootWindow, xpos, ypos, width, height)
            if not capture:
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Failed to capture window screenshot", True)
                return False
            self._img = [self._pixbuf2image(capture)]
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Window screenshot captured", True)
            return True
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error taking window screenshot: {err}", True)
            return False
    def _screenShotDesktop(self):
        """Capture the whole default root window into ``self._img``.

        Returns:
            True when an image was captured, False otherwise (GTK missing,
            empty capture, or an exception, all of them logged).
        """
        if not GTK_AVAILABLE:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for screenshots", True)
            return False
        try:
            time.sleep(0.3)  # Brief delay
            rootWindow = Gdk.get_default_root_window()
            width = rootWindow.get_width()
            height = rootWindow.get_height()
            capture = Gdk.pixbuf_get_from_window(rootWindow, 0, 0, width, height)
            if not capture:
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Failed to capture desktop screenshot", True)
                return False
            self._img = [self._pixbuf2image(capture)]
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Desktop screenshot captured", True)
            return True
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error taking desktop screenshot: {err}", True)
            return False
    def _readClipboard(self):
        """Load the image currently on the clipboard into ``self._img``.

        Returns:
            True when the clipboard held an image, False otherwise (GTK
            missing, no image, or an exception, all of them logged).
        """
        if not GTK_AVAILABLE:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for clipboard", True)
            return False
        try:
            clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
            picture = clipboard.wait_for_image()
            if not picture:
                debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: No image found in clipboard", True)
                return False
            self._img = [self._pixbuf2image(picture)]
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Image read from clipboard", True)
            return True
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error reading clipboard: {err}", True)
            return False
    def _pixbuf2image(self, pix):
        """Build a PIL image from the raw pixel data of a GdkPixbuf.

        The image mode is "RGBA" when the pixbuf reports an alpha channel and
        "RGB" otherwise; the pixbuf's rowstride is handed to the raw decoder.
        """
        props = pix.props
        mode = "RGBA" if props.has_alpha else "RGB"
        size = (props.width, props.height)
        return Image.frombytes(mode, size, pix.get_pixels(), "raw", mode, props.rowstride)
    def _scaleImg(self, img):
        """Return a copy of *img* enlarged by ``self._scaleFactor`` (bicubic).

        Args:
            img: PIL image to enlarge.

        Returns:
            The resized PIL image.
        """
        width, height = img.size
        # resize() needs integer pixel sizes of at least 1, even if the scale
        # factor is ever configured as a fraction.
        target = (
            max(1, int(width * self._scaleFactor)),
            max(1, int(height * self._scaleFactor)),
        )
        # Pillow >= 9.1 groups the filters in Image.Resampling; older releases
        # only provide the module-level constant.
        bicubic = getattr(Image, "Resampling", Image).BICUBIC
        return img.resize(target, bicubic)
    def _transformImg(self, img):
        """Scale *img* and apply the enabled pre-processing filters.

        Filters run in this order: invert, grayscale, black/white threshold
        (pixels above ``self._blackWhiteImgValue`` become 255, others 0).

        Args:
            img: PIL image as captured.

        Returns:
            The transformed PIL image that is handed to tesseract.
        """
        modifiedImg = self._scaleImg(img)
        if self._invertImg:
            # ImageOps.invert() does not accept images with an alpha channel
            # (e.g. RGBA from the clipboard); alpha is irrelevant for OCR.
            if modifiedImg.mode not in ("1", "L", "RGB"):
                modifiedImg = modifiedImg.convert("RGB")
            modifiedImg = ImageOps.invert(modifiedImg)
        if self._grayscaleImg:
            modifiedImg = ImageOps.grayscale(modifiedImg)
        if self._blackWhiteImg:
            threshold = self._blackWhiteImgValue
            lut = [255 if v > threshold else 0 for v in range(256)]
            # point() expects 256 table entries per band, so repeat the table
            # for multi-band images that were not converted to grayscale.
            modifiedImg = modifiedImg.point(lut * len(modifiedImg.getbands()))
        return modifiedImg
    def _performOCR(self):
        """Run tesseract on every captured image and store the cleaned text.

        The result is written to ``self._OCRText``. An image whose recognition
        fails is logged and skipped.
        """
        # Reset before any early exit so the caller can never present the
        # text left over from a previous scan.
        self._OCRText = ''
        if not PYTESSERACT_AVAILABLE:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Tesseract not available", True)
            return
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Starting OCR", True)
        pages = []
        for img in self._img:
            modifiedImg = self._transformImg(img)
            try:
                # Simple text extraction
                text = pytesseract.image_to_string(modifiedImg, lang=self._languageCode, config='--psm 4')
            except Exception as e:
                debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: OCR error: {e}", True)
                continue
            pages.append(text + '\n')
        self._OCRText = ''.join(pages)
        # Clean up text
        self._cleanOCRText()
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: OCR completed", True)
    def _cleanOCRText(self):
        """Clean up OCR text output."""
        # Remove multiple spaces
        regexSpace = re.compile('[^\S\r\n]{2,}')
        self._OCRText = regexSpace.sub(' ', self._OCRText)
        # Remove empty lines
        regexSpace = re.compile('\n\s*\n')
        self._OCRText = regexSpace.sub('\n', self._OCRText)
        # Remove trailing spaces
        regexSpace = re.compile('\s*\n')
        self._OCRText = regexSpace.sub('\n', self._OCRText)
        # Remove leading spaces
        regexSpace = re.compile('^\s')
        self._OCRText = regexSpace.sub('', self._OCRText)
        # Remove trailing newlines
        self._OCRText = self._OCRText.strip()
    def _presentOCRResult(self):
        """Present the recognised text, or a "nothing found" notice, to the user.

        The message goes to the active script when one can be reached through
        the app's 'CthulhuState' API. Errors are logged rather than raised.
        """
        try:
            recognised = self._OCRText
            if recognised.strip():
                message = f"OCR result: {recognised}"
            else:
                message = "No text found in OCR scan"
            app = self.app
            state = app.getDynamicApiManager().getAPI('CthulhuState') if app else None
            if state and state.activeScript:
                state.activeScript.presentMessage(message, resetStyles=False)
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Presented result: {len(self._OCRText)} characters", True)
        except Exception as err:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error presenting result: {err}", True)
--- a/src/cthulhu/plugins/meson.build
+++ b/src/cthulhu/plugins/meson.build
@@ -5,6 +5,7 @@ subdir('Clipboard')
 subdir('DisplayVersion')
 subdir('HelloCthulhu')
 subdir('IndentationAudio')
 subdir('OCR')
 subdir('PluginManager')
 subdir('SimplePluginSystem')
 subdir('hello_world')
--- a/src/cthulhu/settings.py
+++ b/src/cthulhu/settings.py
@@ -431,7 +431,7 @@ presentChatRoomLast = False
 presentLiveRegionFromInactiveTab = False
 # Plugins
-activePlugins = ['AIAssistant', 'DisplayVersion', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu']
+activePlugins = ['AIAssistant', 'DisplayVersion', 'OCRDesktop', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu']
 # AI Assistant settings (disabled by default for opt-in behavior)
 aiAssistantEnabled = True