Initial work on OCR integration. It is currently broken.

This commit is contained in:
Storm Dragon
2025-08-21 21:59:29 -04:00
parent 03f13140fe
commit 41c91ffc66
7 changed files with 583 additions and 2 deletions

View File

@@ -23,5 +23,5 @@
# Fork of Orca Screen Reader (GNOME) # Fork of Orca Screen Reader (GNOME)
# Original source: https://gitlab.gnome.org/GNOME/orca # Original source: https://gitlab.gnome.org/GNOME/orca
version = "2025.08.19" version = "2025.08.21"
codeName = "testing" codeName = "testing"

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Stormux
# Copyright (c) 2022 Chrys (original ocrdesktop)
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., Franklin Street, Fifth Floor,
# Boston MA 02110-1301 USA.
"""OCRDesktop plugin package."""
from .plugin import OCRDesktop

View File

@@ -0,0 +1,14 @@
# Python modules that make up the OCRDesktop plugin package.
ocrdesktop_python_sources = files([
'__init__.py',
'plugin.py'
])
# Install the modules under cthulhu/plugins/OCRDesktop.
# NOTE(review): `python3` is defined outside this file - presumably the Meson
# Python installation object created by the top-level build; confirm.
python3.install_sources(
ocrdesktop_python_sources,
subdir: 'cthulhu/plugins/OCRDesktop'
)
# Install the plugin metadata file into the same plugin directory.
install_data(
'plugin.info',
install_dir: python3.get_install_dir() / 'cthulhu' / 'plugins' / 'OCRDesktop'
)

View File

@@ -0,0 +1,8 @@
name = OCR Desktop
version = 4.0.0
description = OCR accessibility tool for reading inaccessible windows and dialogs using Tesseract OCR
authors = Storm Dragon <storm_dragon@stormux.org>
website = https://github.com/chrys87/ocrdesktop
copyright = Copyright 2022 Chrys, Copyright 2025 Stormux
builtin = false
hidden = false

View File

@@ -0,0 +1,535 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Stormux
# Copyright (c) 2022 Chrys (original ocrdesktop)
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
"""OCRDesktop plugin for Cthulhu screen reader."""
import logging
import os
import sys
import locale
import time
import re
import tempfile
import threading
from mimetypes import MimeTypes
from cthulhu.plugin import Plugin, cthulhu_hookimpl
from cthulhu import debug
# Import Cthulhu's sound system
try:
from cthulhu import sound
from cthulhu.sound_generator import Tone
SOUND_AVAILABLE = True
except ImportError:
SOUND_AVAILABLE = False
# PIL
try:
from PIL import Image
from PIL import ImageOps
PIL_AVAILABLE = True
except ImportError:
PIL_AVAILABLE = False
# pytesseract
try:
import pytesseract
from pytesseract import Output
PYTESSERACT_AVAILABLE = True
except ImportError:
PYTESSERACT_AVAILABLE = False
# pdf2image
try:
from pdf2image import convert_from_path
PDF2IMAGE_AVAILABLE = True
except ImportError:
PDF2IMAGE_AVAILABLE = False
# scipy
try:
from scipy.spatial import KDTree
SCIPY_AVAILABLE = True
except ImportError:
SCIPY_AVAILABLE = False
# webcolors
try:
from webcolors import CSS3_HEX_TO_NAMES
from webcolors import hex_to_rgb
WEBCOLORS_AVAILABLE = True
except ImportError:
WEBCOLORS_AVAILABLE = False
# GTK/GDK/Wnck
try:
import gi
gi.require_version("Gtk", "3.0")
gi.require_version("Gdk", "3.0")
gi.require_version("Wnck", "3.0")
from gi.repository import Gtk, Gdk, Wnck
GTK_AVAILABLE = True
except ImportError:
GTK_AVAILABLE = False
logger = logging.getLogger(__name__)
class OCRDesktop(Plugin):
"""OCR Desktop accessibility plugin for reading inaccessible windows."""
def __init__(self, *args, **kwargs):
    """Set up default OCR settings, runtime state and the optional sound player.

    All positional and keyword arguments are forwarded unchanged to the
    ``Plugin`` base class.
    """
    super().__init__(*args, **kwargs)
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin initialized", True)

    # Handles returned by registerGestureByString(); filled in on activation.
    self._kb_binding_window = self._kb_binding_desktop = self._kb_binding_clipboard = None

    # Tesseract language and image pre-processing options.
    self._languageCode = 'eng'
    self._scaleFactor = 3            # integer up-scaling applied before OCR
    self._grayscaleImg = self._invertImg = self._blackWhiteImg = False
    self._blackWhiteImgValue = 200   # threshold used by the black/white filter
    self._colorCalculation = False
    self._colorCalculationMax = 3

    # Captured images, recognised text and the screen offset of the capture.
    self._img, self._modifiedImg = [], []
    self._OCRText = ''
    self._offsetXpos = self._offsetYpos = 0
    self._activated = False

    # State driving the audible progress feedback.
    self._is_processing = False
    self._beep_thread = None
    self._stop_beeping = False
    self._player = None

    # Colour analysis caches.
    self._kdtDB = None
    self.colorNames = []
    self.colorCache = {}

    # NOTE(review): setlocale() changes the locale of the whole process, not
    # just this plugin - confirm this is still wanted inside the screen reader.
    locale.setlocale(locale.LC_ALL, 'C')

    # The sound player is optional; failing to obtain it is only logged.
    if SOUND_AVAILABLE:
        try:
            self._player = sound.getPlayer()
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Sound player initialized", True)
        except Exception as error:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Failed to initialize sound: {error}", True)

    # Logs any missing dependency; the boolean result is not used here.
    self._checkDependencies()
def _checkDependencies(self):
    """Report whether every mandatory third-party dependency was importable.

    Returns:
        True when Pillow, pytesseract and the GTK/GDK/Wnck bindings were all
        imported successfully; otherwise False, after logging the names of
        the missing ones.
    """
    requirements = (
        (PIL_AVAILABLE, "python3-pillow"),
        (PYTESSERACT_AVAILABLE, "python-pytesseract"),
        (GTK_AVAILABLE, "GTK3/GDK/Wnck"),
    )
    unavailable = [label for present, label in requirements if not present]
    if not unavailable:
        return True
    joined = ', '.join(unavailable)
    debug.printMessage(debug.LEVEL_INFO,
                       f"OCRDesktop: Missing dependencies: {joined}", True)
    return False
@cthulhu_hookimpl
def activate(self, plugin=None):
    """Hook implementation that enables the plugin and registers its gestures.

    Args:
        plugin: The plugin being activated. The call is ignored when this is
            set and refers to a different plugin instance.

    Nothing is returned. Any exception raised during activation is logged
    together with its traceback and then swallowed.
    """
    if not (plugin is None or plugin is self):
        return
    if self._activated:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Already activated", True)
        return
    try:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin activation starting", True)
        # Preconditions, checked in order; the first failing one aborts activation.
        blockers = (
            (lambda: not self.app,
             "OCRDesktop: ERROR - No app reference"),
            (lambda: not self._checkDependencies(),
             "OCRDesktop: Cannot activate - missing dependencies"),
        )
        for is_blocked, reason in blockers:
            if is_blocked():
                debug.printMessage(debug.LEVEL_INFO, reason, True)
                return
        self._registerKeybindings()
        self._activated = True
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin activated successfully", True)
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error activating: {error}", True)
        import traceback
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: {traceback.format_exc()}", True)
@cthulhu_hookimpl
def deactivate(self, plugin=None):
    """Hook implementation that marks the plugin as inactive.

    Args:
        plugin: The plugin being deactivated. The call is ignored when this
            is set and refers to a different plugin instance.

    NOTE(review): only the ``_activated`` flag is cleared here; the gestures
    stored in ``_kb_binding_*`` are not unregistered - confirm whether the
    framework does that on its own.
    """
    addressed_to_us = plugin is None or plugin is self
    if not addressed_to_us:
        return
    self._activated = False
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Plugin deactivated", True)
def _registerKeybindings(self):
    """Register the three OCR gestures and remember the returned bindings.

    Each binding is stored on the matching ``_kb_binding_*`` attribute as soon
    as it is registered. A failure is logged and stops the remaining
    registrations; it is not propagated to the caller.
    """
    try:
        # (attribute that stores the binding, handler, description, gesture)
        gestures = (
            ("_kb_binding_window", self._ocrActiveWindow,
             "OCR read active window", 'kb:cthulhu+control+w'),
            ("_kb_binding_desktop", self._ocrDesktop,
             "OCR read entire desktop", 'kb:cthulhu+control+d'),
            ("_kb_binding_clipboard", self._ocrClipboard,
             "OCR read image from clipboard", 'kb:cthulhu+control+shift+c'),
        )
        for attribute, handler, description, gesture in gestures:
            binding = self.registerGestureByString(handler, description, gesture)
            setattr(self, attribute, binding)
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Keybindings registered", True)
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error registering keybindings: {error}", True)
def _startProgressBeeps(self):
    """Launch the background thread that emits progress beeps.

    Does nothing (apart from logging) when no sound player was obtained or
    when a beep thread is already alive.

    NOTE(review): the player is only used as a gate here; ``_beepLoop`` itself
    rings the system bell and never touches ``self._player``.
    """
    if not self._player:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Cannot start beeps - no sound player", True)
        return
    running = self._beep_thread
    if running and running.is_alive():
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Beeps already running", True)
        return
    # Clear the stop flag before the worker starts polling it.
    self._stop_beeping = False
    worker = threading.Thread(target=self._beepLoop, daemon=True)
    self._beep_thread = worker
    worker.start()
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Started progress beeps", True)
def _stopProgressBeeps(self):
    """Ask the beep thread to stop and wait up to one second for it to exit."""
    self._stop_beeping = True
    worker = self._beep_thread
    if worker:
        # Bounded wait so a stuck worker cannot block the caller indefinitely.
        worker.join(timeout=1.0)
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Stopped progress beeps", True)
def _beepLoop(self):
    """Ring the system bell about every 0.5 seconds until asked to stop.

    Used as the target of the progress-beep thread. The loop ends when
    ``self._stop_beeping`` becomes true or when emitting the bell raises.
    """
    while not self._stop_beeping:
        try:
            # end="" keeps a stray newline from being written with every beep,
            # and flush=True pushes the BEL out immediately instead of leaving
            # it in the stdout buffer (stdout is block-buffered when it is not
            # a terminal).
            print("\a", end="", flush=True)
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: System bell beep", True)
        except Exception as e:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: System bell error: {e}", True)
            break
        # Sleep 0.5 s in 0.01 s slices so a stop request is noticed quickly.
        for _ in range(50):
            if self._stop_beeping:
                return
            time.sleep(0.01)
def _announceOCRStart(self, ocr_type):
    """Tell the user that an OCR run is starting.

    Args:
        ocr_type: Short description of the OCR source; it is interpolated into
            the spoken and logged message.

    The message is presented through the active script when one is available
    and is always written to the debug log. Errors are logged, not raised.
    """
    try:
        message = f"Performing OCR on {ocr_type}"
        state = self.app.getDynamicApiManager().getAPI('CthulhuState') if self.app else None
        script = state.activeScript if state else None
        if script:
            script.presentMessage(message, resetStyles=False)
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: {message}", True)
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error announcing OCR start: {error}", True)
def _runOCRJob(self, capture, ocr_type, request_label=None):
    """Shared driver for the three OCR gesture handlers.

    Args:
        capture: Zero-argument callable that grabs the image(s) to recognise
            and returns a truthy value on success.
        ocr_type: Source name used in the announcement and in the error log
            ("window", "desktop" or "clipboard").
        request_label: Source name used in the "requested" log line; defaults
            to ``ocr_type``.

    Returns:
        True when the request was handled (including the case where another
        run is already in progress and this one is ignored), False when an
        exception occurred.
    """
    try:
        label = request_label or ocr_type
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: OCR {label} requested", True)
        if self._is_processing:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Already processing, ignoring request", True)
            return True
    except Exception as e:
        # Same recovery the handlers always performed on an unexpected error.
        self._stopProgressBeeps()
        self._is_processing = False
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR {ocr_type}: {e}", True)
        return False

    self._is_processing = True
    try:
        self._announceOCRStart(ocr_type)
        self._startProgressBeeps()
        if capture():
            self._performOCR()
            self._presentOCRResult()
        return True
    except Exception as e:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error in OCR {ocr_type}: {e}", True)
        return False
    finally:
        # Runs exactly once on success and on failure alike.
        self._stopProgressBeeps()
        self._is_processing = False

def _ocrActiveWindow(self, script=None, inputEvent=None):
    """Gesture handler: OCR the active window and present the result.

    Args:
        script: Unused; accepted for the gesture-handler calling convention.
        inputEvent: Unused; accepted for the gesture-handler calling convention.

    Returns:
        True when the request was handled, False when an exception occurred.
    """
    return self._runOCRJob(self._screenShotWindow, "window", "active window")

def _ocrDesktop(self, script=None, inputEvent=None):
    """Gesture handler: OCR the entire desktop and present the result.

    Args:
        script: Unused; accepted for the gesture-handler calling convention.
        inputEvent: Unused; accepted for the gesture-handler calling convention.

    Returns:
        True when the request was handled, False when an exception occurred.
    """
    return self._runOCRJob(self._screenShotDesktop, "desktop")

def _ocrClipboard(self, script=None, inputEvent=None):
    """Gesture handler: OCR the image on the clipboard and present the result.

    Args:
        script: Unused; accepted for the gesture-handler calling convention.
        inputEvent: Unused; accepted for the gesture-handler calling convention.

    Returns:
        True when the request was handled, False when an exception occurred.
    """
    return self._runOCRJob(self._readClipboard, "clipboard")
def _screenShotWindow(self):
    """Capture the active window into ``self._img``.

    The window geometry reported by Wnck is cut out of the root window, and
    its x/y origin is stored in ``_offsetXpos`` / ``_offsetYpos``.

    Returns:
        True when a screenshot was stored, False when GTK is unavailable, no
        active window exists, the capture yields nothing, or an error occurs.
    """
    if not GTK_AVAILABLE:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for screenshots", True)
        return False
    try:
        # Short pause before grabbing; presumably lets the screen settle after
        # the key press - TODO confirm.
        time.sleep(0.3)
        root_window = Gdk.get_default_root_window()
        screen = Wnck.Screen.get_default()
        screen.force_update()
        active = screen.get_active_window()
        if not active:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: No active window found", True)
            return False
        left, top, width, height = active.get_geometry()
        self._offsetXpos, self._offsetYpos = left, top
        shot = Gdk.pixbuf_get_from_window(root_window, left, top, width, height)
        if not shot:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Failed to capture window screenshot", True)
            return False
        self._img = [self._pixbuf2image(shot)]
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Window screenshot captured", True)
        return True
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error taking window screenshot: {error}", True)
        return False
def _screenShotDesktop(self):
    """Capture the whole root window into ``self._img``.

    Returns:
        True when a screenshot was stored, False when GTK is unavailable, the
        capture yields nothing, or an error occurs.
    """
    if not GTK_AVAILABLE:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for screenshots", True)
        return False
    try:
        # Short pause before grabbing; presumably lets the screen settle after
        # the key press - TODO confirm.
        time.sleep(0.3)
        root_window = Gdk.get_default_root_window()
        full_width = root_window.get_width()
        full_height = root_window.get_height()
        shot = Gdk.pixbuf_get_from_window(root_window, 0, 0, full_width, full_height)
        if not shot:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Failed to capture desktop screenshot", True)
            return False
        self._img = [self._pixbuf2image(shot)]
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Desktop screenshot captured", True)
        return True
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error taking desktop screenshot: {error}", True)
        return False
def _readClipboard(self):
    """Load an image from the GTK clipboard into ``self._img``.

    Returns:
        True when an image was found and stored, False when GTK is
        unavailable, the clipboard holds no image, or an error occurs.
    """
    if not GTK_AVAILABLE:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: GTK not available for clipboard", True)
        return False
    try:
        clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        picture = clipboard.wait_for_image()
        if not picture:
            debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: No image found in clipboard", True)
            return False
        self._img = [self._pixbuf2image(picture)]
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Image read from clipboard", True)
        return True
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error reading clipboard: {error}", True)
        return False
def _pixbuf2image(self, pix):
    """Build a PIL image from a GdkPixbuf.

    Args:
        pix: Pixbuf providing ``get_pixels()`` and the ``width``, ``height``,
            ``rowstride`` and ``has_alpha`` properties.

    Returns:
        A PIL image in "RGBA" mode when the pixbuf has an alpha channel,
        otherwise in "RGB" mode.
    """
    raw_pixels = pix.get_pixels()
    props = pix.props
    size = (props.width, props.height)
    row_stride = props.rowstride
    mode = "RGBA" if props.has_alpha else "RGB"
    # The row stride is passed on so the raw decoder steps from row to row by
    # that many bytes.
    return Image.frombytes(mode, size, raw_pixels, "raw", mode, row_stride)
def _scaleImg(self, img):
    """Return ``img`` enlarged by ``self._scaleFactor`` using bicubic resampling.

    Args:
        img: PIL image to enlarge.

    Returns:
        A new image whose width and height are the original ones multiplied
        by the scale factor.
    """
    factor = self._scaleFactor
    width, height = img.size
    enlarged_size = (width * factor, height * factor)
    return img.resize(enlarged_size, Image.Resampling.BICUBIC)
def _transformImg(self, img):
"""Transform image with various filters for better OCR."""
modifiedImg = self._scaleImg(img)
if self._invertImg:
modifiedImg = ImageOps.invert(modifiedImg)
if self._grayscaleImg:
modifiedImg = ImageOps.grayscale(modifiedImg)
if self._blackWhiteImg:
lut = [255 if v > self._blackWhiteImgValue else 0 for v in range(256)]
modifiedImg = modifiedImg.point(lut)
return modifiedImg
def _performOCR(self):
    """Run Tesseract over every captured image and store the cleaned text.

    ``self._OCRText`` is reset, then the text recognised in each image of
    ``self._img`` is appended, followed by a newline. A recognition error for
    one image is logged and the remaining images are still processed. The
    method returns early, leaving the text untouched, when pytesseract is
    unavailable.
    """
    if not PYTESSERACT_AVAILABLE:
        debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Tesseract not available", True)
        return
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: Starting OCR", True)
    self._OCRText = ''
    for captured in self._img:
        prepared = self._transformImg(captured)
        try:
            recognised = pytesseract.image_to_string(
                prepared,
                lang=self._languageCode,
                config='--psm 4',
            )
            self._OCRText += recognised + '\n'
        except Exception as error:
            debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: OCR error: {error}", True)
    # Normalise whitespace in the accumulated text.
    self._cleanOCRText()
    debug.printMessage(debug.LEVEL_INFO, "OCRDesktop: OCR completed", True)
def _cleanOCRText(self):
"""Clean up OCR text output."""
# Remove multiple spaces
regexSpace = re.compile('[^\S\r\n]{2,}')
self._OCRText = regexSpace.sub(' ', self._OCRText)
# Remove empty lines
regexSpace = re.compile('\n\s*\n')
self._OCRText = regexSpace.sub('\n', self._OCRText)
# Remove trailing spaces
regexSpace = re.compile('\s*\n')
self._OCRText = regexSpace.sub('\n', self._OCRText)
# Remove leading spaces
regexSpace = re.compile('^\s')
self._OCRText = regexSpace.sub('', self._OCRText)
# Remove trailing newlines
self._OCRText = self._OCRText.strip()
def _presentOCRResult(self):
    """Present the recognised text, or a "nothing found" notice, to the user.

    The message goes through the active script when one is available; the
    length of the recognised text is always logged. Errors are logged, not
    raised.
    """
    try:
        has_text = bool(self._OCRText.strip())
        message = f"OCR result: {self._OCRText}" if has_text else "No text found in OCR scan"
        state = self.app.getDynamicApiManager().getAPI('CthulhuState') if self.app else None
        script = state.activeScript if state else None
        if script:
            script.presentMessage(message, resetStyles=False)
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Presented result: {len(self._OCRText)} characters", True)
    except Exception as error:
        debug.printMessage(debug.LEVEL_INFO, f"OCRDesktop: Error presenting result: {error}", True)

View File

@@ -5,6 +5,7 @@ subdir('Clipboard')
subdir('DisplayVersion') subdir('DisplayVersion')
subdir('HelloCthulhu') subdir('HelloCthulhu')
subdir('IndentationAudio') subdir('IndentationAudio')
subdir('OCRDesktop')
subdir('PluginManager') subdir('PluginManager')
subdir('SimplePluginSystem') subdir('SimplePluginSystem')
subdir('hello_world') subdir('hello_world')

View File

@@ -431,7 +431,7 @@ presentChatRoomLast = False
presentLiveRegionFromInactiveTab = False presentLiveRegionFromInactiveTab = False
# Plugins # Plugins
activePlugins = ['AIAssistant', 'DisplayVersion', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] activePlugins = ['AIAssistant', 'DisplayVersion', 'OCRDesktop', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu']
# AI Assistant settings (disabled by default for opt-in behavior) # AI Assistant settings (disabled by default for opt-in behavior)
aiAssistantEnabled = True aiAssistantEnabled = True