diff --git a/CLAUDE.md b/CLAUDE.md index 6b8e302..f6b5d2f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -335,10 +335,417 @@ subprojects/spiel.wrap # Subproject integration 3. **Plugin System**: How to maintain Cthulhu's plugin advantage while integrating Orca improvements? 4. **Version Strategy**: Selective feature backporting vs. major version sync? +## AI Assistant Integration + +### **NEW FEATURE**: AI-Powered Accessibility Assistant +Cthulhu now includes an optional AI assistant plugin for enhanced accessibility support: + +- **Vision Analysis**: Screenshots + AT-SPI data for understanding unlabeled UI elements +- **Safe Actions**: Confirmed element clicking and navigation assistance +- **Multi-Provider Support**: Claude, ChatGPT, Gemini, and Ollama backends +- **Privacy-First**: Disabled by default, requires explicit opt-in and API key configuration + +### AI Assistant Configuration +```bash +# Access via Cthulhu Preferences +~/.local/bin/cthulhu -s # Opens preferences dialog +# Navigate to "AI Assistant" tab +# 1. Check "Enable AI Assistant" +# 2. Select provider (Claude, ChatGPT, Gemini, Ollama) +# 3. Set API key file path +# 4. Configure safety and quality settings +``` + +### AI Provider Setup + +#### 1. Claude (Anthropic) - **Recommended** +```bash +# Get API key from: https://console.anthropic.com/ +# 1. Sign up/login → "Get API Keys" → Create new key +# 2. Copy the key (starts with "sk-ant-...") +# 3. Save to file: +mkdir -p ~/.config/cthulhu +echo "sk-ant-your-actual-key-here" > ~/.config/cthulhu/claude-api-key +chmod 600 ~/.config/cthulhu/claude-api-key + +# Pricing: ~$3 per million input tokens, ~$15 per million output tokens +# Best vision capabilities and safety for accessibility use +``` + +#### 2. ChatGPT (OpenAI) +```bash +# Get API key from: https://platform.openai.com/api-keys +# 1. Sign up/login → "Create new secret key" +# 2. Copy immediately (can't view again, starts with "sk-...") +# 3. 
Save to file: +mkdir -p ~/.config/cthulhu +echo "sk-your-actual-openai-key" > ~/.config/cthulhu/openai-api-key +chmod 600 ~/.config/cthulhu/openai-api-key + +# Pricing: ~$2.50 per million input tokens, ~$10 per million output tokens +# Good vision capabilities, widely supported +``` + +#### 3. Gemini (Google) +```bash +# Get API key from: https://aistudio.google.com/app/apikey +# 1. Sign up/login → "Create API key" +# 2. Copy the generated key +# 3. Save to file: +mkdir -p ~/.config/cthulhu +echo "your-actual-gemini-key" > ~/.config/cthulhu/gemini-api-key +chmod 600 ~/.config/cthulhu/gemini-api-key + +# Pricing: Free tier (15 requests/min), then ~$1.25 per million tokens +# Good for testing, has generous free allowance +``` + +#### 4. Ollama (Local) - **Privacy-Focused** +```bash +# Install Ollama (no API key needed!) +sudo pacman -S ollama # Arch Linux +# OR: curl -fsSL https://ollama.ai/install.sh | sh + +# Start service +systemctl --user enable ollama +systemctl --user start ollama + +# Download vision-capable model (required for AI assistant) +ollama pull llama3.2-vision # 7.9GB download +# OR smaller model: ollama pull moondream # 1.7GB + +# Verify installation +ollama list # Should show downloaded models + +# No API key needed - runs entirely offline! +# Free to use, privacy-focused, but slower than cloud providers +``` + +### AI Assistant Usage Patterns +- **Information Queries**: "What does this unlabeled button do?" +- **Navigation Help**: "Where is the login form?" 
+- **Action Assistance**: "Click the submit button" (with confirmation) +- **Layout Understanding**: "Describe the main sections of this page" + +### Safety Framework +- **Confirmation Required**: All actions require user approval by default +- **Action Descriptions**: Clear explanation of what will happen +- **Safe Defaults**: Conservative timeouts and quality settings +- **Privacy Protection**: API keys stored securely, no data logging + +### Troubleshooting AI Assistant Setup + +#### Common Issues +```bash +# Check if AI settings loaded correctly +~/.local/bin/cthulhu -s # Open preferences, check AI Assistant tab + +# Verify API key file permissions and format +ls -la ~/.config/cthulhu/*-api-key # Should show 600 permissions +cat ~/.config/cthulhu/claude-api-key # Should contain only the API key + +# Test Ollama connection +curl http://localhost:11434/api/version # Should return Ollama version +ollama ps # Should show running models + +# Check dependencies +python3 -c "import requests, PIL; print('Dependencies OK')" + +# Test screenshot capability (requires X11/Wayland) +python3 -c " +from gi.repository import Gdk +window = Gdk.get_default_root_window() +print('Screenshot capability available') +" +``` + +#### Required Permissions +- **File Access**: API key files in `~/.config/cthulhu/` +- **Screen Access**: Screenshot capture (automatic on most setups) +- **Network Access**: HTTP requests to AI providers (except Ollama) +- **AT-SPI Access**: Accessibility tree traversal (enabled by default) + +## Cthulhu Plugin System - Developer Reference + +### **Plugin Architecture Overview** + +Cthulhu uses a **pluggy-based plugin system** with the following components: + +1. **Plugin Manager**: `src/cthulhu/plugin_system_manager.py` - Central plugin loading/management +2. **Base Plugin Class**: `src/cthulhu/plugin.py` - Provides common functionality +3. **Hook System**: Uses `@cthulhu_hookimpl` decorators for lifecycle management +4. 
**Plugin Discovery**: Automatic scanning of `src/cthulhu/plugins/` and `~/.local/share/cthulhu/plugins/` + +### **Plugin Directory Structure** + +Every plugin must follow this exact structure: +``` +src/cthulhu/plugins/YourPlugin/ +├── __init__.py # Import: from .plugin import YourPlugin +├── plugin.py # Main plugin class +├── plugin.info # Metadata (name, version, description) +└── Makefile.am # Build system integration +``` + +### **Essential Plugin Files** + +#### **`__init__.py`** - Package Import +```python +from .plugin import YourPlugin +``` + +#### **`plugin.info`** - Metadata +```ini +name = Your Plugin Name +version = 1.0.0 +description = What your plugin does +authors = Your Name +website = https://example.com +copyright = Copyright 2025 +builtin = false +hidden = false +``` + +#### **`Makefile.am`** - Build Integration +```makefile +cthulhu_python_PYTHON = \ + __init__.py \ + plugin.info \ + plugin.py + +cthulhu_pythondir=$(pkgpythondir)/plugins/YourPlugin +``` + +### **Plugin Class Template** + +```python +#!/usr/bin/env python3 +import logging +from cthulhu.plugin import Plugin, cthulhu_hookimpl + +logger = logging.getLogger(__name__) + +class YourPlugin(Plugin): + """Your plugin description.""" + + def __init__(self, *args, **kwargs): + """Initialize the plugin.""" + super().__init__(*args, **kwargs) + logger.info("YourPlugin initialized") + + # Keybinding storage - use individual variables, NOT dictionaries + self._kb_binding = None + + @cthulhu_hookimpl + def activate(self, plugin=None): + """Activate the plugin.""" + if plugin is not None and plugin is not self: + return + + try: + logger.info("=== YourPlugin activation starting ===") + + # Register keybindings + self._register_keybinding() + + logger.info("YourPlugin activated successfully") + return True + + except Exception as e: + logger.error(f"Error activating YourPlugin: {e}") + return False + + @cthulhu_hookimpl + def deactivate(self, plugin=None): + """Deactivate the plugin.""" + if 
plugin is not None and plugin is not self: + return + + logger.info("Deactivating YourPlugin") + self._kb_binding = None + return True + + def _register_keybinding(self): + """Register plugin keybindings.""" + try: + # CRITICAL: Use this exact parameter order! + self._kb_binding = self.registerGestureByString( + self._your_handler_method, # Handler method (first) + "Description of action", # Description (second) + 'kb:cthulhu+your+keys' # Gesture string (third) + ) + + if self._kb_binding: + logger.info(f"Registered keybinding: {self._kb_binding}") + else: + logger.error("Failed to register keybinding") + + except Exception as e: + logger.error(f"Error registering keybinding: {e}") + + def _your_handler_method(self, script=None, inputEvent=None): + """Handle the keybinding activation.""" + try: + logger.info("Keybinding triggered") + + # Your plugin logic here + + return True + except Exception as e: + logger.error(f"Error in handler: {e}") + return False +``` + +### **🚨 CRITICAL Keybinding Patterns** + +#### **✅ CORRECT Pattern (What Works)** +```python +# Individual binding storage (NOT dictionaries) +self._kb_binding = None +self._kb_binding_action1 = None +self._kb_binding_action2 = None + +# Correct registerGestureByString parameter order +self._kb_binding = self.registerGestureByString( + self._handler_method, # 1st: Handler method + "Action description", # 2nd: Description + 'kb:cthulhu+your+keys' # 3rd: Gesture string +) +``` + +#### **❌ INCORRECT Patterns (What Fails)** +```python +# DON'T use dictionaries for keybinding storage +self._kb_bindings = {} # ❌ WRONG +self._kb_bindings['action'] = self.registerGestureByString(...) 
# ❌ WRONG + +# DON'T use wrong parameter order +self.registerGestureByString( + 'kb:cthulhu+keys', # ❌ WRONG ORDER + "Description", + self._handler_method +) + +# DON'T swap the description and the gesture string +self.registerGestureByString( + self._handler_method, + 'kb:cthulhu+keys', # ❌ WRONG ORDER + "Description" +) +``` + +### **Plugin Registration & Activation** + +#### **Add to Build System** +1. **Add to `src/cthulhu/plugins/Makefile.am`**: + ```makefile + SUBDIRS = YourPlugin OtherPlugin1 OtherPlugin2 ... + ``` + +2. **Add to `configure.ac`**: + ``` + src/cthulhu/plugins/YourPlugin/Makefile + ``` + +#### **Add to Default Active Plugins** +In `src/cthulhu/settings.py`: +```python +activePlugins = ['YourPlugin', 'DisplayVersion', 'PluginManager', ...] +``` + +### **Plugin Lifecycle Events** + +1. **`__init__`**: Plugin instance created +2. **`activate`**: Plugin enabled (register keybindings, connect events) +3. **`deactivate`**: Plugin disabled (cleanup, disconnect) + +**Note**: `activate()` may be called multiple times for different script contexts. 
+ +### **Common Plugin Patterns** + +#### **Settings Integration** +```python +from cthulhu import settings_manager + +class YourPlugin(Plugin): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._settings_manager = settings_manager.getManager() + + def activate(self, plugin=None): + # Check if plugin should be active + enabled = self._settings_manager.getSetting('yourPluginEnabled') + if not enabled: + return +``` + +#### **Message Presentation** +```python +def _present_message(self, message): + """Present a message to the user via speech.""" + try: + if self.app: + state = self.app.getDynamicApiManager().getAPI('CthulhuState') + if state and state.activeScript: + state.activeScript.presentMessage(message, resetStyles=False) + except Exception as e: + logger.error(f"Error presenting message: {e}") +``` + +#### **Sound Generation** +```python +from cthulhu import sound +from cthulhu.sound_generator import Tone + +def _play_sound(self): + player = sound.getPlayer() + tone = Tone(duration=0.15, frequency=400, volumeMultiplier=0.7) + player.play(tone, interrupt=False) +``` + +### **Debugging Plugin Issues** + +#### **Common Debug Techniques** +1. **Add debug output to both logger and print**: + ```python + logger.info("Plugin message") + print("DEBUG: Plugin message") # Shows in terminal + ``` + +2. **Check plugin loading**: + ```python + # In __init__ + with open('/tmp/your_plugin_debug.log', 'a') as f: + f.write("Plugin loaded\n") + ``` + +3. 
**Verify keybinding registration**: + ```python + if self._kb_binding: + print(f"DEBUG: Keybinding registered: {self._kb_binding}") + else: + print("DEBUG: Keybinding registration FAILED") + ``` + +#### **Common Issues & Solutions** + +| Issue | Symptom | Solution | +|-------|---------|----------| +| Plugin not loading | No __init__ debug output | Check `activePlugins` list | +| Keybindings not working | "stored for later registration" | Use correct parameter order | +| Import errors | Plugin fails to activate | Check module imports and dependencies | +| Settings not loading | Default values used | Verify settings key names | + +### **Working Plugin Examples** +- **`DisplayVersion`**: Simple keybinding + message +- **`PluginManager`**: GUI dialog + settings management +- **`IndentationAudio`**: Event listening + sound generation +- **`AIAssistant`**: Complex settings + multi-keybinding + external APIs + ## D-Bus Remote Controller Integration -### **NEW FEATURE**: D-Bus Service for Remote Control -Cthulhu now includes a D-Bus service (ported from Orca v49.alpha) for external control and automation: +### **EXISTING FEATURE**: D-Bus Service for Remote Control +Cthulhu includes a D-Bus service (ported from Orca v49.alpha) for external control and automation: - **Service Name**: `org.stormux.Cthulhu.Service` - **Object Path**: `/org/stormux/Cthulhu/Service` diff --git a/configure.ac b/configure.ac index cb631d5..ad5b4a2 100644 --- a/configure.ac +++ b/configure.ac @@ -133,6 +133,7 @@ src/cthulhu/scripts/toolkits/Qt/Makefile src/cthulhu/scripts/toolkits/WebKitGtk/Makefile src/cthulhu/scripts/toolkits/gtk/Makefile src/cthulhu/plugins/Makefile +src/cthulhu/plugins/AIAssistant/Makefile src/cthulhu/plugins/ByeCthulhu/Makefile src/cthulhu/plugins/HelloCthulhu/Makefile src/cthulhu/plugins/Clipboard/Makefile diff --git a/cthulhu-autostart.desktop b/cthulhu-autostart.desktop index 65c42ee..bd0dc0c 100644 --- a/cthulhu-autostart.desktop +++ b/cthulhu-autostart.desktop @@ -1,46 
+1,59 @@ [Desktop Entry] Type=Application Name[an]=Lector de pantalla Cthulhu -Name[ast]=Llector de pantalla Cthulhu -Name[be]=ÐÑÑÐÑÑÐ Cthulhu -Name[bg]=Cthulhu âÐÐÐ ÑÑÑName[bs]=Cthulhu ÄtaÄekrana +Name[ast]=Llector de pantalla d'Cthulhu +Name[be]=Чытач з экрана Cthulhu +Name[bg]=Cthulhu — екранен четец +Name[bs]=Cthulhu čitač ekrana Name[ca]=Lector de pantalla Cthulhu -Name[cs]=ÄeÄa obrazovky Cthulhu -Name[da]=SkÃmlÃeren Cthulhu +Name[cs]=Čtečka obrazovky Cthulhu +Name[da]=Skærmlæseren Cthulhu Name[de]=Cthulhu-Bildschirmleser -Name[el]=ÎÎÎÏÎ ÎÏÎ Cthulhu +Name[el]=Αναγνώστης οθόνης Cthulhu Name[en_GB]=Cthulhu Screen Reader -Name[eo]=Ekranlegilo Cthulhu +Name[eo]=Ekranlegilo Orko Name[es]=Lector de pantalla Cthulhu Name[eu]=Cthulhu pantaila-irakurlea -Name[fa]=ØØâØ Cthulhu -Name[fi]=Cthulhu-nÃtÃlukija -Name[fr]=Lecteur dâran Cthulhu +Name[fa]=صفحه‌خوان اورکا +Name[fi]=Cthulhu-näytönlukija +Name[fr]=Lecteur d’écran Cthulhu Name[gl]=Lector da pantalla Cthulhu -Name[he]=××××× Cthulhu -Name[hi]=Cthulhu àààà -Name[hu]=Cthulhu kÃernyÅlvasÃName[id]=Pembaca Layar Cthulhu -Name[is]=Cthulhu skjÃestur +Name[he]=מקריא המסך Cthulhu +Name[hi]=Cthulhu स्क्रीन वाचक +Name[hu]=Orka képernyőolvasó +Name[id]=Pembaca Layar Cthulhu +Name[is]=Cthulhu skjálestur Name[it]=Lettore schermo Cthulhu -Name[ka]=Cthulhu - ááááááName[kk]=Cthulhu ÑÑÐÐ ÐÐÑ ÒÐÐÐÑ -Name[lt]=Cthulhu ekrano skaityklÄName[lv]=Cthulhu ekrÄa lasÄÄs -Name[mk]=Cthulhu ÑÑÑÐ ÐÑÐÑName[nb]=Cthulhu skjermleser -Name[ne]=Cthulhu àà ààame[nl]=Cthulhu schermlezer +Name[ka]=Cthulhu - ეკრანის მკითხველი +Name[kk]=Cthulhu экраннан оқитын қолданбасы +Name[lt]=Cthulhu ekrano skaityklė +Name[lv]=Cthulhu ekrāna lasītājs +Name[mk]=Cthulhu читач на екранот +Name[nb]=Cthulhu skjermleser +Name[ne]=ओर्का दृष्टि वाचक +Name[nl]=Cthulhu schermlezer Name[oc]=Lector d'ecran Cthulhu -Name[pa]=Cthulhu ààààame[pl]=Czytnik ekranowy Cthulhu -Name[pt]=Leitor de ecrÃCthulhu +Name[pa]=ਓਰਕਾ ਸਕਰੀਨ ਰੀਡਰ +Name[pl]=Czytnik ekranowy Cthulhu +Name[pt]=Leitor de ecrã 
Cthulhu Name[pt_BR]=Leitor de tela Cthulhu Name[ro]=Cititorul de ecran Cthulhu -Name[ru]=ÐÑÐÑ ÐÐÐ Cthulhu -Name[sl]=Zaslonski bralnik Cthulhu -Name[sr]=ÐÑÑÐÑÐ Cthulhu -Name[sv]=Cthulhu skÃmlÃare -Name[ta]=Cthulhu àààà -Name[te]=Cthulhu ààà -Name[tg]=ÐÐÐÐ ÑÑÐ Cthulhu +Name[ru]=Экранный диктор Cthulhu +Name[sl]=Zaslonski bralnik Orka +Name[sr]=Читач екрана Орка +Name[sv]=Cthulhu skärmläsare +Name[ta]=ஆர்கா திரை படிப்பி +Name[te]=ఓర్కా తెరచదువరి +Name[tg]=Хонандаи экрани Cthulhu Name[tr]=Cthulhu Ekran Okuyucu -Name[ug]=Cthulhu ØÙØ ØÙØ -Name[uk]=ÐÑÑÐÐ ÑÑÐÑÐÐÑÐ ÂthulhuÂName[zh_CN]=Cthulhu åèName[zh_HK]=Cthulhu èèName=Cthulhu Screen Reader +Name[ug]=Cthulhu ئېكران ئوقۇغۇ +Name[uk]=Інструмент читання з екрана «Cthulhu» +Name[zh_CN]=Cthulhu 屏幕阅读器 +Name[zh_HK]=Cthulhu 螢幕閱讀器 +Name=Cthulhu Screen Reader Exec=cthulhu NoDisplay=true +AutostartCondition=GSettings org.gnome.desktop.a11y.applications screen-reader-enabled X-GNOME-AutoRestart=true +#X-GNOME-Autostart-Phase=Initialization +OnlyShowIn=GNOME;MATE;Unity;Cinnamon; diff --git a/distro-packages/Arch-Linux/PKGBUILD b/distro-packages/Arch-Linux/PKGBUILD index 2b44725..8389027 100644 --- a/distro-packages/Arch-Linux/PKGBUILD +++ b/distro-packages/Arch-Linux/PKGBUILD @@ -31,6 +31,10 @@ depends=( python-dasbus libpeas + # AI Assistant dependencies (for screenshots and HTTP requests) + python-pillow + python-requests + # Desktop integration gsettings-desktop-schemas hicolor-icon-theme @@ -49,6 +53,12 @@ optdepends=( 'festival: Alternative TTS engine' 'flite: Lightweight TTS engine' 'espeak: Legacy TTS engine' + + # AI Assistant providers (optional) + 'python-anthropic: Claude AI provider support' + 'python-openai: ChatGPT AI provider support' + 'python-google-generativeai: Gemini AI provider support' + 'ollama: Local AI model support' ) makedepends=( git diff --git a/src/cthulhu/cthulhu-setup.ui b/src/cthulhu/cthulhu-setup.ui index c8ab565..9bf7933 100644 --- a/src/cthulhu/cthulhu-setup.ui +++ b/src/cthulhu/cthulhu-setup.ui @@ 
-3412,6 +3412,211 @@ False + + + True + False + 12 + 12 + 12 + 12 + 6 + 12 + + + Enable AI Assistant + True + True + False + True + True + + + + 0 + 0 + 2 + + + + + True + False + start + _Provider: + True + aiProviderCombo + + + 0 + 1 + + + + + True + False + True + + Claude (Anthropic) + ChatGPT (OpenAI) + Gemini (Google) + Ollama (Local) + + + + + 1 + 1 + + + + + True + False + start + API _Key File: + True + aiApiKeyEntry + + + 0 + 2 + + + + + True + False + 6 + + + True + True + True + Path to API key file + + + + True + True + 0 + + + + + _Browse... + True + True + True + True + + + + False + False + 1 + + + + + 1 + 2 + + + + + Require confirmation before AI actions + True + True + False + True + True + True + + + + 0 + 3 + 2 + + + + + True + False + start + Ollama _Model: + True + aiOllamaModelEntry + + + 0 + 4 + + + + + True + True + True + llama3.2-vision + Model name for Ollama (e.g., llama3.2-vision) + + + + 1 + 4 + + + + + True + False + start + Screenshot _Quality: + True + aiScreenshotQualityCombo + + + 0 + 5 + + + + + True + False + True + 1 + + Low + Medium + High + + + + + 1 + 5 + + + + + 8 + + + + + True + False + AI Assistant + + + 8 + False + + True diff --git a/src/cthulhu/cthulhuVersion.py b/src/cthulhu/cthulhuVersion.py index ae202e3..3cf47f9 100644 --- a/src/cthulhu/cthulhuVersion.py +++ b/src/cthulhu/cthulhuVersion.py @@ -23,5 +23,5 @@ # Fork of Orca Screen Reader (GNOME) # Original source: https://gitlab.gnome.org/GNOME/orca -version = "2025.08.02" +version = "2025.08.03" codeName = "testing" diff --git a/src/cthulhu/cthulhu_gui_prefs.py b/src/cthulhu/cthulhu_gui_prefs.py index dd7c613..b0470dd 100644 --- a/src/cthulhu/cthulhu_gui_prefs.py +++ b/src/cthulhu/cthulhu_gui_prefs.py @@ -1815,6 +1815,10 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): self.__initProfileCombo() if self.script.app: self.get_widget('profilesFrame').set_sensitive(False) + + # AI Assistant settings + # + self._initAIState() def 
__initProfileCombo(self):
         """Adding available profiles and setting active as the active one"""
@@ -1842,6 +1846,74 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
         """Get available user profiles."""
         return _settingsManager.availableProfiles()
 
+    def _aiProviderChoices(self):
+        """Return the provider constants in aiProviderCombo row order."""
+        return [settings.AI_PROVIDER_CLAUDE, settings.AI_PROVIDER_CHATGPT,
+                settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA]
+
+    def _aiScreenshotQualityChoices(self):
+        """Return the quality constants in aiScreenshotQualityCombo row order."""
+        return [settings.AI_SCREENSHOT_QUALITY_LOW,
+                settings.AI_SCREENSHOT_QUALITY_MEDIUM,
+                settings.AI_SCREENSHOT_QUALITY_HIGH]
+
+    def _initAIState(self):
+        """Initialize AI Assistant tab widgets with current settings.
+
+        Values are read from self.prefsDict, falling back to the defaults
+        in the settings module.  The widget references stored here are
+        used again by _updateAIControlsState and aiApiKeyBrowseClicked.
+        """
+        prefs = self.prefsDict
+
+        # Store widget references
+        self.enableAICheckButton = self.get_widget("enableAICheckButton")
+        self.aiProviderCombo = self.get_widget("aiProviderCombo")
+        self.aiApiKeyEntry = self.get_widget("aiApiKeyEntry")
+        self.aiOllamaModelEntry = self.get_widget("aiOllamaModelEntry")
+        self.aiConfirmationCheckButton = self.get_widget("aiConfirmationCheckButton")
+        self.aiScreenshotQualityCombo = self.get_widget("aiScreenshotQualityCombo")
+
+        # Set enable AI checkbox
+        enabled = prefs.get("aiAssistantEnabled", settings.aiAssistantEnabled)
+        self.enableAICheckButton.set_active(enabled)
+
+        # Set provider combo; unknown values fall back to Claude (row 0)
+        provider = prefs.get("aiProvider", settings.aiProvider)
+        providers = self._aiProviderChoices()
+        providerIndex = providers.index(provider) if provider in providers else 0
+        self.aiProviderCombo.set_active(providerIndex)
+
+        # Set API key file; a stored None must not reach Gtk.Entry.set_text
+        apiKeyFile = prefs.get("aiApiKeyFile", settings.aiApiKeyFile)
+        self.aiApiKeyEntry.set_text(apiKeyFile or "")
+
+        # Set Ollama model
+        ollamaModel = prefs.get("aiOllamaModel", settings.aiOllamaModel)
+        self.aiOllamaModelEntry.set_text(ollamaModel or "")
+
+        # Set confirmation checkbox
+        confirmationRequired = prefs.get("aiConfirmationRequired", settings.aiConfirmationRequired)
+        self.aiConfirmationCheckButton.set_active(confirmationRequired)
+
+        # Set screenshot quality combo; unknown values fall back to medium (row 1)
+        quality = prefs.get("aiScreenshotQuality", settings.aiScreenshotQuality)
+        qualities = self._aiScreenshotQualityChoices()
+        qualityIndex = qualities.index(quality) if quality in qualities else 1
+        self.aiScreenshotQualityCombo.set_active(qualityIndex)
+
+        # Enable/disable controls based on AI enabled state
+        self._updateAIControlsState(enabled)
+
+    def _updateAIControlsState(self, enabled):
+        """Enable or disable AI controls based on AI enabled state."""
+        self.aiProviderCombo.set_sensitive(enabled)
+        self.aiApiKeyEntry.set_sensitive(enabled)
+        self.aiOllamaModelEntry.set_sensitive(enabled)
+        self.aiConfirmationCheckButton.set_sensitive(enabled)
+        self.aiScreenshotQualityCombo.set_sensitive(enabled)
+        self.get_widget("aiApiKeyBrowseButton").set_sensitive(enabled)
+
     def _updateCthulhuModifier(self):
         combobox = self.get_widget("cthulhuModifierComboBox")
         keystring = ", ".join(self.prefsDict["cthulhuModifierKeys"])
@@ -3573,4 +3645,100 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
 
             self._populateKeyBindings()
 
         self.__initProfileCombo()
+
+    # AI Assistant signal handlers
+
+    def enableAIToggled(self, widget):
+        """Enable AI Assistant checkbox toggled handler"""
+        enabled = widget.get_active()
+        self.prefsDict["aiAssistantEnabled"] = enabled
+        self._updateAIControlsState(enabled)
+
+        # Auto-enable/disable the AIAssistant plugin based on preference
+        self._updateAIPluginState(enabled)
+
+    def aiProviderChanged(self, widget):
+        """AI Provider combo box changed handler"""
+        providers = self._aiProviderChoices()
+        activeIndex = widget.get_active()
+        # get_active() is negative when no row is selected
+        if 0 <= activeIndex < len(providers):
+            self.prefsDict["aiProvider"] = providers[activeIndex]
+
+    def aiApiKeyChanged(self, widget):
+        """AI API key file entry changed handler"""
+        self.prefsDict["aiApiKeyFile"] = widget.get_text()
+
+    def aiOllamaModelChanged(self, widget):
+        """AI Ollama model entry changed handler"""
+        self.prefsDict["aiOllamaModel"] = widget.get_text()
+
+    def aiApiKeyBrowseClicked(self, widget):
+        """AI API key browse button clicked handler
+
+        Runs a modal file chooser and, if the user confirms, copies the
+        chosen path into the API key entry and self.prefsDict.
+        """
+        # The dialog parent must be a Gtk.Window, so use the window that
+        # contains the clicked button instead of this wrapper object.
+        toplevel = widget.get_toplevel()
+        parent = toplevel if isinstance(toplevel, Gtk.Window) else None
+        dialog = Gtk.FileChooserDialog(
+            title="Select API Key File",
+            parent=parent,
+            action=Gtk.FileChooserAction.OPEN
+        )
+        dialog.add_buttons(
+            Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL,
+            Gtk.STOCK_OPEN, Gtk.ResponseType.OK
+        )
+
+        try:
+            response = dialog.run()
+            if response == Gtk.ResponseType.OK:
+                filename = dialog.get_filename()
+                # get_filename() can return None; leave the setting alone then
+                if filename:
+                    self.aiApiKeyEntry.set_text(filename)
+                    self.prefsDict["aiApiKeyFile"] = filename
+        finally:
+            # Always close the dialog, even if something above raised
+            dialog.destroy()
+
+    def aiConfirmationToggled(self, widget):
+        """AI confirmation required checkbox toggled handler"""
+        self.prefsDict["aiConfirmationRequired"] = widget.get_active()
+
+    def aiScreenshotQualityChanged(self, widget):
+        """AI screenshot quality combo box changed handler"""
+        qualities = self._aiScreenshotQualityChoices()
+        activeIndex = widget.get_active()
+        if 0 <= activeIndex < len(qualities):
+            self.prefsDict["aiScreenshotQuality"] = qualities[activeIndex]
+
+    def _updateAIPluginState(self, enabled):
+        """Enable or disable the AIAssistant plugin in activePlugins list.
+
+        Works on a copy of the list so that neither the list already held
+        in self.prefsDict nor settings.activePlugins is modified in place;
+        self.prefsDict["activePlugins"] is replaced only on a change.
+        """
+        try:
+            activePlugins = list(self.prefsDict.get("activePlugins", settings.activePlugins))
+
+            if enabled:
+                # Add AIAssistant to active plugins if not already there
+                if "AIAssistant" not in activePlugins:
+                    activePlugins.insert(0, "AIAssistant")  # Add at beginning for priority
+                    self.prefsDict["activePlugins"] = activePlugins
+                    print(f"DEBUG: Added AIAssistant to activePlugins: {activePlugins}")
+            else:
+                # Remove AIAssistant from active plugins
+                if "AIAssistant" in activePlugins:
+                    activePlugins.remove("AIAssistant")
+                    self.prefsDict["activePlugins"] = activePlugins
+                    print(f"DEBUG: Removed AIAssistant from activePlugins: {activePlugins}")
+
+        except Exception as e:
+            print(f"DEBUG: Error updating AI plugin state: {e}")
diff --git a/src/cthulhu/plugins/AIAssistant/Makefile.am b/src/cthulhu/plugins/AIAssistant/Makefile.am
new file mode 100644
index
0000000..f919766
--- /dev/null
+++ b/src/cthulhu/plugins/AIAssistant/Makefile.am
@@ -0,0 +1,7 @@
+cthulhu_python_PYTHON = \
+    __init__.py \
+    plugin.info \
+    plugin.py \
+    ai_providers.py
+
+cthulhu_pythondir=$(pkgpythondir)/plugins/AIAssistant
\ No newline at end of file
diff --git a/src/cthulhu/plugins/AIAssistant/__init__.py b/src/cthulhu/plugins/AIAssistant/__init__.py
new file mode 100644
index 0000000..34733fd
--- /dev/null
+++ b/src/cthulhu/plugins/AIAssistant/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2025 Stormux
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., Franklin Street, Fifth Floor,
+# Boston MA 02110-1301 USA.
+
+"""AI Assistant plugin package."""
+
+from .plugin import AIAssistant
\ No newline at end of file
diff --git a/src/cthulhu/plugins/AIAssistant/ai_providers.py b/src/cthulhu/plugins/AIAssistant/ai_providers.py
new file mode 100644
index 0000000..00dfd69
--- /dev/null
+++ b/src/cthulhu/plugins/AIAssistant/ai_providers.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2025 Stormux
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+"""AI providers for the AI Assistant plugin.
+
+A provider turns a task ("describe", "question" or "action") plus the
+accessibility data for the current screen into an HTTP request and returns
+the model's reply as text.  The public task methods catch exceptions:
+failures are logged and returned as an error string.
+"""
+
+import logging
+import json
+import requests
+from abc import ABC, abstractmethod
+
+logger = logging.getLogger(__name__)
+
+# Prefix of the error text returned by the public task methods, per task type.
+_TASK_ERROR_PREFIXES = {
+    "describe": "Error getting screen description",
+    "question": "Error answering question",
+    "action": "Error suggesting actions",
+}
+
+
+class AIProvider(ABC):
+    """Abstract base class for AI providers."""
+
+    # Provider name used in log messages; subclasses override it.
+    provider_name = "AI"
+
+    def __init__(self, api_key=None, model=None, **kwargs):
+        """Store the API key, model name and any provider-specific options.
+
+        A string key is stripped of surrounding whitespace because a key file
+        written with ``echo key > file`` ends in a newline, and that newline
+        must not end up inside an HTTP header.
+        """
+        self.api_key = api_key.strip() if isinstance(api_key, str) else api_key
+        self.model = model
+        self.kwargs = kwargs
+
+    @abstractmethod
+    def describe_screen(self, screenshot_data, accessibility_data):
+        """Generate a description of the current screen."""
+        pass
+
+    @abstractmethod
+    def answer_question(self, question, screenshot_data, accessibility_data):
+        """Answer a question about the current screen/focus."""
+        pass
+
+    @abstractmethod
+    def suggest_actions(self, request, screenshot_data, accessibility_data):
+        """Suggest actions to accomplish a user's request."""
+        pass
+
+    def _prepare_system_prompt(self, task_type):
+        """Return the system prompt for "describe", "question" or "action".
+
+        Any other task type gets only the shared introduction.
+        """
+        base_prompt = """You are an AI assistant helping a screen reader user navigate and interact with computer applications. You have access to:
+
+1. A screenshot of the current screen
+2. Detailed accessibility tree information about UI elements
+3. Information about the currently focused element
+
+The user is using the Cthulhu screen reader, so they cannot see the screen visually. Your responses should be clear, concise, and focused on accessibility.
+
+"""
+
+        if task_type == "describe":
+            return base_prompt + """Your task: Provide a clear, structured description of what's on the screen. Focus on:
+- Main UI elements and their layout
+- Current focus location
+- Available actions and navigation options
+- Any important visual information not captured in accessibility data
+
+Keep descriptions concise but informative."""
+
+        elif task_type == "question":
+            return base_prompt + """Your task: Answer the user's question about the current screen or focused element. Use both the screenshot and accessibility data to provide accurate, helpful information.
+
+Be specific and actionable in your responses."""
+
+        elif task_type == "action":
+            return base_prompt + """Your task: Analyze the user's action request and suggest specific steps to accomplish it. Consider:
+- Current focus and context
+- Available UI elements that can accomplish the task
+- Safest and most efficient approach
+- Any potential risks or confirmations needed
+
+Provide step-by-step instructions that can be executed via accessibility APIs."""
+
+        return base_prompt
+
+    def _format_user_prompt(self, task_type, user_input, accessibility_data):
+        """Return the accessibility data as JSON followed by the task request.
+
+        Shared by the providers' ``_build_prompt`` methods.  ``user_input`` is
+        only used for the "question" and "action" task types.
+        """
+        prompt = f"Current accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
+
+        if task_type == "describe":
+            prompt += "Please describe what's on this screen."
+        elif task_type == "question":
+            prompt += f"User question: {user_input}"
+        elif task_type == "action":
+            prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this."
+
+        return prompt
+
+    def _run_task(self, task_type, user_input, screenshot_data, accessibility_data):
+        """Build the prompt for a task, send it, and return the reply text.
+
+        Uses the subclass's ``_build_prompt`` and ``_make_request``.  Any
+        exception is logged and returned as an error string, so callers
+        always get text they can present to the user.
+        """
+        try:
+            prompt = self._build_prompt(task_type, user_input, accessibility_data)
+            return self._make_request(prompt, screenshot_data, task_type)
+        except Exception as e:
+            logger.error(f"{self.provider_name} {task_type} error: {e}")
+            return f"{_TASK_ERROR_PREFIXES.get(task_type, 'Error')}: {e}"
+
+
+class ClaudeProvider(AIProvider):
+    """Claude AI provider using Anthropic's API."""
+
+    provider_name = "Claude"
+
+    def __init__(self, api_key, model="claude-3-5-sonnet-20241022", **kwargs):
+        """Set up the Messages API endpoint and the request headers."""
+        super().__init__(api_key, model, **kwargs)
+        self.base_url = "https://api.anthropic.com/v1/messages"
+        self.headers = {
+            "Content-Type": "application/json",
+            "X-API-Key": self.api_key,
+            "anthropic-version": "2023-06-01"
+        }
+
+    def describe_screen(self, screenshot_data, accessibility_data):
+        """Generate a description using Claude."""
+        return self._run_task("describe", None, screenshot_data, accessibility_data)
+
+    def answer_question(self, question, screenshot_data, accessibility_data):
+        """Answer a question using Claude."""
+        return self._run_task("question", question, screenshot_data, accessibility_data)
+
+    def suggest_actions(self, request, screenshot_data, accessibility_data):
+        """Suggest actions using Claude."""
+        return self._run_task("action", request, screenshot_data, accessibility_data)
+
+    def _build_prompt(self, task_type, user_input, accessibility_data):
+        """Build the user message for Claude.
+
+        The system prompt is not part of it; ``_make_request`` sends that in
+        the request's separate ``system`` field.
+        """
+        return self._format_user_prompt(task_type, user_input, accessibility_data)
+
+    def _make_request(self, prompt, screenshot_data, task_type="describe"):
+        """Send one message to the Claude API and return the reply text.
+
+        ``task_type`` selects the system prompt, so question and action
+        requests are no longer sent with the "describe" instructions.  It
+        defaults to "describe" for callers using the two-argument form.
+
+        ``screenshot_data``, when truthy, must provide ``'format'`` (used to
+        build the ``image/<format>`` media type) and ``'data'`` (passed on as
+        the base64 image payload).
+
+        Returns the model's text on HTTP 200 and an error string otherwise.
+        """
+        try:
+            # Prepare the message content
+            content = [
+                {
+                    "type": "text",
+                    "text": prompt
+                }
+            ]
+
+            # Add screenshot if available
+            if screenshot_data:
+                content.append({
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": f"image/{screenshot_data['format']}",
+                        "data": screenshot_data['data']
+                    }
+                })
+
+            payload = {
+                "model": self.model,
+                "max_tokens": 1000,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": content
+                    }
+                ],
+                "system": self._prepare_system_prompt(task_type)
+            }
+
+            response = requests.post(
+                self.base_url,
+                headers=self.headers,
+                json=payload,
+                timeout=30
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                return result['content'][0]['text']
+            else:
+                error_msg = f"Claude API error {response.status_code}: {response.text}"
+                logger.error(error_msg)
+                return error_msg
+
+        except requests.RequestException as e:
+            error_msg = f"Network error contacting Claude: {e}"
+            logger.error(error_msg)
+            return error_msg
+        except Exception as e:
+            error_msg = f"Unexpected error with Claude API: {e}"
+            logger.error(error_msg)
+            return error_msg
+
+
+class OllamaProvider(AIProvider):
+    """Ollama local AI provider."""
+
+    provider_name = "Ollama"
+
+    def __init__(self, model="llama3.2-vision", base_url="http://localhost:11434", **kwargs):
+        """Remember the model and the base URL of the Ollama server."""
+        super().__init__(model=model, **kwargs)
+        # Avoid "//api/generate" when the configured URL ends with a slash.
+        self.base_url = base_url.rstrip("/")
+
+    def describe_screen(self, screenshot_data, accessibility_data):
+        """Generate a description using Ollama."""
+        return self._run_task("describe", None, screenshot_data, accessibility_data)
+
+    def answer_question(self, question, screenshot_data, accessibility_data):
+        """Answer a question using Ollama."""
+        return self._run_task("question", question, screenshot_data, accessibility_data)
+
+    def suggest_actions(self, request, screenshot_data, accessibility_data):
+        """Suggest actions using Ollama."""
+        return self._run_task("action", request, screenshot_data, accessibility_data)
+
+    def _build_prompt(self, task_type, user_input, accessibility_data):
+        """Build the complete prompt for Ollama.
+
+        The generate request carries a single prompt string, so the system
+        prompt for ``task_type`` is prepended to the user prompt.
+        """
+        system_prompt = self._prepare_system_prompt(task_type)
+        user_prompt = self._format_user_prompt(task_type, user_input, accessibility_data)
+        return f"{system_prompt}\n\n{user_prompt}"
+
+    def _make_request(self, prompt, screenshot_data, task_type=None):
+        """Send the prompt to Ollama's generate endpoint and return the reply.
+
+        ``screenshot_data`` is currently not sent (see the TODO below), and
+        ``task_type`` is accepted only for parity with the other providers:
+        the task is already encoded in ``prompt``.
+
+        Returns the ``response`` field on HTTP 200 and an error string
+        otherwise.
+        """
+        try:
+            # For Ollama, we'll use the generate endpoint
+            payload = {
+                "model": self.model,
+                "prompt": prompt,
+                "stream": False
+            }
+
+            # Note: Ollama vision support varies by model
+            # For now, we'll send text-only requests
+            # TODO: Add image support when Ollama vision models are more stable
+
+            response = requests.post(
+                f"{self.base_url}/api/generate",
+                json=payload,
+                timeout=60  # Ollama can be slower
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                return result.get('response', 'No response from Ollama')
+            else:
+                error_msg = f"Ollama API error {response.status_code}: {response.text}"
+                logger.error(error_msg)
+                return error_msg
+
+        except requests.RequestException as e:
+            error_msg = f"Network error contacting Ollama: {e}"
+            logger.error(error_msg)
+            return error_msg
+        except Exception as e:
+            error_msg = f"Unexpected error with Ollama API: {e}"
+            logger.error(error_msg)
+            return error_msg
+
+
+def create_provider(provider_type, **kwargs):
+    """Factory function to create AI providers.
+
+    Only "claude" and "ollama" are implemented in this module; ``kwargs``
+    are passed to the provider's constructor.
+
+    Raises:
+        ValueError: if ``provider_type`` is anything else.
+    """
+    # NOTE(review): the preferences dialog also offers ChatGPT and Gemini,
+    # but no provider class for them exists here, so selecting either one
+    # ends in the ValueError below -- confirm how the plugin handles that.
+    if provider_type == "claude":
+        return ClaudeProvider(**kwargs)
+    elif provider_type == "ollama":
+        return OllamaProvider(**kwargs)
+    else:
+        raise ValueError(f"Unknown provider type: {provider_type}")
\ No newline at end of file
diff --git a/src/cthulhu/plugins/AIAssistant/plugin.info b/src/cthulhu/plugins/AIAssistant/plugin.info
new file mode 100644
index 0000000..ed2b929
--- /dev/null
+++ b/src/cthulhu/plugins/AIAssistant/plugin.info
@@ -0,0 +1,8 @@
+name = AI Assistant
+version = 1.0.0
+description = AI-powered accessibility assistant for analyzing screens and taking actions
+authors = Stormux
+website = https://stormux.org
+copyright = Copyright 2025
+builtin = false
+hidden = false
\ No newline at end of file
diff --git a/src/cthulhu/plugins/AIAssistant/plugin.py b/src/cthulhu/plugins/AIAssistant/plugin.py
new file mode 100644
index
0000000..ab53610 --- /dev/null +++ b/src/cthulhu/plugins/AIAssistant/plugin.py @@ -0,0 +1,727 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Stormux +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +"""AI Assistant plugin for Cthulhu screen reader.""" + +import logging +import os +import json +import base64 +from io import BytesIO + +import gi +gi.require_version('Gdk', '3.0') +gi.require_version('GdkPixbuf', '2.0') +gi.require_version('Atspi', '2.0') +gi.require_version('Gtk', '3.0') +from gi.repository import Gdk, GdkPixbuf, Atspi, Gtk + +from cthulhu.plugin import Plugin, cthulhu_hookimpl +from cthulhu import settings +from cthulhu import settings_manager +from cthulhu import cthulhu_state +from cthulhu import ax_object +from cthulhu import ax_utilities +from cthulhu.ax_utilities_state import AXUtilitiesState +from cthulhu.plugins.AIAssistant.ai_providers import create_provider + +logger = logging.getLogger(__name__) + +class AIAssistant(Plugin): + """AI-powered accessibility assistant plugin. 
+ + Provides AI-enhanced accessibility features including: + - Screen analysis using screenshots and AT-SPI data + - Natural language queries about UI elements + - Safe action assistance with user confirmation + - Multi-provider AI support (Claude, ChatGPT, Gemini, Ollama) + """ + + def __init__(self, *args, **kwargs): + """Initialize the AI Assistant plugin.""" + super().__init__(*args, **kwargs) + logger.info("AI Assistant plugin initialized") + print("DEBUG: AI Assistant plugin __init__ called") + + # Write to a debug file so we can see if the plugin is being loaded + try: + with open('/tmp/ai_assistant_debug.log', 'a') as f: + f.write("AI Assistant plugin __init__ called\n") + except: + pass + + # Keybinding storage + self._kb_binding_activate = None + self._kb_binding_question = None + self._kb_binding_describe = None + + # AI provider and settings + self._provider_type = None + self._ai_provider = None + self._api_key = None + self._ollama_model = None + self._settings_manager = settings_manager.getManager() + + # Plugin enabled state + self._enabled = False + + # Pre-captured screen data (to avoid capturing dialog itself) + self._current_screen_data = None + + @cthulhu_hookimpl + def activate(self, plugin=None): + """Activate the AI Assistant plugin.""" + if plugin is not None and plugin is not self: + return + + try: + logger.info("=== AI Assistant plugin activation starting ===") + print("DEBUG: AI Assistant plugin activation starting") + + # Check if AI Assistant is enabled in settings + enabled = self._settings_manager.getSetting('aiAssistantEnabled') + print(f"DEBUG: AI Assistant enabled setting: {enabled}") + if not enabled: + logger.info("AI Assistant is disabled in settings, skipping activation") + print("DEBUG: AI Assistant is disabled in settings, skipping activation") + return + + # Load AI settings + self._load_ai_settings() + + # Check if we have valid configuration + if not self._validate_configuration(): + logger.warning("AI Assistant 
configuration invalid, skipping activation") + return + + # Initialize AI provider + self._initialize_ai_provider() + + # Register keybindings only if configuration is valid + self._register_keybindings() + + self._enabled = True + logger.info("AI Assistant plugin activated successfully") + print("DEBUG: AI Assistant plugin activated successfully") + + except Exception as e: + logger.error(f"Error activating AI Assistant plugin: {e}") + import traceback + logger.error(traceback.format_exc()) + + @cthulhu_hookimpl + def deactivate(self, plugin=None): + """Deactivate the AI Assistant plugin.""" + if plugin is not None and plugin is not self: + return + + logger.info("Deactivating AI Assistant plugin") + + # Unregister keybindings + self._unregister_keybindings() + + self._enabled = False + + def _load_ai_settings(self): + """Load AI Assistant settings from Cthulhu configuration.""" + try: + # Get provider + provider = self._settings_manager.getSetting('aiProvider') + self._provider_type = provider or settings.AI_PROVIDER_CLAUDE + + # Load API key from file + api_key_file = self._settings_manager.getSetting('aiApiKeyFile') + if api_key_file and os.path.isfile(api_key_file): + with open(api_key_file, 'r') as f: + self._api_key = f.read().strip() + else: + self._api_key = None + + # Load Ollama model + self._ollama_model = self._settings_manager.getSetting('aiOllamaModel') + if not self._ollama_model: + self._ollama_model = settings.aiOllamaModel + + logger.info(f"AI settings loaded: provider={self._provider_type}, " + f"api_key_configured={bool(self._api_key)}, " + f"ollama_model={self._ollama_model}") + + except Exception as e: + logger.error(f"Error loading AI settings: {e}") + + def _validate_configuration(self): + """Validate AI Assistant configuration.""" + if not self._provider_type: + logger.warning("No AI provider configured") + return False + + # Ollama doesn't need an API key + if self._provider_type == settings.AI_PROVIDER_OLLAMA: + return 
self._check_ollama_availability() + + # Other providers need API keys + if not self._api_key: + logger.warning(f"No API key configured for provider {self._provider_type}") + return False + + return True + + def _check_ollama_availability(self): + """Check if Ollama is available and has vision models.""" + try: + import requests + # Check if Ollama is running + response = requests.get('http://localhost:11434/api/version', timeout=5) + if response.status_code == 200: + logger.info("Ollama service is available") + return True + else: + logger.warning("Ollama service not responding") + return False + except Exception as e: + logger.warning(f"Ollama not available: {e}") + return False + + def _initialize_ai_provider(self): + """Initialize the AI provider based on settings.""" + try: + if self._provider_type == settings.AI_PROVIDER_CLAUDE: + self._ai_provider = create_provider("claude", api_key=self._api_key) + elif self._provider_type == settings.AI_PROVIDER_OLLAMA: + self._ai_provider = create_provider("ollama", model=self._ollama_model) + else: + logger.error(f"Unsupported provider type: {self._provider_type}") + return False + + logger.info(f"AI provider initialized: {self._provider_type}") + return True + + except Exception as e: + logger.error(f"Error initializing AI provider: {e}") + return False + + def _register_keybindings(self): + """Register AI Assistant keybindings.""" + try: + # Main AI Assistant activation - avoid conflict with Actions + self._kb_binding_activate = self.registerGestureByString( + self._handle_ai_activate, + "Activate AI Assistant", + 'kb:cthulhu+control+shift+a' + ) + + # Ask question about current focus + self._kb_binding_question = self.registerGestureByString( + self._handle_ai_question, + "Ask AI about current focus", + 'kb:cthulhu+control+shift+q' + ) + + # Describe current screen + self._kb_binding_describe = self.registerGestureByString( + self._handle_ai_describe, + "AI describe current screen", + 'kb:cthulhu+control+shift+d' + ) + 
+ logger.info("AI Assistant keybindings registered") + print(f"DEBUG: AI Assistant keybindings registered - activate: {self._kb_binding_activate}, question: {self._kb_binding_question}, describe: {self._kb_binding_describe}") + + except Exception as e: + logger.error(f"Error registering AI keybindings: {e}") + + def _unregister_keybindings(self): + """Unregister AI Assistant keybindings.""" + # Keybindings are automatically cleaned up when plugin deactivates + self._kb_binding_activate = None + self._kb_binding_question = None + self._kb_binding_describe = None + + def _handle_ai_activate(self, script=None, inputEvent=None): + """Handle main AI Assistant activation.""" + try: + logger.info("AI Assistant activation requested") + print("DEBUG: AI Assistant activation keybinding triggered!") + + if not self._enabled: + print("DEBUG: AI Assistant not enabled, presenting message") + self._present_message("AI Assistant is not enabled") + return True + + # For now, just show status until Phase 5 adds the action interface + if self._ai_provider: + provider_name = self._provider_type.title() + self._present_message(f"AI Assistant ready using {provider_name}. Press D to describe screen, Q to ask questions.") + else: + self._present_message("AI Assistant not properly configured. Check settings.") + + return True + + except Exception as e: + logger.error(f"Error in AI activate handler: {e}") + return False + + def _handle_ai_question(self, script=None, inputEvent=None): + """Handle AI question request.""" + try: + logger.info("AI question requested") + + if not self._enabled: + self._present_message("AI Assistant is not enabled") + return True + + if not self._ai_provider: + self._present_message("AI provider not available. 
Check configuration.") + return True + + # IMPORTANT: Collect screen data BEFORE opening dialog + # This captures the actual window the user is asking about + self._present_message("AI Assistant capturing screen data...") + self._current_screen_data = self._collect_ai_data() + + if not self._current_screen_data: + self._present_message("Could not collect screen data for analysis") + return True + + # Now show question dialog + self._show_question_dialog() + + return True + + except Exception as e: + logger.error(f"Error in AI question handler: {e}") + return False + + def _handle_ai_describe(self, script=None, inputEvent=None): + """Handle AI screen description request.""" + try: + logger.info("AI screen description requested") + + if not self._enabled: + self._present_message("AI Assistant is not enabled") + return True + + # Use AI to describe the current screen + if not self._ai_provider: + self._present_message("AI provider not available. Check configuration.") + return True + + self._present_message("AI Assistant analyzing screen...") + + # Collect data and get AI description + data = self._collect_ai_data() + if data: + try: + response = self._ai_provider.describe_screen( + data.get('screenshot'), + data.get('accessibility') + ) + self._present_message(response) + except Exception as e: + logger.error(f"Error getting AI screen description: {e}") + self._present_message(f"Error getting AI screen description: {e}") + else: + self._present_message("Could not collect screen data for analysis") + + return True + + except Exception as e: + logger.error(f"Error in AI describe handler: {e}") + return False + + def _present_message(self, message): + """Present a message to the user via speech.""" + try: + if self.app: + state = self.app.getDynamicApiManager().getAPI('CthulhuState') + if state and state.activeScript: + state.activeScript.presentMessage(message, resetStyles=False) + else: + logger.warning("No active script available for message presentation") + else: + 
logger.warning("No app reference available for message presentation") + except Exception as e: + logger.error(f"Error presenting message: {e}") + + def _capture_screenshot(self): + """Capture a screenshot of the current display.""" + try: + # Get the default display and root window + display = Gdk.Display.get_default() + if not display: + logger.error("No display available for screenshot") + return None + + screen = display.get_default_screen() + root_window = screen.get_root_window() + + # Get screen dimensions + width = screen.get_width() + height = screen.get_height() + + # Capture the screenshot + pixbuf = Gdk.pixbuf_get_from_window(root_window, 0, 0, width, height) + + if not pixbuf: + logger.error("Failed to capture screenshot") + return None + + # Convert to base64 for AI transmission + success, buffer = pixbuf.save_to_bufferv("png", [], []) + if not success: + logger.error("Failed to save pixbuf to buffer") + return None + image_data = base64.b64encode(buffer).decode('utf-8') + + logger.info(f"Screenshot captured: {width}x{height}") + return { + 'format': 'png', + 'width': width, + 'height': height, + 'data': image_data + } + + except Exception as e: + logger.error(f"Error capturing screenshot: {e}") + return None + + def _get_accessibility_tree(self): + """Get accessibility tree information for the current focus.""" + try: + # Get the current focus object + focus_obj = cthulhu_state.locusOfFocus + if not focus_obj: + logger.warning("No focus object available") + return None + + # Collect accessibility information + tree_data = { + 'focus': self._serialize_ax_object(focus_obj), + 'context': [] + } + + # Get parent context (up to 3 levels) + parent = ax_object.AXObject.get_parent(focus_obj) + level = 0 + while parent and level < 3: + tree_data['context'].append(self._serialize_ax_object(parent)) + parent = ax_object.AXObject.get_parent(parent) + level += 1 + + # Get children of focus (if any) + child_count = ax_object.AXObject.get_child_count(focus_obj) + if 
child_count > 0: + children = [] + for i in range(min(child_count, 10)): # Limit to first 10 + child = ax_object.AXObject.get_child(focus_obj, i) + if child: + children.append(self._serialize_ax_object(child)) + if children: + tree_data['children'] = children + + logger.info(f"Accessibility tree collected for {ax_object.AXObject.get_name(focus_obj) or 'unnamed object'}") + return tree_data + + except Exception as e: + logger.error(f"Error getting accessibility tree: {e}") + return None + + def _serialize_ax_object(self, obj): + """Serialize an accessibility object to JSON-compatible format.""" + try: + if not obj: + return None + + return { + 'name': ax_object.AXObject.get_name(obj) or '', + 'role': ax_object.AXObject.get_role_name(obj) or '', + 'description': ax_object.AXObject.get_description(obj) or '', + 'text': self._get_object_text(obj), + 'value': self._get_object_value(obj), + 'states': self._get_object_states(obj), + 'attributes': self._get_object_attributes(obj), + 'position': self._get_object_position(obj) + } + + except Exception as e: + logger.error(f"Error serializing accessibility object: {e}") + return None + + def _get_object_text(self, obj): + """Get text content from an accessibility object.""" + try: + # Use script utilities to get displayed text if available + if cthulhu_state.activeScript and hasattr(cthulhu_state.activeScript, 'utilities'): + try: + text = cthulhu_state.activeScript.utilities.displayedText(obj) + if text: + return text.strip() + except: + pass + + # Fallback: try direct AT-SPI text interface + try: + if ax_object.AXObject.supports_text(obj): + text_iface = obj.queryText() + if text_iface: + text = text_iface.getText(0, -1) + if text: + return text.strip() + except: + pass + + return "" + + except Exception as e: + logger.error(f"Error getting object text: {e}") + return "" + + def _get_object_value(self, obj): + """Get value from an accessibility object.""" + try: + if ax_object.AXObject.supports_value(obj): + try: + 
value_iface = obj.queryValue() + if value_iface: + return str(value_iface.currentValue) or "" + except: + pass + return "" + + except Exception as e: + logger.error(f"Error getting object value: {e}") + return "" + + def _get_object_states(self, obj): + """Get state information from an accessibility object.""" + try: + states = [] + if AXUtilitiesState.is_focused(obj): + states.append("focused") + if AXUtilitiesState.is_selected(obj): + states.append("selected") + if AXUtilitiesState.is_expanded(obj): + states.append("expanded") + if AXUtilitiesState.is_checked(obj): + states.append("checked") + if AXUtilitiesState.is_sensitive(obj): + states.append("sensitive") + if AXUtilitiesState.is_showing(obj): + states.append("showing") + if AXUtilitiesState.is_visible(obj): + states.append("visible") + + return states + + except Exception as e: + logger.error(f"Error getting object states: {e}") + return [] + + def _get_object_attributes(self, obj): + """Get attributes from an accessibility object.""" + try: + attrs = {} + + # Get object attributes from AT-SPI + try: + if hasattr(obj, 'get_attributes'): + obj_attrs = obj.get_attributes() + if obj_attrs: + attrs['object_attributes'] = dict(obj_attrs) + except: + pass + + return attrs + + except Exception as e: + logger.error(f"Error getting object attributes: {e}") + return {} + + def _get_object_position(self, obj): + """Get position and size information from an accessibility object.""" + try: + if hasattr(obj, 'queryComponent'): + component = obj.queryComponent() + if component: + extents = component.getExtents(Atspi.CoordType.SCREEN) + return { + 'x': extents.x, + 'y': extents.y, + 'width': extents.width, + 'height': extents.height + } + return None + + except Exception as e: + logger.error(f"Error getting object position: {e}") + return None + + def _collect_ai_data(self): + """Collect both screenshot and accessibility data for AI analysis.""" + try: + logger.info("Collecting AI data (screenshot + accessibility tree)") + 
+ # Collect both types of data + screenshot = self._capture_screenshot() + accessibility_tree = self._get_accessibility_tree() + + data = { + 'timestamp': __import__('time').time(), + 'screenshot': screenshot, + 'accessibility': accessibility_tree + } + + # Add current application context + if cthulhu_state.activeScript: + app_name = getattr(cthulhu_state.activeScript, 'name', 'unknown') + data['application'] = app_name + + logger.info("AI data collection completed") + return data + + except Exception as e: + logger.error(f"Error collecting AI data: {e}") + return None + + def _show_question_dialog(self): + """Show a dialog for the user to enter their question.""" + try: + dialog = Gtk.Dialog( + title="AI Assistant Question", + parent=None, + flags=Gtk.DialogFlags.MODAL, + buttons=( + Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, + Gtk.STOCK_OK, Gtk.ResponseType.OK + ) + ) + + dialog.set_default_size(500, 200) + + # Create the question entry + content_area = dialog.get_content_area() + + label = Gtk.Label(label="Enter your question about the current screen:") + label.set_halign(Gtk.Align.START) + content_area.pack_start(label, False, False, 10) + + entry = Gtk.Entry() + entry.set_placeholder_text("What would you like to know?") + entry.set_activates_default(True) + content_area.pack_start(entry, False, False, 10) + + dialog.set_default_response(Gtk.ResponseType.OK) + dialog.show_all() + + # Set focus to the entry + entry.grab_focus() + + response = dialog.run() + + if response == Gtk.ResponseType.OK: + question = entry.get_text().strip() + if question: + # Transform dialog to show processing and response + self._transform_dialog_for_response(dialog, question) + else: + dialog.destroy() + self._present_message("No question entered") + else: + dialog.destroy() + self._present_message("Question cancelled") + + except Exception as e: + logger.error(f"Error showing question dialog: {e}") + self._present_message(f"Error showing question dialog: {e}") + + def 
_transform_dialog_for_response(self, dialog, question): + """Transform the question dialog to show AI processing and response.""" + try: + # Clear existing content + content_area = dialog.get_content_area() + for child in content_area.get_children(): + content_area.remove(child) + + # Remove existing buttons + for child in dialog.get_action_area().get_children(): + dialog.get_action_area().remove(child) + + # Change title + dialog.set_title("AI Assistant Response") + + # Show question and processing message + question_label = Gtk.Label() + question_label.set_markup(f"Question: {question}") + question_label.set_line_wrap(True) + question_label.set_halign(Gtk.Align.START) + content_area.pack_start(question_label, False, False, 10) + + # Processing label (will be updated with response) + self._response_label = Gtk.Label(label="Processing your question...") + self._response_label.set_line_wrap(True) + self._response_label.set_halign(Gtk.Align.START) + self._response_label.set_selectable(True) # Allow text selection + content_area.pack_start(self._response_label, True, True, 10) + + # Add close button + close_button = dialog.add_button(Gtk.STOCK_CLOSE, Gtk.ResponseType.CLOSE) + dialog.set_default_response(Gtk.ResponseType.CLOSE) + + # Resize for response content + dialog.set_default_size(600, 400) + dialog.show_all() + + # Focus the response label so screen reader announces it + self._response_label.grab_focus() + + # Process question asynchronously + self._process_user_question_async(dialog, question) + + except Exception as e: + logger.error(f"Error transforming dialog: {e}") + dialog.destroy() + self._present_message(f"Error showing response: {e}") + + def _process_user_question_async(self, dialog, question): + """Process the user's question and update dialog with response.""" + try: + # Use the pre-captured screen data (captured before dialog opened) + data = self._current_screen_data + if data: + try: + response = self._ai_provider.answer_question( + question, + 
data.get('screenshot'), + data.get('accessibility') + ) + + # Update the response label + self._response_label.set_markup(f"Response:\n{response}") + + # Also speak the response + self._present_message(response) + + # Set up dialog close handler + def on_response(dialog, response_id): + dialog.destroy() + + dialog.connect("response", on_response) + + except Exception as e: + logger.error(f"Error getting AI response: {e}") + self._response_label.set_markup(f"Error: {e}") + self._present_message(f"Error getting AI response: {e}") + else: + self._response_label.set_markup("Error: No screen data available") + self._present_message("No screen data available") + + except Exception as e: + logger.error(f"Error processing user question: {e}") + self._response_label.set_markup(f"Error: {e}") + self._present_message(f"Error processing question: {e}") diff --git a/src/cthulhu/plugins/Makefile.am b/src/cthulhu/plugins/Makefile.am index e311901..e99b790 100644 --- a/src/cthulhu/plugins/Makefile.am +++ b/src/cthulhu/plugins/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = Clipboard DisplayVersion IndentationAudio PluginManager hello_world self_voice ByeCthulhu HelloCthulhu SimplePluginSystem +SUBDIRS = AIAssistant Clipboard DisplayVersion IndentationAudio PluginManager hello_world self_voice ByeCthulhu HelloCthulhu SimplePluginSystem cthulhu_pythondir=$(pkgpythondir)/plugins diff --git a/src/cthulhu/settings.py b/src/cthulhu/settings.py index 0c4e1a8..e979097 100644 --- a/src/cthulhu/settings.py +++ b/src/cthulhu/settings.py @@ -148,7 +148,15 @@ userCustomizableSettings = [ "sayAllContextLandmark", "sayAllContextNonLandmarkForm", "sayAllContextList", - "sayAllContextTable" + "sayAllContextTable", + "aiAssistantEnabled", + "aiProvider", + "aiApiKeyFile", + "aiOllamaModel", + "aiConfirmationRequired", + "aiActionTimeout", + "aiScreenshotQuality", + "aiMaxContextLength" ] GENERAL_KEYBOARD_LAYOUT_DESKTOP = 1 @@ -188,6 +196,16 @@ CHAT_SPEAK_ALL = 0 CHAT_SPEAK_ALL_IF_FOCUSED = 1 
CHAT_SPEAK_FOCUSED_CHANNEL = 2 +# AI Assistant constants +AI_PROVIDER_CLAUDE = "claude" +AI_PROVIDER_CHATGPT = "chatgpt" +AI_PROVIDER_GEMINI = "gemini" +AI_PROVIDER_OLLAMA = "ollama" + +AI_SCREENSHOT_QUALITY_LOW = "low" +AI_SCREENSHOT_QUALITY_MEDIUM = "medium" +AI_SCREENSHOT_QUALITY_HIGH = "high" + DEFAULT_VOICE = "default" UPPERCASE_VOICE = "uppercase" HYPERLINK_VOICE = "hyperlink" @@ -413,4 +431,14 @@ presentChatRoomLast = False presentLiveRegionFromInactiveTab = False # Plugins -activePlugins = ['DisplayVersion', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] +activePlugins = ['AIAssistant', 'DisplayVersion', 'PluginManager', 'HelloCthulhu', 'ByeCthulhu'] + +# AI Assistant settings (disabled by default for opt-in behavior) +aiAssistantEnabled = False +aiProvider = AI_PROVIDER_CLAUDE +aiApiKeyFile = "" +aiOllamaModel = "llama3.2-vision" +aiConfirmationRequired = True +aiActionTimeout = 30 +aiScreenshotQuality = AI_SCREENSHOT_QUALITY_MEDIUM +aiMaxContextLength = 4000