diff --git a/CLAUDE.md b/CLAUDE.md index f6b5d2f..df17b2c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -356,6 +356,11 @@ Cthulhu now includes an optional AI assistant plugin for enhanced accessibility # 4. Configure safety and quality settings ``` +### AI Assistant Keybindings +- **Cthulhu+Control+Shift+Q**: Ask questions about current screen +- **Cthulhu+Control+Shift+D**: Describe current screen +- **Cthulhu+Control+Shift+A**: Request actions (click, type, copy) + ### AI Provider Setup #### 1. Claude (Anthropic) - **Recommended** @@ -424,14 +429,16 @@ ollama list # Should show downloaded models ### AI Assistant Usage Patterns - **Information Queries**: "What does this unlabeled button do?" - **Navigation Help**: "Where is the login form?" -- **Action Assistance**: "Click the submit button" (with confirmation) +- **Action Assistance**: "Click the submit button", "Type hello world and press enter" - **Layout Understanding**: "Describe the main sections of this page" +- **Text Operations**: "Copy this text to clipboard", "Enter my username in the field" ### Safety Framework - **Confirmation Required**: All actions require user approval by default -- **Action Descriptions**: Clear explanation of what will happen +- **Action Descriptions**: Clear explanation of what will happen before execution - **Safe Defaults**: Conservative timeouts and quality settings - **Privacy Protection**: API keys stored securely, no data logging +- **Action Types**: Click, Type, Copy operations via PyAutoGUI (Wayland/X11 compatible) ### Troubleshooting AI Assistant Setup @@ -449,7 +456,7 @@ curl http://localhost:11434/api/version # Should return Ollama version ollama ps # Should show running models # Check dependencies -python3 -c "import requests, PIL; print('Dependencies OK')" +python3 -c "import requests, PIL, pyautogui; print('Dependencies OK')" # Test screenshot capability (requires X11/Wayland) python3 -c " @@ -464,6 +471,7 @@ print('Screenshot capability available') - **Screen 
Access**: Screenshot capture (automatic on most setups) - **Network Access**: HTTP requests to AI providers (except Ollama) - **AT-SPI Access**: Accessibility tree traversal (enabled by default) +- **Input Synthesis**: PyAutoGUI for action execution (click, type, copy) ## Cthulhu Plugin System - Developer Reference diff --git a/distro-packages/Arch-Linux/PKGBUILD b/distro-packages/Arch-Linux/PKGBUILD index 8389027..8dafb8c 100644 --- a/distro-packages/Arch-Linux/PKGBUILD +++ b/distro-packages/Arch-Linux/PKGBUILD @@ -1,7 +1,7 @@ # Maintainer: Storm Dragon pkgname=cthulhu -pkgver=2025.08.02 +pkgver=2025.08.03 pkgrel=1 pkgdesc="Desktop-agnostic screen reader with plugin system, forked from Orca" url="https://git.stormux.org/storm/cthulhu" @@ -31,9 +31,10 @@ depends=( python-dasbus libpeas - # AI Assistant dependencies (for screenshots and HTTP requests) + # AI Assistant dependencies (for screenshots, HTTP requests, and actions) python-pillow python-requests + python-pyautogui # Desktop integration gsettings-desktop-schemas diff --git a/src/cthulhu/cthulhu-setup.ui b/src/cthulhu/cthulhu-setup.ui index 9bf7933..cec0d39 100644 --- a/src/cthulhu/cthulhu-setup.ui +++ b/src/cthulhu/cthulhu-setup.ui @@ -3458,10 +3458,11 @@ False True + Claude Code (Enhanced) Claude (Anthropic) ChatGPT (OpenAI) Gemini (Google) - Ollama (Local) + Ollama (Local - Free) @@ -3504,13 +3505,14 @@ - - _Browse... 
+ + Get _Claude API Key True True True True - + Open browser to get Claude API key and save automatically + False diff --git a/src/cthulhu/cthulhu_gui_prefs.py b/src/cthulhu/cthulhu_gui_prefs.py index b0470dd..516e6ac 100644 --- a/src/cthulhu/cthulhu_gui_prefs.py +++ b/src/cthulhu/cthulhu_gui_prefs.py @@ -1864,13 +1864,15 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): # Set provider combo provider = prefs.get("aiProvider", settings.aiProvider) - providerIndex = 0 # Default to Claude - if provider == settings.AI_PROVIDER_CHATGPT: + providerIndex = 0 # Default to Claude Code + if provider == settings.AI_PROVIDER_CLAUDE: providerIndex = 1 - elif provider == settings.AI_PROVIDER_GEMINI: + elif provider == settings.AI_PROVIDER_CHATGPT: providerIndex = 2 - elif provider == settings.AI_PROVIDER_OLLAMA: + elif provider == settings.AI_PROVIDER_GEMINI: providerIndex = 3 + elif provider == settings.AI_PROVIDER_OLLAMA: + providerIndex = 4 self.aiProviderCombo.set_active(providerIndex) # Set API key file @@ -1904,7 +1906,39 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): self.aiOllamaModelEntry.set_sensitive(enabled) self.aiConfirmationCheckButton.set_sensitive(enabled) self.aiScreenshotQualityCombo.set_sensitive(enabled) - self.get_widget("aiApiKeyBrowseButton").set_sensitive(enabled) + try: + self.get_widget("aiGetClaudeKeyButton").set_sensitive(enabled) + except: + pass # Button might not exist in older UI files + + # Update provider-specific controls if AI is enabled + if enabled: + current_provider = self.prefsDict.get("aiProvider", settings.aiProvider) + self._updateProviderControls(current_provider) + + def _updateProviderControls(self, provider): + """Update visibility/sensitivity of provider-specific controls.""" + # API key controls (needed for Claude, ChatGPT, Gemini - not for Claude Code or Ollama) + api_key_needed = provider in [settings.AI_PROVIDER_CLAUDE, settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI] + 
self.aiApiKeyEntry.set_sensitive(api_key_needed) + + # Get Claude API Key button (only for the Claude API provider, which is the one that needs a key) + try: + claude_button = self.get_widget("aiGetClaudeKeyButton") + claude_button.set_visible(provider == settings.AI_PROVIDER_CLAUDE) + except Exception: + pass # Button might not exist + + # Ollama model entry (only for Ollama) + self.aiOllamaModelEntry.set_sensitive(provider == settings.AI_PROVIDER_OLLAMA) + + # Update labels based on provider + if provider == settings.AI_PROVIDER_CLAUDE_CODE: + self.aiApiKeyEntry.set_placeholder_text("No API key needed - uses Claude Code CLI") + elif provider == settings.AI_PROVIDER_OLLAMA: + self.aiApiKeyEntry.set_placeholder_text("No API key needed - uses local Ollama") + else: + self.aiApiKeyEntry.set_placeholder_text("Path to API key file") def _updateCthulhuModifier(self): combobox = self.get_widget("cthulhuModifierComboBox") @@ -3645,17 +3679,16 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): enabled = widget.get_active() self.prefsDict["aiAssistantEnabled"] = enabled self._updateAIControlsState(enabled) - - # Auto-enable/disable the AIAssistant plugin based on preference - self._updateAIPluginState(enabled) def aiProviderChanged(self, widget): """AI Provider combo box changed handler""" - providers = [settings.AI_PROVIDER_CLAUDE, settings.AI_PROVIDER_CHATGPT, - settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA] + providers = [settings.AI_PROVIDER_CLAUDE_CODE, settings.AI_PROVIDER_CLAUDE, + settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA] activeIndex = widget.get_active() if 0 <= activeIndex < len(providers): - self.prefsDict["aiProvider"] = providers[activeIndex] + provider = providers[activeIndex] + self.prefsDict["aiProvider"] = provider + self._updateProviderControls(provider) def aiApiKeyChanged(self, widget): """AI API key file entry changed handler""" @@ -3665,6 +3698,98 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): """AI Ollama model entry 
changed handler""" self.prefsDict["aiOllamaModel"] = widget.get_text() + def aiGetClaudeKeyClicked(self, widget): + """Get Claude API Key button clicked handler""" + import subprocess + import os + + try: + # Open browser to Claude API key page + subprocess.run(["xdg-open", "https://console.anthropic.com/"], check=True) + + # Show dialog with instructions + dialog = Gtk.MessageDialog( + parent=self, + flags=Gtk.DialogFlags.MODAL, + type=Gtk.MessageType.INFO, + buttons=Gtk.ButtonsType.OK, + message_format="Claude API Key Setup" + ) + dialog.format_secondary_text( + "Browser opened to get your Claude API key.\n\n" + "💡 TIP: Claude offers $5 free credit, then ~$20/month for Pro.\n" + "Ollama is also available as a free alternative.\n\n" + "Steps:\n" + "1. Sign up or log in to your Anthropic account\n" + "2. Go to 'API Keys' in Account Settings\n" + "3. Click 'Create Key' and copy the API key\n" + "4. Click OK below when you have your key ready\n" + "5. Paste the API key when prompted" + ) + dialog.run() + dialog.destroy() + + # Show API key input dialog + key_dialog = Gtk.MessageDialog( + parent=self, + flags=Gtk.DialogFlags.MODAL, + type=Gtk.MessageType.QUESTION, + buttons=Gtk.ButtonsType.OK_CANCEL, + message_format="Enter Claude API Key" + ) + key_dialog.format_secondary_text("Paste your Claude API key (starts with 'sk-ant-'):") + + # Add text entry to dialog + entry = Gtk.Entry() + entry.set_placeholder_text("sk-ant-your-api-key-here...") + entry.set_visibility(False) # Hide key for security + entry.set_width_chars(50) + key_dialog.get_content_area().pack_start(entry, False, False, 0) + entry.show() + + response = key_dialog.run() + api_key = entry.get_text().strip() + key_dialog.destroy() + + if response == Gtk.ResponseType.OK and api_key: + # Save API key to file + config_dir = os.path.expanduser("~/.local/share/cthulhu") + os.makedirs(config_dir, exist_ok=True) + api_key_file = os.path.join(config_dir, "claude-api-key") + + with os.fdopen(os.open(api_key_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), 'w') as f: # Create owner-only from the start so the key is never readable by other users + 
f.write(api_key) + os.chmod(api_key_file, 0o600) # Also tighten a pre-existing file (os.open's mode only applies on creation) + + # Update GUI + self.get_widget("aiApiKeyEntry").set_text(api_key_file) + self.prefsDict["aiApiKeyFile"] = api_key_file + + # Success message + success_dialog = Gtk.MessageDialog( + parent=self, + flags=Gtk.DialogFlags.MODAL, + type=Gtk.MessageType.INFO, + buttons=Gtk.ButtonsType.OK, + message_format="API Key Saved Successfully" + ) + success_dialog.format_secondary_text(f"Claude API key saved to:\n{api_key_file}") + success_dialog.run() + success_dialog.destroy() + + except Exception as e: + # Error dialog + error_dialog = Gtk.MessageDialog( + parent=self, + flags=Gtk.DialogFlags.MODAL, + type=Gtk.MessageType.ERROR, + buttons=Gtk.ButtonsType.OK, + message_format="Error Setting Up API Key" + ) + error_dialog.format_secondary_text(f"Failed to open browser or save API key:\n{str(e)}") + error_dialog.run() + error_dialog.destroy() + def aiApiKeyBrowseClicked(self, widget): """AI API key browse button clicked handler""" dialog = Gtk.FileChooserDialog( @@ -3698,24 +3823,4 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): if 0 <= activeIndex < len(qualities): self.prefsDict["aiScreenshotQuality"] = qualities[activeIndex] - def _updateAIPluginState(self, enabled): - """Enable or disable the AIAssistant plugin in activePlugins list.""" - try: - activePlugins = self.prefsDict.get("activePlugins", settings.activePlugins[:]) - - if enabled: - # Add AIAssistant to active plugins if not already there - if "AIAssistant" not in activePlugins: - activePlugins.insert(0, "AIAssistant") # Add at beginning for priority - self.prefsDict["activePlugins"] = activePlugins - print(f"DEBUG: Added AIAssistant to activePlugins: {activePlugins}") - else: - # Remove AIAssistant from active plugins - if "AIAssistant" in activePlugins: - activePlugins.remove("AIAssistant") - self.prefsDict["activePlugins"] = activePlugins - print(f"DEBUG: Removed AIAssistant from activePlugins: {activePlugins}") - - except 
Exception as e: - print(f"DEBUG: Error updating AI plugin state: {e}") diff --git a/src/cthulhu/plugins/AIAssistant/ai_providers.py b/src/cthulhu/plugins/AIAssistant/ai_providers.py index 00dfd69..04149f6 100644 --- a/src/cthulhu/plugins/AIAssistant/ai_providers.py +++ b/src/cthulhu/plugins/AIAssistant/ai_providers.py @@ -66,13 +66,40 @@ Keep descriptions concise but informative.""" Be specific and actionable in your responses.""" elif task_type == "action": - return base_prompt + """Your task: Analyze the user's action request and suggest specific steps to accomplish it. Consider: -- Current focus and context -- Available UI elements that can accomplish the task -- Safest and most efficient approach -- Any potential risks or confirmations needed + return base_prompt + """Your task: I WILL EXECUTE THE ACTION IMMEDIATELY - provide the structured response format only. -Provide step-by-step instructions that can be executed via accessibility APIs.""" +🚨 CRITICAL RULES: +- I am the action execution system - I WILL perform the action, not give instructions +- NEVER provide programming code, implementation steps, or "how to" instructions +- NEVER mention ASCII codes, KEY_DOWN events, or technical implementation details +- Always use ACTION TYPE: COPY, TYPE, CLICK, SCROLL, or NAVIGATE +- Be direct about what I will do + +STRICT ACTION TYPE MAPPING: +- "find [element] and enter [text]" → ACTION TYPE: TYPE (I will locate element and type text) +- "type [text]" → ACTION TYPE: TYPE (I will type the specified text) +- "copy [text] to clipboard" → ACTION TYPE: COPY (I will copy to clipboard) +- "click [element]" → ACTION TYPE: CLICK (I will click the element) + +MANDATORY RESPONSE FORMAT: + +**ACTION ANALYSIS**: I will [specific action] +**TARGET ELEMENT**: [element description] +**ACTION TYPE**: [TYPE/COPY/CLICK/SCROLL/NAVIGATE] +**SAFETY CHECK**: [any concerns] +**STEP-BY-STEP**: What I will execute (NO CODE, NO IMPLEMENTATION DETAILS) + +Example for "find edit box and enter 
text": +**ACTION ANALYSIS**: I will locate the edit box and type the specified text into it +**TARGET ELEMENT**: Text input field ("What's on your mind?" edit box) +**ACTION TYPE**: TYPE +**SAFETY CHECK**: This will input text into the focused text field +**STEP-BY-STEP**: +1. Locate the edit box in the interface +2. Focus on the edit box +3. Type the requested text into the field + +🚨 NEVER GIVE PROGRAMMING CODE OR TECHNICAL INSTRUCTIONS""" return base_prompt @@ -188,6 +215,118 @@ class ClaudeProvider(AIProvider): return error_msg +class ClaudeCodeProvider(AIProvider): + """Claude Code CLI provider - uses installed Claude Code application.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # No API key needed - uses Claude Code CLI directly + + def describe_screen(self, screenshot_data, accessibility_data): + """Generate a description using Claude Code CLI.""" + try: + prompt = self._build_prompt("describe", None, accessibility_data) + return self._call_claude_code(prompt) + except Exception as e: + logger.error(f"Claude Code describe error: {e}") + return f"Error getting screen description: {e}" + + def answer_question(self, question, screenshot_data, accessibility_data): + """Answer a question using Claude Code CLI.""" + try: + prompt = self._build_prompt("question", question, accessibility_data) + return self._call_claude_code(prompt) + except Exception as e: + logger.error(f"Claude Code question error: {e}") + return f"Error answering question: {e}" + + def suggest_actions(self, request, screenshot_data, accessibility_data): + """Suggest actions using Claude Code CLI.""" + try: + prompt = self._build_prompt("action", request, accessibility_data) + return self._call_claude_code(prompt) + except Exception as e: + logger.error(f"Claude Code action error: {e}") + return f"Error suggesting actions: {e}" + + def _build_prompt(self, task_type, user_input, accessibility_data): + """Build the complete prompt for Claude Code.""" + import json + + 
system_prompt = self._get_system_prompt(task_type) + + prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" + + if task_type == "describe": + prompt += "Please describe what's on this screen." + elif task_type == "question": + prompt += f"User question: {user_input}" + elif task_type == "action": + prompt += f"User wants to: {user_input}\n\nProvide the action analysis in the required format." + + return prompt + + def _call_claude_code(self, prompt): + """Call Claude Code CLI with the prompt.""" + import subprocess + + try: + # Call Claude Code CLI with the prompt directly + result = subprocess.run( + ['claude', '--print', '--output-format', 'text', prompt], + capture_output=True, + text=True, + timeout=60 + ) + + if result.returncode == 0: + return result.stdout.strip() + else: + error_msg = f"Claude Code CLI error: {result.stderr}" + logger.error(error_msg) + return error_msg + + except subprocess.TimeoutExpired: + error_msg = "Claude Code CLI timed out" + logger.error(error_msg) + return error_msg + except Exception as e: + error_msg = f"Error calling Claude Code CLI: {e}" + logger.error(error_msg) + return error_msg + + def _get_system_prompt(self, task_type): + """Get system prompt for Claude Code.""" + base_prompt = """You are Claude Code helping a screen reader user navigate and interact with computer applications. You have expert understanding of terminal commands, programming, and accessibility. + +The user is using the Cthulhu screen reader and cannot see the screen visually. Provide expert technical assistance. + +""" + + if task_type == "action": + return base_prompt + """CRITICAL: I WILL EXECUTE THE ACTION IMMEDIATELY. 
+ +🚨 EXPERT COMMAND UNDERSTANDING: +- "type echo 'hello world'" → Extract exactly: echo 'hello world' +- "run ls -la" → Extract exactly: ls -la +- "execute ./configure --prefix=/usr" → Extract exactly: ./configure --prefix=/usr + +MANDATORY FORMAT: +**ACTION ANALYSIS**: I will [specific action] +**TARGET ELEMENT**: [element description] +**ACTION TYPE**: [TYPE/COPY/CLICK/SCROLL/NAVIGATE] +**SAFETY CHECK**: [assessment] +**STEP-BY-STEP**: What I will execute""" + + elif task_type == "describe": + return base_prompt + "Provide technical descriptions focusing on development tools, terminals, and accessibility elements." + + elif task_type == "question": + return base_prompt + "Answer with expert technical knowledge about programming, terminals, and system operations." + + return base_prompt + + class OllamaProvider(AIProvider): """Ollama local AI provider.""" @@ -279,6 +418,8 @@ def create_provider(provider_type, **kwargs): """Factory function to create AI providers.""" if provider_type == "claude": return ClaudeProvider(**kwargs) + elif provider_type == "claude_code": + return ClaudeCodeProvider(**kwargs) elif provider_type == "ollama": return OllamaProvider(**kwargs) else: diff --git a/src/cthulhu/plugins/AIAssistant/plugin.py b/src/cthulhu/plugins/AIAssistant/plugin.py index ab53610..1e93c2e 100644 --- a/src/cthulhu/plugins/AIAssistant/plugin.py +++ b/src/cthulhu/plugins/AIAssistant/plugin.py @@ -161,9 +161,11 @@ class AIAssistant(Plugin): logger.warning("No AI provider configured") return False - # Ollama doesn't need an API key + # Providers that don't need API keys if self._provider_type == settings.AI_PROVIDER_OLLAMA: return self._check_ollama_availability() + elif self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE: + return self._check_claude_code_availability() # Other providers need API keys if not self._api_key: @@ -188,11 +190,40 @@ class AIAssistant(Plugin): logger.warning(f"Ollama not available: {e}") return False + def 
_check_claude_code_availability(self): + """Check if Claude Code CLI is available.""" + try: + import subprocess + import shutil + + # First check if claude command exists in PATH + if not shutil.which('claude'): + logger.warning("Claude Code CLI not found in PATH") + return False + + # Quick test to see if it responds + result = subprocess.run(['claude', '--version'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0: + logger.info("Claude Code CLI is available") + return True + else: + logger.warning(f"Claude Code CLI not responding: {result.stderr}") + return False + except subprocess.TimeoutExpired: + logger.warning("Claude Code CLI timeout") + return False + except Exception as e: + logger.warning(f"Claude Code CLI not available: {e}") + return False + def _initialize_ai_provider(self): """Initialize the AI provider based on settings.""" try: if self._provider_type == settings.AI_PROVIDER_CLAUDE: self._ai_provider = create_provider("claude", api_key=self._api_key) + elif self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE: + self._ai_provider = create_provider("claude_code") elif self._provider_type == settings.AI_PROVIDER_OLLAMA: self._ai_provider = create_provider("ollama", model=self._ollama_model) else: @@ -244,7 +275,7 @@ class AIAssistant(Plugin): self._kb_binding_describe = None def _handle_ai_activate(self, script=None, inputEvent=None): - """Handle main AI Assistant activation.""" + """Handle main AI Assistant activation - now shows action dialog.""" try: logger.info("AI Assistant activation requested") print("DEBUG: AI Assistant activation keybinding triggered!") @@ -254,12 +285,20 @@ class AIAssistant(Plugin): self._present_message("AI Assistant is not enabled") return True - # For now, just show status until Phase 5 adds the action interface - if self._ai_provider: - provider_name = self._provider_type.title() - self._present_message(f"AI Assistant ready using {provider_name}. 
Press D to describe screen, Q to ask questions.") - else: - self._present_message("AI Assistant not properly configured. Check settings.") + if not self._ai_provider: + self._present_message("AI provider not available. Check configuration.") + return True + + # NEW: Show action request dialog for Phase 5 + self._present_message("AI Assistant capturing screen data for action...") + self._current_screen_data = self._collect_ai_data() + + if not self._current_screen_data: + self._present_message("Could not collect screen data for action") + return True + + # Show action request dialog + self._show_action_dialog() return True @@ -403,7 +442,8 @@ class AIAssistant(Plugin): # Collect accessibility information tree_data = { 'focus': self._serialize_ax_object(focus_obj), - 'context': [] + 'context': [], + 'actionable_elements': [] # New: For action system } # Get parent context (up to 3 levels) @@ -425,6 +465,9 @@ class AIAssistant(Plugin): if children: tree_data['children'] = children + # NEW: Collect actionable elements for AI actions + tree_data['actionable_elements'] = self._collect_actionable_elements() + logger.info(f"Accessibility tree collected for {ax_object.AXObject.get_name(focus_obj) or 'unnamed object'}") return tree_data @@ -432,13 +475,13 @@ class AIAssistant(Plugin): logger.error(f"Error getting accessibility tree: {e}") return None - def _serialize_ax_object(self, obj): + def _serialize_ax_object(self, obj, include_actions=False): """Serialize an accessibility object to JSON-compatible format.""" try: if not obj: return None - return { + data = { 'name': ax_object.AXObject.get_name(obj) or '', 'role': ax_object.AXObject.get_role_name(obj) or '', 'description': ax_object.AXObject.get_description(obj) or '', @@ -449,6 +492,14 @@ class AIAssistant(Plugin): 'position': self._get_object_position(obj) } + # Include action information for actionable elements + if include_actions: + data['actions'] = self._get_object_actions(obj) + data['is_actionable'] = 
self._is_actionable_element(obj) + data['action_coordinates'] = self._get_action_coordinates(obj) + + return data + except Exception as e: logger.error(f"Error serializing accessibility object: {e}") return None @@ -617,6 +668,7 @@ class AIAssistant(Plugin): content_area.pack_start(entry, False, False, 10) dialog.set_default_response(Gtk.ResponseType.OK) + # Show all widgets including buttons dialog.show_all() # Set focus to the entry @@ -725,3 +777,771 @@ class AIAssistant(Plugin): logger.error(f"Error processing user question: {e}") self._response_label.set_markup(f"Error: {e}") self._present_message(f"Error processing question: {e}") + + # ============================================================================ + # NEW: Action System Methods for Phase 5 + # ============================================================================ + + def _collect_actionable_elements(self): + """Collect all actionable elements in the current window for AI analysis.""" + try: + logger.info("Collecting actionable elements for action system") + actionable_elements = [] + + # Get the current application's window + if not cthulhu_state.activeScript: + return actionable_elements + + # Start from the application root + app = cthulhu_state.activeScript.app + if not app: + return actionable_elements + + # Recursively find actionable elements + self._find_actionable_elements_recursive(app, actionable_elements, max_depth=5) + + logger.info(f"Found {len(actionable_elements)} actionable elements") + return actionable_elements + + except Exception as e: + logger.error(f"Error collecting actionable elements: {e}") + return [] + + def _find_actionable_elements_recursive(self, obj, actionable_elements, current_depth=0, max_depth=5): + """Recursively find actionable elements in the accessibility tree.""" + try: + if current_depth >= max_depth: + return + + if not obj: + return + + # Check if this element is actionable + if self._is_actionable_element(obj): + element_data = 
self._serialize_ax_object(obj, include_actions=True) + if element_data and element_data.get('is_actionable'): + actionable_elements.append(element_data) + + # Recurse through children + try: + child_count = ax_object.AXObject.get_child_count(obj) + for i in range(min(child_count, 20)): # Limit children to prevent overflow + child = ax_object.AXObject.get_child(obj, i) + if child: + self._find_actionable_elements_recursive( + child, actionable_elements, current_depth + 1, max_depth + ) + except: + pass + + except Exception as e: + logger.debug(f"Error in recursive element search: {e}") + + def _is_actionable_element(self, obj): + """Determine if an accessibility object is actionable for AI operations.""" + try: + if not obj: + return False + + # Get role and states + role = ax_object.AXObject.get_role_name(obj) + states = self._get_object_states(obj) + + # Define actionable roles + actionable_roles = { + 'push button', 'button', 'toggle button', 'radio button', 'check box', + 'menu item', 'list item', 'tree item', 'tab', 'link', 'entry', 'text', + 'password text', 'combo box', 'spin button', 'slider', 'scroll bar' + } + + # Check if role is actionable + if role and role.lower() in actionable_roles: + # Additional checks for enabled/visible states + if states: + state_names = [state.lower() for state in states] + # Element must be enabled and visible + if 'enabled' in state_names and 'visible' in state_names: + return True + + # Check for specific action interfaces + try: + actions = self._get_object_actions(obj) + if actions and len(actions) > 0: + return True + except: + pass + + return False + + except Exception as e: + logger.debug(f"Error checking if element is actionable: {e}") + return False + + def _get_object_actions(self, obj): + """Get available actions for an accessibility object.""" + try: + actions = [] + + # Check for AT-SPI action interface + try: + if hasattr(obj, 'queryAction'): + action_iface = obj.queryAction() + if action_iface: + action_count = 
action_iface.get_nActions() + for i in range(action_count): + action_name = action_iface.getName(i) + action_desc = action_iface.getDescription(i) + actions.append({ + 'name': action_name or '', + 'description': action_desc or '', + 'index': i + }) + except: + pass + + return actions + + except Exception as e: + logger.debug(f"Error getting object actions: {e}") + return [] + + def _get_action_coordinates(self, obj): + """Get coordinates for performing actions on an object.""" + try: + position = self._get_object_position(obj) + if position and 'x' in position and 'y' in position: + # Calculate center point for clicking + center_x = position['x'] + (position.get('width', 0) // 2) + center_y = position['y'] + (position.get('height', 0) // 2) + + return { + 'center_x': center_x, + 'center_y': center_y, + 'bounds': position + } + + return None + + except Exception as e: + logger.debug(f"Error getting action coordinates: {e}") + return None + + # ============================================================================ + # Action Dialog and Execution Methods + # ============================================================================ + + def _show_action_dialog(self): + """Show dialog for entering action requests.""" + try: + # Create dialog without parent first + dialog = Gtk.Dialog( + title="AI Assistant Actions", + flags=Gtk.DialogFlags.MODAL | Gtk.DialogFlags.DESTROY_WITH_PARENT, + buttons=( + Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, + "Analyze", Gtk.ResponseType.OK + ) + ) + + # Make dialog fully accessible + dialog.set_resizable(True) + dialog.set_modal(True) + dialog.set_type_hint(Gdk.WindowTypeHint.DIALOG) + + dialog.set_default_size(600, 300) + + content_area = dialog.get_content_area() + + # Instruction label + label = Gtk.Label() + label.set_markup("Tell the AI what you want to do:\n" + + "Examples: 'Click the Continue button', 'Enter storm into username field', " + + "'Copy this text to clipboard'") + label.set_line_wrap(True) + 
label.set_halign(Gtk.Align.START) + content_area.pack_start(label, False, False, 10) + + # Action entry + entry = Gtk.Entry() + entry.set_placeholder_text("What would you like me to do?") + entry.set_activates_default(True) + content_area.pack_start(entry, False, False, 10) + + dialog.set_default_response(Gtk.ResponseType.OK) + dialog.show_all() + + entry.grab_focus() + + response = dialog.run() + + if response == Gtk.ResponseType.OK: + action_request = entry.get_text().strip() + if action_request: + # Close this dialog and create a new confirmation dialog + dialog.destroy() + self._show_action_confirmation_dialog(action_request) + else: + dialog.destroy() + self._present_message("No action request entered") + else: + dialog.destroy() + self._present_message("Action request cancelled") + + except Exception as e: + logger.error(f"Error showing action dialog: {e}") + self._present_message(f"Error showing action dialog: {e}") + + def _show_action_confirmation_dialog(self, action_request): + """Show a fresh confirmation dialog for the action.""" + try: + # Create completely new dialog + dialog = Gtk.Dialog( + title="AI Assistant - Confirm Action", + flags=Gtk.DialogFlags.MODAL | Gtk.DialogFlags.DESTROY_WITH_PARENT + ) + + # Add buttons directly during dialog creation + cancel_button = dialog.add_button("Cancel", Gtk.ResponseType.CANCEL) + execute_button = dialog.add_button("Execute Action", Gtk.ResponseType.ACCEPT) + + # Configure dialog + dialog.set_default_size(600, 400) + dialog.set_resizable(True) + dialog.set_type_hint(Gdk.WindowTypeHint.DIALOG) + + # Configure buttons + execute_button.set_sensitive(False) # Disabled until analysis complete + execute_button.set_can_focus(True) + execute_button.set_can_default(True) + cancel_button.set_can_focus(True) + + dialog.set_default_response(Gtk.ResponseType.ACCEPT) + + content_area = dialog.get_content_area() + + # Status label + status_label = Gtk.Label(label="AI is analyzing your request...") + 
status_label.set_halign(Gtk.Align.START) + content_area.pack_start(status_label, False, False, 10) + + # Analysis text view + scrolled = Gtk.ScrolledWindow() + scrolled.set_policy(Gtk.PolicyType.AUTOMATIC, Gtk.PolicyType.AUTOMATIC) + scrolled.set_size_request(550, 300) + + text_view = Gtk.TextView() + text_view.set_editable(False) + text_view.set_wrap_mode(Gtk.WrapMode.WORD) + scrolled.add(text_view) + content_area.pack_start(scrolled, True, True, 10) + + # Store references for updating + self._action_dialog = dialog + self._action_status_label = status_label + self._action_text_view = text_view + self._action_execute_button = execute_button + self._current_action_request = action_request + + # Show everything + dialog.show_all() + + # Set up response handler + def on_response(dialog, response_id): + if response_id == Gtk.ResponseType.ACCEPT: + # Execute the confirmed action + if hasattr(self, '_parsed_action'): + # Close dialog FIRST, then execute action + dialog.destroy() + # Add small delay to let dialog close + from gi.repository import GLib + GLib.timeout_add(500, lambda: self._execute_confirmed_action(self._parsed_action) and False) # "and False" keeps the timeout one-shot: GLib re-invokes the callback while it returns a truthy value + else: + dialog.destroy() + + dialog.connect("response", on_response) + + # Start AI analysis in background + import threading + analysis_thread = threading.Thread( + target=self._analyze_action_request, + args=(action_request,) + ) + analysis_thread.daemon = True + analysis_thread.start() + + # Show dialog (non-blocking) + dialog.present() + + except Exception as e: + logger.error(f"Error showing confirmation dialog: {e}") + self._present_message(f"Error showing confirmation dialog: {e}") + + def _transform_dialog_for_action_analysis(self, dialog, action_request): + """Transform action dialog to show AI analysis and confirmation.""" + try: + # Clear existing content + content_area = dialog.get_content_area() + for child in content_area.get_children(): + content_area.remove(child) + + # Clear existing buttons and add new ones + action_area 
= dialog.get_action_area() + for child in action_area.get_children(): + child.destroy() # Use destroy instead of remove + + # Add new buttons with proper accessibility + cancel_button = Gtk.Button.new_with_label("Cancel") + execute_button = Gtk.Button.new_with_label("Execute Action") + + # Set up button responses + cancel_button.connect("clicked", lambda b: dialog.response(Gtk.ResponseType.CANCEL)) + execute_button.connect("clicked", lambda b: dialog.response(Gtk.ResponseType.ACCEPT)) + + # Add buttons to action area + action_area.pack_start(cancel_button, False, False, 0) + action_area.pack_start(execute_button, False, False, 0) + + # Make buttons accessible and focusable + cancel_button.set_can_focus(True) + execute_button.set_can_focus(True) + execute_button.set_sensitive(False) # Disabled until analysis complete + + # Set default response for Enter key + dialog.set_default_response(Gtk.ResponseType.ACCEPT) + execute_button.set_can_default(True) + execute_button.grab_default() + + # Status label + status_label = Gtk.Label(label="AI is analyzing your request...") + status_label.set_halign(Gtk.Align.START) + content_area.pack_start(status_label, False, False, 10) + + # Analysis text view + scrolled = Gtk.ScrolledWindow() + scrolled.set_policy(Gtk.PolicyType.AUTOMATIC, Gtk.PolicyType.AUTOMATIC) + scrolled.set_size_request(550, 200) + + text_view = Gtk.TextView() + text_view.set_editable(False) + text_view.set_wrap_mode(Gtk.WrapMode.WORD) + scrolled.add(text_view) + content_area.pack_start(scrolled, True, True, 10) + + # Store references for updating + self._action_dialog = dialog + self._action_status_label = status_label + self._action_text_view = text_view + self._action_execute_button = execute_button + self._current_action_request = action_request + + # Show all widgets including new buttons + cancel_button.show() + execute_button.show() + dialog.show_all() + + # Start AI analysis in background + import threading + analysis_thread = threading.Thread( + 
target=self._analyze_action_request, + args=(action_request,) + ) + analysis_thread.daemon = True + analysis_thread.start() + + # Set up response handlers + def on_response(dialog, response_id): + if response_id == Gtk.ResponseType.ACCEPT: + # Execute the confirmed action + if hasattr(self, '_parsed_action'): + self._execute_confirmed_action(self._parsed_action) + dialog.destroy() + + dialog.connect("response", on_response) + + except Exception as e: + logger.error(f"Error transforming action dialog: {e}") + dialog.destroy() + self._present_message(f"Error in action dialog: {e}") + + def _analyze_action_request(self, action_request): + """Analyze action request using AI (runs in background thread).""" + try: + logger.info(f"Analyzing action request: {action_request}") + + # Use AI to analyze the request + analysis = self._ai_provider.suggest_actions( + action_request, + self._current_screen_data.get('screenshot'), + self._current_screen_data.get('accessibility') + ) + + # Update UI on main thread + from gi.repository import GLib + GLib.idle_add(self._update_action_analysis, analysis) + + except Exception as e: + logger.error(f"Error analyzing action request: {e}") + from gi.repository import GLib + GLib.idle_add(self._update_action_analysis, f"Error analyzing request: {e}") + + def _update_action_analysis(self, analysis): + """Update action dialog with AI analysis results.""" + try: + if not hasattr(self, '_action_dialog'): + return False + + # Update status + self._action_status_label.set_text("AI Analysis Complete:") + + # Update analysis text + buffer = self._action_text_view.get_buffer() + buffer.set_text(analysis) + + # Enable execute button if analysis looks successful + if not analysis.startswith("Error"): + self._action_execute_button.set_sensitive(True) + self._parsed_action = self._parse_action_response(analysis) + # Set focus to execute button so user can press Enter + self._action_execute_button.grab_focus() + self._action_execute_button.grab_default() 
+ logger.info("Execute button enabled and focused") + else: + logger.error(f"Analysis failed: {analysis}") + + self._present_message("Action analysis complete. Review and confirm in dialog. Press Tab to navigate, Enter to execute, Escape to cancel.") + + except Exception as e: + logger.error(f"Error updating action analysis: {e}") + + return False # Don't repeat idle callback + + def _parse_action_response(self, analysis): + """Parse AI action response into executable commands.""" + try: + # Look for structured ACTION TYPE in AI response + action_data = { + 'type': 'unknown', + 'target': None, + 'value': None, + 'coordinates': None, + 'description': analysis + } + + analysis_lower = analysis.lower() + + # Look for explicit ACTION TYPE markers first + if '**action type**:' in analysis_lower: + import re + action_type_match = re.search(r'\*\*action type\*\*:\s*(\w+)', analysis_lower) + if action_type_match: + action_type = action_type_match.group(1).lower() + if action_type in ['click', 'type', 'copy', 'scroll', 'navigate']: + action_data['type'] = action_type + logger.info(f"Detected action type from AI response: {action_type}") + return action_data + + # Fallback: Look for action keywords in the original user request + if hasattr(self, '_current_action_request'): + request_lower = self._current_action_request.lower() + logger.info(f"Parsing user request: {self._current_action_request}") + + # Check user request directly + if 'copy' in request_lower and 'clipboard' in request_lower: + action_data['type'] = 'copy' + logger.info("Detected COPY action from user request") + elif 'type' in request_lower or 'enter' in request_lower: + action_data['type'] = 'type' + logger.info("Detected TYPE action from user request") + elif 'click' in request_lower: + action_data['type'] = 'click' + logger.info("Detected CLICK action from user request") + + # Final fallback: analyze AI response content + if action_data['type'] == 'unknown': + if 'click' in analysis_lower: + 
action_data['type'] = 'click' + elif 'type' in analysis_lower or 'typewrite' in analysis_lower: + action_data['type'] = 'type' + elif 'copy' in analysis_lower or 'clipboard' in analysis_lower: + action_data['type'] = 'copy' + + logger.info(f"Final parsed action type: {action_data['type']}") + return action_data + + except Exception as e: + logger.error(f"Error parsing action response: {e}") + return {'type': 'error', 'description': str(e)} + + def _execute_confirmed_action(self, action_data): + """Execute the user-confirmed action.""" + try: + logger.info(f"Executing confirmed action: {action_data}") + + action_type = action_data.get('type', 'unknown') + + if action_type == 'click': + result = self._perform_click_action(action_data) + elif action_type == 'type': + result = self._perform_type_action(action_data) + elif action_type == 'copy': + result = self._perform_copy_action(action_data) + else: + result = f"Unknown action type: {action_type}" + + self._present_message(f"Action result: {result}") + + except Exception as e: + logger.error(f"Error executing action: {e}") + self._present_message(f"Error executing action: {e}") + + def _perform_click_action(self, action_data): + """Perform a click action on a UI element.""" + try: + # Try PyAutoGUI for universal clicking + import pyautogui + + # Extract coordinates from action_data or current screen data + coords = action_data.get('coordinates') + if not coords and hasattr(self, '_current_screen_data'): + # Look for actionable elements in the collected data + actionable_elements = self._current_screen_data.get('accessibility', {}).get('actionable_elements', []) + # For now, just click center of screen as fallback + coords = {'center_x': 640, 'center_y': 360} + + if coords: + x, y = coords.get('center_x', 640), coords.get('center_y', 360) + pyautogui.click(x, y) + return f"Clicked at coordinates ({x}, {y})" + else: + return "Could not determine click coordinates" + + except ImportError: + return "PyAutoGUI not 
available - install with: pip install pyautogui" + except Exception as e: + return f"Click failed: {e}" + + def _perform_type_action(self, action_data): + """Perform a text typing action.""" + try: + import pyautogui + + # Extract text to type from the action description + text_to_type = self._extract_text_from_action(action_data) + logger.info(f"Attempting to type: '{text_to_type}'") + + if text_to_type: + import time + + # Simple approach: Just wait a moment for dialogs to settle, then type + # Since PyAutoGUI works fine when terminal is focused, let's not overthink it + logger.info("Waiting briefly for focus to settle before typing") + time.sleep(1.0) # Give time for any dialogs to close and focus to return + + # Disable PyAutoGUI failsafe for this operation + pyautogui.FAILSAFE = False + + logger.info(f"Starting to type '{text_to_type}' to focused application") + + # Type the text with more reasonable timing + pyautogui.typewrite(text_to_type, interval=0.05) + + # Check if we should press Enter + request_lower = getattr(self, '_current_action_request', '').lower() + description_lower = action_data.get('description', '').lower() + + if ('press enter' in request_lower or 'hit enter' in request_lower or + 'and enter' in request_lower or 'press enter' in description_lower): + time.sleep(0.1) + pyautogui.press('return') + logger.info("Pressed Enter after typing") + return f"Typed '{text_to_type}' and pressed Enter" + else: + return f"Typed '{text_to_type}'" + else: + return "Could not determine text to type" + + except ImportError: + return "PyAutoGUI not available - install with: pip install pyautogui" + except Exception as e: + logger.error(f"Type action failed: {e}") + return f"Type action failed: {e}" + + def _perform_copy_action(self, action_data): + """Perform a copy to clipboard action.""" + try: + # Extract the specific text to copy from the user's request + text_to_copy = self._extract_text_to_copy(action_data) + + if text_to_copy: + # Use direct clipboard 
manipulation instead of Ctrl+C + import subprocess + + # Use xclip on Linux (works on both X11 and Wayland via XWayland) + try: + process = subprocess.Popen(['xclip', '-selection', 'clipboard'], + stdin=subprocess.PIPE, + text=True) + process.communicate(input=text_to_copy) + return f"Copied '{text_to_copy}' to clipboard" + except FileNotFoundError: + # Fallback to wl-copy for pure Wayland + try: + process = subprocess.Popen(['wl-copy'], + stdin=subprocess.PIPE, + text=True) + process.communicate(input=text_to_copy) + return f"Copied '{text_to_copy}' to clipboard" + except FileNotFoundError: + return "Neither xclip nor wl-copy available for clipboard operations" + else: + # Fallback: try to copy whatever is currently selected + import pyautogui + pyautogui.hotkey('ctrl', 'c') + return "Copied current selection to clipboard" + + except ImportError: + return "Required clipboard tools not available" + except Exception as e: + return f"Copy action failed: {e}" + + def _extract_text_to_copy(self, action_data): + """Extract the specific text to copy from the user request.""" + try: + if hasattr(self, '_current_action_request'): + request = self._current_action_request + request_lower = request.lower() + + # Special case: summarize and copy requests + if 'summar' in request_lower and ('clipboard' in request_lower or 'copy' in request_lower): + summary = self._generate_screen_summary() + if summary: + logger.info("Generated screen summary for clipboard") + return summary + + # Look for quoted text + import re + quoted_matches = re.findall(r'["\']([^"\']+)["\']', request) + if quoted_matches: + return quoted_matches[0] + + # Look for text after "copy" + copy_matches = re.findall(r'copy\s+(.+?)\s+to\s+clipboard', request, re.IGNORECASE) + if copy_matches: + return copy_matches[0].strip('"\'') + + return None + + except Exception as e: + logger.error(f"Error extracting text to copy: {e}") + return None + + def _generate_screen_summary(self): + """Generate a summary of the 
current screen for clipboard operations.""" + try: + if hasattr(self, '_current_screen_data') and self._current_screen_data: + accessibility_data = self._current_screen_data.get('accessibility', {}) + + # Build a simple summary + summary_parts = [] + + # Application info + app_name = self._current_screen_data.get('application', 'Unknown Application') + summary_parts.append(f"Application: {app_name}") + + # Focus info + focus_info = accessibility_data.get('focus', {}) + if focus_info: + focus_name = focus_info.get('name', '') + focus_role = focus_info.get('role', '') + if focus_name or focus_role: + summary_parts.append(f"Current focus: {focus_name} ({focus_role})") + + # Context info + context = accessibility_data.get('context', []) + if context and len(context) > 0: + parent_info = context[0] + parent_name = parent_info.get('name', '') + parent_role = parent_info.get('role', '') + if parent_name or parent_role: + summary_parts.append(f"In: {parent_name} ({parent_role})") + + # Actionable elements count + actionable_elements = accessibility_data.get('actionable_elements', []) + if actionable_elements: + summary_parts.append(f"Available actions: {len(actionable_elements)} interactive elements") + + if summary_parts: + return '\n'.join(summary_parts) + else: + return f"Screen summary for {app_name} - focused on accessible content" + + return "Unable to generate screen summary - no data available" + + except Exception as e: + logger.error(f"Error generating screen summary: {e}") + return f"Screen summary generation failed: {e}" + + def _extract_text_from_action(self, action_data): + """Extract text to type from action description.""" + try: + # First try the original user request (most reliable) + if hasattr(self, '_current_action_request'): + request = self._current_action_request + logger.info(f"Extracting text from user request: {request}") + + import re + + # Enhanced extraction for commands like "type echo 'hello world'" + if request.lower().startswith('type '): 
+ # Remove "type " from the beginning + text = request[5:].strip() + + # Remove trailing conditions like "and press enter", "in the terminal" + text = re.sub(r'\s+and\s+(press\s+)?enter.*$', '', text, flags=re.IGNORECASE) + text = re.sub(r'\s+in\s+.+$', '', text, flags=re.IGNORECASE) + + logger.info(f"Extracted command after 'type ': '{text}'") + return text + + # Look for quoted text in various formats + quoted_patterns = [ + r'"([^"]*)"', # Double quotes + r"'([^']*)'", # Single quotes + r'`([^`]*)`' # Backticks + ] + + for pattern in quoted_patterns: + matches = re.findall(pattern, request) + if matches: + text = matches[0] + logger.info(f"Found quoted text: '{text}'") + return text + + # Look for text after "type" with flexible matching + type_patterns = [ + r'type\s+(.+?)(?:\s+and\s+|\s+in\s+|\s*$)', # type X and... or type X in... + r'enter\s+(.+?)(?:\s+into\s+|\s*$)', # enter X into... + ] + + for pattern in type_patterns: + matches = re.findall(pattern, request, re.IGNORECASE) + if matches: + text = matches[0].strip() + logger.info(f"Found text with pattern '{pattern}': '{text}'") + return text + + # Fallback: try the AI description for quoted text + description = action_data.get('description', '') + if description: + import re + quoted_matches = re.findall(r'"([^"]*)"', description) + if quoted_matches: + logger.info(f"Found text in AI description: '{quoted_matches[0]}'") + return quoted_matches[0] + + logger.warning("Could not extract text to type") + return None + + except Exception as e: + logger.error(f"Error extracting text from action: {e}") + return None diff --git a/src/cthulhu/settings.py b/src/cthulhu/settings.py index e979097..70a3686 100644 --- a/src/cthulhu/settings.py +++ b/src/cthulhu/settings.py @@ -198,6 +198,7 @@ CHAT_SPEAK_FOCUSED_CHANNEL = 2 # AI Assistant constants AI_PROVIDER_CLAUDE = "claude" +AI_PROVIDER_CLAUDE_CODE = "claude_code" AI_PROVIDER_CHATGPT = "chatgpt" AI_PROVIDER_GEMINI = "gemini" AI_PROVIDER_OLLAMA = "ollama" @@ -435,7 
+436,7 @@ activePlugins = ['AIAssistant', 'DisplayVersion', 'PluginManager', 'HelloCthulhu # AI Assistant settings (disabled by default for opt-in behavior) aiAssistantEnabled = False -aiProvider = AI_PROVIDER_CLAUDE +aiProvider = AI_PROVIDER_CLAUDE_CODE aiApiKeyFile = "" aiOllamaModel = "llama3.2-vision" aiConfirmationRequired = True