diff --git a/src/cthulhu/cthulhuVersion.py b/src/cthulhu/cthulhuVersion.py index 5a95374..44267ae 100644 --- a/src/cthulhu/cthulhuVersion.py +++ b/src/cthulhu/cthulhuVersion.py @@ -23,5 +23,5 @@ # Fork of Orca Screen Reader (GNOME) # Original source: https://gitlab.gnome.org/GNOME/orca -version = "2025.08.11" +version = "2025.08.12" codeName = "testing" diff --git a/src/cthulhu/cthulhu_gui_prefs.py b/src/cthulhu/cthulhu_gui_prefs.py index 927b07e..07cafc9 100644 --- a/src/cthulhu/cthulhu_gui_prefs.py +++ b/src/cthulhu/cthulhu_gui_prefs.py @@ -1868,14 +1868,10 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): # Set provider combo provider = prefs.get("aiProvider", settings.aiProvider) providerIndex = 0 # Default to Claude Code - if provider == settings.AI_PROVIDER_CLAUDE: + if provider == settings.AI_PROVIDER_GEMINI: providerIndex = 1 - elif provider == settings.AI_PROVIDER_CHATGPT: - providerIndex = 2 - elif provider == settings.AI_PROVIDER_GEMINI: - providerIndex = 3 elif provider == settings.AI_PROVIDER_OLLAMA: - providerIndex = 4 + providerIndex = 2 self.aiProviderCombo.set_active(providerIndex) # Set API key file @@ -1926,8 +1922,8 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): def _updateProviderControls(self, provider): """Update visibility/sensitivity of provider-specific controls.""" - # API key controls (needed for Claude, ChatGPT, Gemini - not for Claude Code or Ollama) - api_key_needed = provider in [settings.AI_PROVIDER_CLAUDE, settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI] + # API key controls (only needed for Gemini - not for Claude Code or Ollama) + api_key_needed = provider in [settings.AI_PROVIDER_GEMINI] self.aiApiKeyEntry.set_sensitive(api_key_needed) # Get Claude API Key button (only for Claude Code) @@ -3695,8 +3691,7 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper): def aiProviderChanged(self, widget): """AI Provider combo box changed handler""" - providers = [settings.AI_PROVIDER_CLAUDE_CODE, settings.AI_PROVIDER_CLAUDE, - settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA] + providers = [settings.AI_PROVIDER_CLAUDE_CODE, settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA] activeIndex = widget.get_active() if 0 <= activeIndex < len(providers): provider = providers[activeIndex] diff --git a/src/cthulhu/plugins/AIAssistant/ai_providers.py b/src/cthulhu/plugins/AIAssistant/ai_providers.py index 04149f6..a6af653 100644 --- a/src/cthulhu/plugins/AIAssistant/ai_providers.py +++ b/src/cthulhu/plugins/AIAssistant/ai_providers.py @@ -39,9 +39,21 @@ class AIProvider(ABC): """Suggest actions to accomplish a user's request.""" pass + @abstractmethod + def analyze_images(self, user_question, screenshot_data, accessibility_data): + """Analyze images visible on screen, focusing on visual content rather than UI elements.""" + pass + def _prepare_system_prompt(self, task_type): """Prepare system prompt based on task type.""" - base_prompt = """You are an AI assistant helping a screen reader user navigate and interact with computer applications. You have access to: + if task_type == "image": + base_prompt = """You are an AI assistant helping a screen reader user analyze image files. You have access to a single image file that the user has selected for analysis. + +The user is using the Cthulhu screen reader, so they cannot see images visually. Your task is to provide detailed visual descriptions of the image content to make it accessible. + +""" + else: + base_prompt = """You are an AI assistant helping a screen reader user navigate and interact with computer applications. You have access to: 1. A screenshot of the current screen 2. Detailed accessibility tree information about UI elements @@ -101,119 +113,26 @@ Example for "find edit box and enter text": 🚨 NEVER GIVE PROGRAMMING CODE OR TECHNICAL INSTRUCTIONS""" + elif task_type == "image": + return base_prompt + """Your task: Analyze and describe any images visible on the screen, focusing on visual content rather than UI elements. + +IMPORTANT IMAGE ANALYSIS GUIDELINES: +- Focus ONLY on the visual content of images (photos, diagrams, graphics, artwork) +- IGNORE UI elements like buttons, menus, toolbars, window frames +- Describe what you see IN the images: objects, people, scenes, colors, text within images +- If multiple images are visible, describe each one separately +- For photographs: describe the scene, subjects, lighting, composition +- For diagrams/charts: describe the data, relationships, labels shown +- For screenshots within images: describe the content being shown +- Be detailed about visual elements that a screen reader user cannot access + +If no images are clearly visible on screen, say so clearly. + +Keep descriptions informative and well-structured.""" + return base_prompt -class ClaudeProvider(AIProvider): - """Claude AI provider using Anthropic's API.""" - - def __init__(self, api_key, model="claude-3-5-sonnet-20241022", **kwargs): - super().__init__(api_key, model, **kwargs) - self.base_url = "https://api.anthropic.com/v1/messages" - self.headers = { - "Content-Type": "application/json", - "X-API-Key": self.api_key, - "anthropic-version": "2023-06-01" - } - - def describe_screen(self, screenshot_data, accessibility_data): - """Generate a description using Claude.""" - try: - prompt = self._build_prompt("describe", None, accessibility_data) - return self._make_request(prompt, screenshot_data) - except Exception as e: - logger.error(f"Claude describe error: {e}") - return f"Error getting screen description: {e}" - - def answer_question(self, question, screenshot_data, accessibility_data): - """Answer a question using Claude.""" - try: - prompt = self._build_prompt("question", question, accessibility_data) - return self._make_request(prompt, screenshot_data) - except Exception as e: - logger.error(f"Claude question error: {e}") - return f"Error answering question: {e}" - - def suggest_actions(self, request, screenshot_data, accessibility_data): - """Suggest actions using Claude.""" - try: - prompt = self._build_prompt("action", request, accessibility_data) - return self._make_request(prompt, screenshot_data) - except Exception as e: - logger.error(f"Claude action error: {e}") - return f"Error suggesting actions: {e}" - - def _build_prompt(self, task_type, user_input, accessibility_data): - """Build the complete prompt for Claude.""" - prompt = f"Current accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" - - if task_type == "describe": - prompt += "Please describe what's on this screen." - elif task_type == "question": - prompt += f"User question: {user_input}" - elif task_type == "action": - prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this." - - return prompt - - def _make_request(self, prompt, screenshot_data): - """Make request to Claude API.""" - try: - # Prepare the message content - content = [ - { - "type": "text", - "text": prompt - } - ] - - # Add screenshot if available - if screenshot_data: - content.append({ - "type": "image", - "source": { - "type": "base64", - "media_type": f"image/{screenshot_data['format']}", - "data": screenshot_data['data'] - } - }) - - payload = { - "model": self.model, - "max_tokens": 1000, - "messages": [ - { - "role": "user", - "content": content - } - ], - "system": self._prepare_system_prompt("describe") # Will be made dynamic later - } - - response = requests.post( - self.base_url, - headers=self.headers, - json=payload, - timeout=30 - ) - - if response.status_code == 200: - result = response.json() - return result['content'][0]['text'] - else: - error_msg = f"Claude API error {response.status_code}: {response.text}" - logger.error(error_msg) - return error_msg - - except requests.RequestException as e: - error_msg = f"Network error contacting Claude: {e}" - logger.error(error_msg) - return error_msg - except Exception as e: - error_msg = f"Unexpected error with Claude API: {e}" - logger.error(error_msg) - return error_msg - class ClaudeCodeProvider(AIProvider): """Claude Code CLI provider - uses installed Claude Code application.""" @@ -249,34 +168,98 @@ class ClaudeCodeProvider(AIProvider): logger.error(f"Claude Code action error: {e}") return f"Error suggesting actions: {e}" + def analyze_images(self, user_question, screenshot_data, accessibility_data): + """Analyze images visible on screen using Claude Code CLI.""" + try: + prompt = self._build_prompt("image", user_question, accessibility_data) + + # If we have image data, save it to a temporary file for Claude Code + temp_image_path = None + if screenshot_data: + import tempfile + import base64 + import os + + # Create temporary file with appropriate extension + image_format = screenshot_data.get('format', 'png') + suffix = f".{image_format}" + + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: + # Decode base64 image data and write to temp file + image_data = base64.b64decode(screenshot_data['data']) + temp_file.write(image_data) + temp_image_path = temp_file.name + + try: + # Call Claude Code with the image file path + result = self._call_claude_code(prompt, temp_image_path) + return result + finally: + # Clean up temporary file + if os.path.exists(temp_image_path): + os.unlink(temp_image_path) + else: + # No image data, just call with text prompt + return self._call_claude_code(prompt) + + except Exception as e: + logger.error(f"Claude Code image analysis error: {e}") + return f"Error analyzing images: {e}" + def _build_prompt(self, task_type, user_input, accessibility_data): """Build the complete prompt for Claude Code.""" import json system_prompt = self._get_system_prompt(task_type) - prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" - - if task_type == "describe": - prompt += "Please describe what's on this screen." - elif task_type == "question": - prompt += f"User question: {user_input}" - elif task_type == "action": - prompt += f"User wants to: {user_input}\n\nProvide the action analysis in the required format." + if task_type == "image": + # For image analysis, minimize accessibility data weight + if user_input == "ANALYZE_SINGLE_IMAGE_FILE": + prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details." + else: + prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n" + if user_input: + prompt += f"User question about images: {user_input}\n\n" + prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements." + else: + # Standard prompt with full accessibility data + prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" + + if task_type == "describe": + prompt += "Please describe what's on this screen." + elif task_type == "question": + prompt += f"User question: {user_input}" + elif task_type == "action": + prompt += f"User wants to: {user_input}\n\nProvide the action analysis in the required format." return prompt - def _call_claude_code(self, prompt): - """Call Claude Code CLI with the prompt.""" + def _call_claude_code(self, prompt, image_path=None): + """Call Claude Code CLI with the prompt and optional image.""" import subprocess + import tempfile + import os try: - # Call Claude Code CLI with the prompt directly + # Build the command + cmd = ['claude', '--print', '--output-format', 'text'] + + # For accessibility analysis, skip permission checks to allow automatic access + cmd.append('--dangerously-skip-permissions') + + # If we have an image path, include it in the prompt + if image_path: + prompt = f"Please analyze and describe the image at {image_path}. {prompt}" + + # Add the prompt + cmd.append(prompt) + + # Call Claude Code CLI result = subprocess.run( - ['claude', '--print', '--output-format', 'text', prompt], + cmd, capture_output=True, text=True, - timeout=60 + timeout=120 # Longer timeout for image analysis ) if result.returncode == 0: @@ -297,7 +280,14 @@ class ClaudeCodeProvider(AIProvider): def _get_system_prompt(self, task_type): """Get system prompt for Claude Code.""" - base_prompt = """You are Claude Code helping a screen reader user navigate and interact with computer applications. You have expert understanding of terminal commands, programming, and accessibility. + if task_type == "image": + base_prompt = """You are Claude Code helping a screen reader user analyze image files. You have access to a single image file that the user has selected for analysis. + +The user is using the Cthulhu screen reader and cannot see images visually. Provide detailed visual descriptions of the image content to make it accessible. + +""" + else: + base_prompt = """You are Claude Code helping a screen reader user navigate and interact with computer applications. You have expert understanding of terminal commands, programming, and accessibility. The user is using the Cthulhu screen reader and cannot see the screen visually. Provide expert technical assistance. @@ -323,6 +313,23 @@ MANDATORY FORMAT: elif task_type == "question": return base_prompt + "Answer with expert technical knowledge about programming, terminals, and system operations." + + elif task_type == "image": + return base_prompt + """Your task: Analyze and describe any images visible on the screen, focusing on visual content rather than UI elements. + +IMPORTANT IMAGE ANALYSIS GUIDELINES: +- Focus ONLY on the visual content of images (photos, diagrams, graphics, artwork) +- IGNORE UI elements like buttons, menus, toolbars, window frames +- Describe what you see IN the images: objects, people, scenes, colors, text within images +- If multiple images are visible, describe each one separately +- For photographs: describe the scene, subjects, lighting, composition +- For diagrams/charts: describe the data, relationships, labels shown +- For screenshots within images: describe the content being shown +- Be detailed about visual elements that a screen reader user cannot access + +If no images are clearly visible on screen, say so clearly. + +Keep descriptions informative and well-structured.""" return base_prompt @@ -361,18 +368,38 @@ class OllamaProvider(AIProvider): logger.error(f"Ollama action error: {e}") return f"Error suggesting actions: {e}" + def analyze_images(self, user_question, screenshot_data, accessibility_data): + """Analyze images visible on screen using Ollama.""" + try: + prompt = self._build_prompt("image", user_question, accessibility_data) + return self._make_request(prompt, screenshot_data) + except Exception as e: + logger.error(f"Ollama image analysis error: {e}") + return f"Error analyzing images: {e}" + def _build_prompt(self, task_type, user_input, accessibility_data): """Build the complete prompt for Ollama.""" system_prompt = self._prepare_system_prompt(task_type) - prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" - - if task_type == "describe": - prompt += "Please describe what's on this screen." - elif task_type == "question": - prompt += f"User question: {user_input}" - elif task_type == "action": - prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this." + if task_type == "image": + # For image analysis, minimize accessibility data weight + if user_input == "ANALYZE_SINGLE_IMAGE_FILE": + prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details." + else: + prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n" + if user_input: + prompt += f"User question about images: {user_input}\n\n" + prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements." + else: + # Standard prompt with full accessibility data + prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" + + if task_type == "describe": + prompt += "Please describe what's on this screen." + elif task_type == "question": + prompt += f"User question: {user_input}" + elif task_type == "action": + prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this." return prompt @@ -386,14 +413,17 @@ class OllamaProvider(AIProvider): "stream": False } - # Note: Ollama vision support varies by model - # For now, we'll send text-only requests - # TODO: Add image support when Ollama vision models are more stable + # Add image data if available and model supports vision + if screenshot_data and "vision" in self.model.lower(): + payload["images"] = [screenshot_data['data']] + + # Use longer timeout for vision models as they're much slower + timeout = 180 if screenshot_data and "vision" in self.model.lower() else 60 response = requests.post( f"{self.base_url}/api/generate", json=payload, - timeout=60 # Ollama can be slower + timeout=timeout ) if response.status_code == 200: @@ -414,13 +444,128 @@ class OllamaProvider(AIProvider): return error_msg +class GeminiProvider(AIProvider): + """Google Gemini AI provider using Google's Gemini API.""" + + def __init__(self, api_key, model="gemini-1.5-flash", **kwargs): + super().__init__(api_key, model, **kwargs) + self.base_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model}:generateContent" + + def describe_screen(self, screenshot_data, accessibility_data): + """Generate a description using Gemini.""" + try: + prompt = self._build_prompt("describe", None, accessibility_data) + return self._make_request(prompt, screenshot_data) + except Exception as e: + logger.error(f"Gemini describe error: {e}") + return f"Error getting screen description: {e}" + + def answer_question(self, question, screenshot_data, accessibility_data): + """Answer a question using Gemini.""" + try: + prompt = self._build_prompt("question", question, accessibility_data) + return self._make_request(prompt, screenshot_data) + except Exception as e: + logger.error(f"Gemini question error: {e}") + return f"Error answering question: {e}" + + def suggest_actions(self, request, screenshot_data, accessibility_data): + """Suggest actions using Gemini.""" + try: + prompt = self._build_prompt("action", request, accessibility_data) + return self._make_request(prompt, screenshot_data) + except Exception as e: + logger.error(f"Gemini action error: {e}") + return f"Error suggesting actions: {e}" + + def analyze_images(self, user_question, screenshot_data, accessibility_data): + """Analyze images visible on screen using Gemini.""" + try: + prompt = self._build_prompt("image", user_question, accessibility_data) + return self._make_request(prompt, screenshot_data) + except Exception as e: + logger.error(f"Gemini image analysis error: {e}") + return f"Error analyzing images: {e}" + + def _build_prompt(self, task_type, user_input, accessibility_data): + """Build the complete prompt for Gemini.""" + system_prompt = self._prepare_system_prompt(task_type) + + if task_type == "image": + if user_input == "ANALYZE_SINGLE_IMAGE_FILE": + prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details." + else: + prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n" + if user_input: + prompt += f"User question about images: {user_input}\n\n" + prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements." + else: + prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n" + + if task_type == "describe": + prompt += "Please describe what's on this screen." + elif task_type == "question": + prompt += f"User question: {user_input}" + elif task_type == "action": + prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this." + + return prompt + + def _make_request(self, prompt, screenshot_data): + """Make request to Gemini API.""" + try: + parts = [{"text": prompt}] + + # Add image if available + if screenshot_data: + parts.append({ + "inline_data": { + "mime_type": f"image/{screenshot_data['format']}", + "data": screenshot_data['data'] + } + }) + + payload = { + "contents": [{ + "parts": parts + }] + } + + response = requests.post( + f"{self.base_url}?key={self.api_key}", + headers={"Content-Type": "application/json"}, + json=payload, + timeout=30 + ) + + if response.status_code == 200: + result = response.json() + if 'candidates' in result and len(result['candidates']) > 0: + return result['candidates'][0]['content']['parts'][0]['text'] + else: + return "No response from Gemini" + else: + error_msg = f"Gemini API error {response.status_code}: {response.text}" + logger.error(error_msg) + return error_msg + + except requests.RequestException as e: + error_msg = f"Network error contacting Gemini: {e}" + logger.error(error_msg) + return error_msg + except Exception as e: + error_msg = f"Unexpected error with Gemini API: {e}" + logger.error(error_msg) + return error_msg + + def create_provider(provider_type, **kwargs): """Factory function to create AI providers.""" - if provider_type == "claude": - return ClaudeProvider(**kwargs) - elif provider_type == "claude_code": + if provider_type == "claude_code": return ClaudeCodeProvider(**kwargs) elif provider_type == "ollama": return OllamaProvider(**kwargs) + elif provider_type == "gemini": + return GeminiProvider(**kwargs) else: raise ValueError(f"Unknown provider type: {provider_type}") \ No newline at end of file diff --git a/src/cthulhu/plugins/AIAssistant/plugin.py b/src/cthulhu/plugins/AIAssistant/plugin.py index d6e046f..4f7ba9b 100644 --- a/src/cthulhu/plugins/AIAssistant/plugin.py +++ b/src/cthulhu/plugins/AIAssistant/plugin.py @@ -103,16 +103,17 @@ class AIAssistant(Plugin): config_valid = self._validate_configuration() logger.info(f"AI Assistant configuration valid: {config_valid}") print(f"DEBUG: AI Assistant configuration valid: {config_valid}") - if not config_valid: - logger.warning("AI Assistant configuration invalid, skipping activation") - print("DEBUG: AI Assistant configuration invalid, skipping activation") - return + + # Initialize AI provider (may fail but we still want menu access) + if config_valid: + provider_init = self._initialize_ai_provider() + print(f"DEBUG: AI provider initialization: {provider_init}") + else: + logger.warning("AI Assistant configuration invalid, menu will show error messages") + print("DEBUG: AI Assistant configuration invalid, menu will show error messages") + provider_init = False - # Initialize AI provider - self._initialize_ai_provider() - print("DEBUG: AI provider initialized") - - # Register keybindings only if configuration is valid + # Always register keybindings so menu is accessible even with config issues self._register_keybindings() print("DEBUG: AI keybindings registered") @@ -139,12 +140,48 @@ class AIAssistant(Plugin): self._enabled = False + def refresh_settings(self): + """Refresh plugin settings and reinitialize provider. Called when settings change.""" + try: + logger.info("AI Assistant: Refreshing settings") + print("DEBUG: AI Assistant refreshing settings") + + # Reload settings + self._load_ai_settings() + + # Validate new configuration + config_valid = self._validate_configuration() + print(f"DEBUG: New configuration valid: {config_valid}") + + # Reinitialize provider if configuration is valid + if config_valid: + old_provider = self._ai_provider + provider_init = self._initialize_ai_provider() + print(f"DEBUG: Provider reinitialization: {provider_init}") + if provider_init: + logger.info(f"AI Assistant provider changed to: {self._provider_type}") + print(f"DEBUG: Provider successfully changed to: {self._provider_type}") + else: + logger.warning("Failed to initialize new provider") + print("DEBUG: Failed to initialize new provider") + self._ai_provider = None + else: + logger.warning("New configuration invalid, clearing provider") + print("DEBUG: New configuration invalid, clearing provider") + self._ai_provider = None + + except Exception as e: + logger.error(f"Error refreshing AI Assistant settings: {e}") + print(f"DEBUG: Error refreshing settings: {e}") + def _load_ai_settings(self): """Load AI Assistant settings from Cthulhu configuration.""" try: # Get provider provider = self._settings_manager.getSetting('aiProvider') - self._provider_type = provider or settings.AI_PROVIDER_CLAUDE + print(f"DEBUG: Raw provider setting: '{provider}'") + self._provider_type = provider or settings.AI_PROVIDER_CLAUDE_CODE + print(f"DEBUG: Final provider type: '{self._provider_type}'") # Load API key from file api_key_file = self._settings_manager.getSetting('aiApiKeyFile') @@ -178,7 +215,7 @@ class AIAssistant(Plugin): logger.warning("No AI provider configured") return False - # Providers that don't need API keys + # Check provider-specific requirements if self._provider_type == settings.AI_PROVIDER_OLLAMA: logger.info("Checking Ollama availability") return self._check_ollama_availability() @@ -187,15 +224,16 @@ class AIAssistant(Plugin): result = self._check_claude_code_availability() logger.info(f"Claude Code availability check result: {result}") return result - - # Other providers need API keys - logger.info(f"Checking API key for provider {self._provider_type}") - if not self._api_key: - logger.warning(f"No API key configured for provider {self._provider_type}") + elif self._provider_type == settings.AI_PROVIDER_GEMINI: + logger.info("Checking Gemini API key") + if not self._api_key: + logger.warning("No API key configured for Gemini") + return False + logger.info("Gemini configuration validated") + return True + else: + logger.warning(f"Unknown provider type: {self._provider_type}") return False - - logger.info("Configuration validation passed") - return True def _check_ollama_availability(self): """Check if Ollama is available and has vision models.""" @@ -244,12 +282,12 @@ class AIAssistant(Plugin): def _initialize_ai_provider(self): """Initialize the AI provider based on settings.""" try: - if self._provider_type == settings.AI_PROVIDER_CLAUDE: - self._ai_provider = create_provider("claude", api_key=self._api_key) - elif self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE: + if self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE: self._ai_provider = create_provider("claude_code") elif self._provider_type == settings.AI_PROVIDER_OLLAMA: self._ai_provider = create_provider("ollama", model=self._ollama_model, base_url=self._ollama_endpoint) + elif self._provider_type == settings.AI_PROVIDER_GEMINI: + self._ai_provider = create_provider("gemini", api_key=self._api_key) else: logger.error(f"Unsupported provider type: {self._provider_type}") return False @@ -318,12 +356,100 @@ class AIAssistant(Plugin): self._handle_ai_describe_with_data(self._pre_menu_screen_data) elif action_id == "request_action": self._handle_ai_activate_with_data(self._pre_menu_screen_data) + elif action_id == "analyze_images": + self._handle_ai_image_analysis_with_data(self._pre_menu_screen_data) + elif action_id == "browse_image_file": + self._handle_browse_image_file() else: logger.warning(f"Unknown AI menu action: {action_id}") except Exception as e: logger.error(f"Error handling menu selection {action_id}: {e}") + def _handle_ai_image_analysis_with_data(self, data): + """Handle AI image analysis request with pre-captured data.""" + try: + logger.info("AI image analysis requested with pre-captured data") + + if not self._enabled: + self._present_message("AI Assistant is not enabled") + return True + + if not self._ai_provider: + self._present_message("AI provider not available. Check configuration.") + return True + + if not data: + self._present_message("No screen data available for image analysis") + return True + + provider_name = self._provider_type.replace('_', ' ').title() + self._present_message(f"AI Assistant ({provider_name}) analyzing images...") + + # Use AI to analyze images on screen + try: + response = self._ai_provider.analyze_images(None, + data.get("screenshot"), + data.get("accessibility")) + self._show_description_dialog(response) + except Exception as e: + logger.error(f"Error getting AI image analysis: {e}") + self._present_message(f"Error getting AI image analysis: {e}") + + return True + + except Exception as e: + logger.error(f"Error in image analysis with data: {e}") + self._present_message(f"Error analyzing images: {e}") + return False + + def _handle_browse_image_file(self): + """Handle browsing for an image file to analyze.""" + try: + logger.info("AI image file browsing requested") + print("DEBUG: _handle_browse_image_file called") + + if not self._enabled: + print("DEBUG: AI Assistant not enabled") + self._present_message("AI Assistant is not enabled") + return True + + if not self._ai_provider: + print("DEBUG: AI provider not available") + self._present_message("AI provider not available. Check configuration.") + return True + + # Show file chooser dialog + print("DEBUG: About to show file chooser") + image_file = self._show_image_file_chooser() + print(f"DEBUG: File chooser returned: {image_file}") + + if image_file: + provider_name = self._provider_type.replace('_', ' ').title() + self._present_message(f"AI Assistant ({provider_name}) analyzing selected image...") + + # Load and analyze the image file + try: + image_data = self._load_image_file(image_file) + if image_data: + # For file browsing, use a special prompt to avoid confusion + response = self._ai_provider.analyze_images("ANALYZE_SINGLE_IMAGE_FILE", image_data, {}) + self._show_description_dialog(response) + else: + self._present_message("Could not load the selected image file") + except Exception as e: + logger.error(f"Error analyzing image file: {e}") + self._present_message(f"Error analyzing image file: {e}") + else: + self._present_message("No image file selected") + + return True + + except Exception as e: + logger.error(f"Error in browse image file: {e}") + self._present_message(f"Error browsing for image file: {e}") + return False + def _handle_ai_describe_with_data(self, data): """Handle AI screen description request with pre-captured data.""" try: @@ -1126,7 +1252,7 @@ class AIAssistant(Plugin): # ============================================================================ def _show_action_dialog(self): - """Show dialog for entering action requests.""" + """Show dialog for entering action requests or analyzing images.""" try: # Create dialog without parent first dialog = Gtk.Dialog( @@ -1143,18 +1269,33 @@ class AIAssistant(Plugin): dialog.set_modal(True) dialog.set_type_hint(Gdk.WindowTypeHint.DIALOG) - dialog.set_default_size(600, 300) + dialog.set_default_size(600, 350) content_area = dialog.get_content_area() + # Mode selection radio buttons + mode_label = Gtk.Label() + mode_label.set_markup("Choose analysis mode:") + mode_label.set_halign(Gtk.Align.START) + content_area.pack_start(mode_label, False, False, 10) + + # Action mode radio button (default) + action_radio = Gtk.RadioButton(label="Request an action") + action_radio.set_active(True) + content_area.pack_start(action_radio, False, False, 5) + + # Image analysis mode radio button + image_radio = Gtk.RadioButton.new_with_label_from_widget(action_radio, "Analyze image content") + content_area.pack_start(image_radio, False, False, 5) + # Instruction label - label = Gtk.Label() - label.set_markup("Tell the AI what you want to do:\n" + + instruction_label = Gtk.Label() + instruction_label.set_markup("Tell the AI what you want to do:\n" + "Examples: 'Click the Continue button', 'Enter storm into username field', " + "'Copy this text to clipboard'") - label.set_line_wrap(True) - label.set_halign(Gtk.Align.START) - content_area.pack_start(label, False, False, 10) + instruction_label.set_line_wrap(True) + instruction_label.set_halign(Gtk.Align.START) + content_area.pack_start(instruction_label, False, False, 10) # Action entry entry = Gtk.Entry() @@ -1162,6 +1303,24 @@ class AIAssistant(Plugin): entry.set_activates_default(True) content_area.pack_start(entry, False, False, 10) + # Update UI based on radio button selection + def on_radio_toggled(radio_button): + if action_radio.get_active(): + instruction_label.set_markup("Tell the AI what you want to do:\n" + + "Examples: 'Click the Continue button', 'Enter storm into username field', " + + "'Copy this text to clipboard'") + entry.set_placeholder_text("What would you like me to do?") + entry.set_sensitive(True) + else: # image mode + instruction_label.set_markup("Image Analysis Mode:\n" + + "The AI will focus on describing visual content in images on the screen, " + + "ignoring UI elements. Leave text field empty or add specific questions.") + entry.set_placeholder_text("Optional: Ask specific questions about images (leave empty for general description)") + entry.set_sensitive(True) + + action_radio.connect("toggled", on_radio_toggled) + image_radio.connect("toggled", on_radio_toggled) + dialog.set_default_response(Gtk.ResponseType.OK) dialog.show_all() @@ -1170,14 +1329,20 @@ class AIAssistant(Plugin): response = dialog.run() if response == Gtk.ResponseType.OK: - action_request = entry.get_text().strip() - if action_request: - # Close this dialog and create a new confirmation dialog + if image_radio.get_active(): + # Image analysis mode + user_question = entry.get_text().strip() dialog.destroy() - self._show_action_confirmation_dialog(action_request) + self._handle_image_analysis(user_question) else: - dialog.destroy() - self._present_message("No action request entered") + # Action request mode + action_request = entry.get_text().strip() + if action_request: + dialog.destroy() + self._show_action_confirmation_dialog(action_request) + else: + dialog.destroy() + self._present_message("No action request entered") else: dialog.destroy() self._present_message("Action request cancelled") @@ -1186,6 +1351,41 @@ class AIAssistant(Plugin): logger.error(f"Error showing action dialog: {e}") self._present_message(f"Error showing action dialog: {e}") + def _handle_image_analysis(self, user_question=None): + """Handle image analysis request.""" + try: + logger.info(f"Image analysis requested with question: '{user_question}'") + + self._present_message("AI Assistant analyzing images on screen...") + + # Use existing screen data or collect fresh data + screen_data = self._current_screen_data + if not screen_data: + screen_data = self._collect_ai_data() + + if not screen_data: + self._present_message("Could not collect screen data for image analysis") + return + + # Get image analysis from AI provider + if user_question: + # User has a specific question about images + result = self._ai_provider.analyze_images(user_question, + screen_data.get("screenshot"), + screen_data.get("accessibility")) + else: + # General image description + result = self._ai_provider.analyze_images(None, + screen_data.get("screenshot"), + screen_data.get("accessibility")) + + # Present the result + self._present_message(f"Image analysis complete: {result}") + + except Exception as e: + logger.error(f"Error in image analysis: {e}") + self._present_message(f"Error analyzing images: {e}") + def _show_action_confirmation_dialog(self, action_request): """Show a fresh confirmation dialog for the action.""" try: @@ -1740,6 +1940,96 @@ class AIAssistant(Plugin): except Exception as e: logger.error(f"Error extracting text from action: {e}") return None + + def _show_image_file_chooser(self): + """Show a file chooser dialog for selecting an image file.""" + try: + dialog = Gtk.FileChooserDialog( + title="Select Image File", + parent=None, + action=Gtk.FileChooserAction.OPEN + ) + + # Add buttons + dialog.add_button("Cancel", Gtk.ResponseType.CANCEL) + dialog.add_button("Open", Gtk.ResponseType.OK) + + # Set up image file filters + filter_images = Gtk.FileFilter() + filter_images.set_name("Image files") + filter_images.add_mime_type("image/png") + filter_images.add_mime_type("image/jpeg") + filter_images.add_mime_type("image/jpg") + filter_images.add_mime_type("image/gif") + filter_images.add_mime_type("image/bmp") + filter_images.add_mime_type("image/webp") + filter_images.add_pattern("*.png") + filter_images.add_pattern("*.jpg") + filter_images.add_pattern("*.jpeg") + filter_images.add_pattern("*.gif") + filter_images.add_pattern("*.bmp") + filter_images.add_pattern("*.webp") + dialog.add_filter(filter_images) + + # Add "All files" filter as backup + filter_all = Gtk.FileFilter() + filter_all.set_name("All files") + filter_all.add_pattern("*") + dialog.add_filter(filter_all) + + # Run the dialog + response = dialog.run() + + filename = None + if response == Gtk.ResponseType.OK: + filename = dialog.get_filename() + logger.info(f"Selected image file: {filename}") + + dialog.destroy() + return filename + + except Exception as e: + logger.error(f"Error showing image file chooser: {e}") + return None + + def _load_image_file(self, image_path): + """Load an image file and convert it to base64 for AI analysis.""" + try: + import base64 + import os + + # Read the image file + with open(image_path, 'rb') as image_file: + image_data = image_file.read() + + # Convert to base64 + base64_data = base64.b64encode(image_data).decode('utf-8') + + # Determine format from file extension + _, ext = os.path.splitext(image_path.lower()) + format_map = { + '.png': 'png', + '.jpg': 'jpeg', + '.jpeg': 'jpeg', + '.gif': 'gif', + '.bmp': 'bmp', + '.webp': 'webp' + } + image_format = format_map.get(ext, 'png') # Default to png + + logger.info(f"Loaded image file: {image_path} ({len(image_data)} bytes, format: {image_format})") + + # Return same structure as _capture_screenshot + return { + 'format': image_format, + 'width': 0, # We don't know dimensions, but AI doesn't need them + 'height': 0, + 'data': base64_data + } + + except Exception as e: + logger.error(f"Error loading image file {image_path}: {e}") + return None class AIAssistantMenu(Gtk.Dialog): @@ -1772,11 +2062,15 @@ class AIAssistantMenu(Gtk.Dialog): self.radio_ask = Gtk.RadioButton.new_with_label(None, "Ask Question") self.radio_describe = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Describe Screen") self.radio_action = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Request Action") + self.radio_image = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Analyze Images") + self.radio_browse = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Browse for Image File") # Pack radio buttons content_area.pack_start(self.radio_ask, False, False, 5) content_area.pack_start(self.radio_describe, False, False, 5) content_area.pack_start(self.radio_action, False, False, 5) + content_area.pack_start(self.radio_image, False, False, 5) + content_area.pack_start(self.radio_browse, False, False, 5) # Set first option as selected by default self.radio_ask.set_active(True) @@ -1798,6 +2092,10 @@ class AIAssistantMenu(Gtk.Dialog): action_id = "describe_screen" elif self.radio_action.get_active(): action_id = "request_action" + elif self.radio_image.get_active(): + action_id = "analyze_images" + elif self.radio_browse.get_active(): + action_id = "browse_image_file" else: action_id = None diff --git a/src/cthulhu/settings.py b/src/cthulhu/settings.py index bfac14d..1a8ea72 100644 --- a/src/cthulhu/settings.py +++ b/src/cthulhu/settings.py @@ -197,10 +197,8 @@ CHAT_SPEAK_ALL = 0 CHAT_SPEAK_ALL_IF_FOCUSED = 1 CHAT_SPEAK_FOCUSED_CHANNEL = 2 -# AI Assistant constants -AI_PROVIDER_CLAUDE = "claude" +# AI Assistant constants - simplified to providers that don't need complex API key management AI_PROVIDER_CLAUDE_CODE = "claude_code" -AI_PROVIDER_CHATGPT = "chatgpt" AI_PROVIDER_GEMINI = "gemini" AI_PROVIDER_OLLAMA = "ollama"