Two new options have been added to the AI assistant. The first, "describe images", *should* describe any images visible on the screen. The second, "browse for image", provides a description of a single image file that the user selects.

This commit is contained in:
Storm Dragon
2025-08-12 15:34:35 -04:00
parent 15bcc0589a
commit fb8c64a406
5 changed files with 625 additions and 189 deletions
+1 -1
View File
@@ -23,5 +23,5 @@
# Fork of Orca Screen Reader (GNOME)
# Original source: https://gitlab.gnome.org/GNOME/orca
version = "2025.08.11"
version = "2025.08.12"
codeName = "testing"
+5 -10
View File
@@ -1868,14 +1868,10 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
# Set provider combo
provider = prefs.get("aiProvider", settings.aiProvider)
providerIndex = 0 # Default to Claude Code
if provider == settings.AI_PROVIDER_CLAUDE:
if provider == settings.AI_PROVIDER_GEMINI:
providerIndex = 1
elif provider == settings.AI_PROVIDER_CHATGPT:
providerIndex = 2
elif provider == settings.AI_PROVIDER_GEMINI:
providerIndex = 3
elif provider == settings.AI_PROVIDER_OLLAMA:
providerIndex = 4
providerIndex = 2
self.aiProviderCombo.set_active(providerIndex)
# Set API key file
@@ -1926,8 +1922,8 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
def _updateProviderControls(self, provider):
"""Update visibility/sensitivity of provider-specific controls."""
# API key controls (needed for Claude, ChatGPT, Gemini - not for Claude Code or Ollama)
api_key_needed = provider in [settings.AI_PROVIDER_CLAUDE, settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI]
# API key controls (only needed for Gemini - not for Claude Code or Ollama)
api_key_needed = provider in [settings.AI_PROVIDER_GEMINI]
self.aiApiKeyEntry.set_sensitive(api_key_needed)
# Get Claude API Key button (only for Claude Code)
@@ -3695,8 +3691,7 @@ class CthulhuSetupGUI(cthulhu_gtkbuilder.GtkBuilderWrapper):
def aiProviderChanged(self, widget):
"""AI Provider combo box changed handler"""
providers = [settings.AI_PROVIDER_CLAUDE_CODE, settings.AI_PROVIDER_CLAUDE,
settings.AI_PROVIDER_CHATGPT, settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA]
providers = [settings.AI_PROVIDER_CLAUDE_CODE, settings.AI_PROVIDER_GEMINI, settings.AI_PROVIDER_OLLAMA]
activeIndex = widget.get_active()
if 0 <= activeIndex < len(providers):
provider = providers[activeIndex]
+285 -140
View File
@@ -39,9 +39,21 @@ class AIProvider(ABC):
"""Suggest actions to accomplish a user's request."""
pass
@abstractmethod
def analyze_images(self, user_question, screenshot_data, accessibility_data):
    """Analyze images visible on screen, focusing on visual content rather than UI elements.

    Args:
        user_question: Optional question about the images. Callers in this
            file pass None for a general description, or the sentinel
            string "ANALYZE_SINGLE_IMAGE_FILE" when a single image file
            (rather than a screenshot) is being described.
        screenshot_data: Mapping holding the image; implementations in this
            file read its 'format' entry and its base64-encoded 'data'
            entry. Implementations treat a falsy value as "no image".
        accessibility_data: Accessibility information for the current
            screen (an empty dict is passed for single-file analysis).

    Returns:
        The provider's textual description. Implementations in this file
        log failures and return an error-message string instead of raising.
    """
    pass
def _prepare_system_prompt(self, task_type):
"""Prepare system prompt based on task type."""
base_prompt = """You are an AI assistant helping a screen reader user navigate and interact with computer applications. You have access to:
if task_type == "image":
base_prompt = """You are an AI assistant helping a screen reader user analyze image files. You have access to a single image file that the user has selected for analysis.
The user is using the Cthulhu screen reader, so they cannot see images visually. Your task is to provide detailed visual descriptions of the image content to make it accessible.
"""
else:
base_prompt = """You are an AI assistant helping a screen reader user navigate and interact with computer applications. You have access to:
1. A screenshot of the current screen
2. Detailed accessibility tree information about UI elements
@@ -101,119 +113,26 @@ Example for "find edit box and enter text":
🚨 NEVER GIVE PROGRAMMING CODE OR TECHNICAL INSTRUCTIONS"""
elif task_type == "image":
return base_prompt + """Your task: Analyze and describe any images visible on the screen, focusing on visual content rather than UI elements.
IMPORTANT IMAGE ANALYSIS GUIDELINES:
- Focus ONLY on the visual content of images (photos, diagrams, graphics, artwork)
- IGNORE UI elements like buttons, menus, toolbars, window frames
- Describe what you see IN the images: objects, people, scenes, colors, text within images
- If multiple images are visible, describe each one separately
- For photographs: describe the scene, subjects, lighting, composition
- For diagrams/charts: describe the data, relationships, labels shown
- For screenshots within images: describe the content being shown
- Be detailed about visual elements that a screen reader user cannot access
If no images are clearly visible on screen, say so clearly.
Keep descriptions informative and well-structured."""
return base_prompt
class ClaudeProvider(AIProvider):
    """Claude AI provider using Anthropic's API.

    Each public method builds a text prompt, optionally attaches the
    screenshot, and returns the model's reply as a string. Failures are
    logged and returned as an error-message string rather than raised.
    """

    def __init__(self, api_key, model="claude-3-5-sonnet-20241022", **kwargs):
        super().__init__(api_key, model, **kwargs)
        self.base_url = "https://api.anthropic.com/v1/messages"
        self.headers = {
            "Content-Type": "application/json",
            "X-API-Key": self.api_key,
            "anthropic-version": "2023-06-01"
        }

    def describe_screen(self, screenshot_data, accessibility_data):
        """Generate a description using Claude."""
        try:
            prompt = self._build_prompt("describe", None, accessibility_data)
            return self._make_request(prompt, screenshot_data, "describe")
        except Exception as e:
            logger.error(f"Claude describe error: {e}")
            return f"Error getting screen description: {e}"

    def answer_question(self, question, screenshot_data, accessibility_data):
        """Answer a question using Claude."""
        try:
            prompt = self._build_prompt("question", question, accessibility_data)
            return self._make_request(prompt, screenshot_data, "question")
        except Exception as e:
            logger.error(f"Claude question error: {e}")
            return f"Error answering question: {e}"

    def suggest_actions(self, request, screenshot_data, accessibility_data):
        """Suggest actions using Claude."""
        try:
            prompt = self._build_prompt("action", request, accessibility_data)
            return self._make_request(prompt, screenshot_data, "action")
        except Exception as e:
            logger.error(f"Claude action error: {e}")
            return f"Error suggesting actions: {e}"

    def analyze_images(self, user_question, screenshot_data, accessibility_data):
        """Analyze images visible on screen using Claude.

        Implements the abstract ``AIProvider.analyze_images`` so this
        provider remains instantiable.
        """
        try:
            prompt = self._build_prompt("image", user_question, accessibility_data)
            return self._make_request(prompt, screenshot_data, "image")
        except Exception as e:
            logger.error(f"Claude image analysis error: {e}")
            return f"Error analyzing images: {e}"

    def _build_prompt(self, task_type, user_input, accessibility_data):
        """Build the user-message text for Claude.

        The system prompt is not included here; it is sent separately in
        the request's "system" field by ``_make_request``.
        """
        if task_type == "image":
            # Image tasks omit the accessibility dump so the model focuses
            # on visual content.
            if user_input == "ANALYZE_SINGLE_IMAGE_FILE":
                return ("Analyze and describe the single image file provided. "
                        "Focus on visual content only - describe what you see in the image: "
                        "objects, people, scenery, colors, text, composition, and any other visual details.")
            prompt = "Current screen context (focus on images):\n"
            if user_input:
                prompt += f"User question about images: {user_input}\n\n"
            prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements."
            return prompt

        prompt = f"Current accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
        if task_type == "describe":
            prompt += "Please describe what's on this screen."
        elif task_type == "question":
            prompt += f"User question: {user_input}"
        elif task_type == "action":
            prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this."
        return prompt

    def _make_request(self, prompt, screenshot_data, task_type="describe"):
        """Make request to Claude API.

        Args:
            prompt: Text of the user message.
            screenshot_data: Optional mapping with 'format' and base64
                'data'; attached as an image block when truthy.
            task_type: Selects the system prompt. Defaults to "describe",
                which is what every request used before this parameter
                existed, so two-argument callers behave as before.

        Returns:
            The reply text, or an error-message string on failure.
        """
        try:
            # Prepare the message content
            content = [
                {
                    "type": "text",
                    "text": prompt
                }
            ]
            # Add screenshot if available
            if screenshot_data:
                content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": f"image/{screenshot_data['format']}",
                        "data": screenshot_data['data']
                    }
                })
            payload = {
                "model": self.model,
                "max_tokens": 1000,
                "messages": [
                    {
                        "role": "user",
                        "content": content
                    }
                ],
                # Use the system prompt matching the task instead of always
                # sending the "describe" prompt.
                "system": self._prepare_system_prompt(task_type)
            }
            response = requests.post(
                self.base_url,
                headers=self.headers,
                json=payload,
                timeout=30
            )
            if response.status_code == 200:
                result = response.json()
                return result['content'][0]['text']
            else:
                error_msg = f"Claude API error {response.status_code}: {response.text}"
                logger.error(error_msg)
                return error_msg
        except requests.RequestException as e:
            error_msg = f"Network error contacting Claude: {e}"
            logger.error(error_msg)
            return error_msg
        except Exception as e:
            error_msg = f"Unexpected error with Claude API: {e}"
            logger.error(error_msg)
            return error_msg
class ClaudeCodeProvider(AIProvider):
"""Claude Code CLI provider - uses installed Claude Code application."""
@@ -249,34 +168,98 @@ class ClaudeCodeProvider(AIProvider):
logger.error(f"Claude Code action error: {e}")
return f"Error suggesting actions: {e}"
def analyze_images(self, user_question, screenshot_data, accessibility_data):
    """Analyze images visible on screen using Claude Code CLI.

    When ``screenshot_data`` is provided, its base64 'data' is written to a
    temporary file whose path is handed to ``_call_claude_code``; the file
    is removed afterwards whether or not the call succeeds.

    Args:
        user_question: Optional question, forwarded to ``_build_prompt``.
        screenshot_data: Optional mapping with base64 'data' and an
            optional 'format' (defaults to 'png') used as file extension.
        accessibility_data: Forwarded to ``_build_prompt``.

    Returns:
        The CLI's reply, or an error-message string on failure.
    """
    import base64
    import os
    import tempfile

    try:
        prompt = self._build_prompt("image", user_question, accessibility_data)

        if not screenshot_data:
            # No image data, just call with text prompt
            return self._call_claude_code(prompt)

        # Decode before creating the file so malformed base64 cannot leave
        # an orphaned temp file behind.
        image_data = base64.b64decode(screenshot_data['data'])
        image_format = screenshot_data.get('format', 'png')

        temp_image_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{image_format}") as temp_file:
                temp_image_path = temp_file.name
                temp_file.write(image_data)
            # The file is closed (and therefore flushed) before the CLI reads it.
            return self._call_claude_code(prompt, temp_image_path)
        finally:
            # Clean up temporary file, including when the write itself failed
            if temp_image_path is not None:
                try:
                    os.unlink(temp_image_path)
                except FileNotFoundError:
                    pass
    except Exception as e:
        logger.error(f"Claude Code image analysis error: {e}")
        return f"Error analyzing images: {e}"
def _build_prompt(self, task_type, user_input, accessibility_data):
"""Build the complete prompt for Claude Code."""
import json
system_prompt = self._get_system_prompt(task_type)
prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
if task_type == "describe":
prompt += "Please describe what's on this screen."
elif task_type == "question":
prompt += f"User question: {user_input}"
elif task_type == "action":
prompt += f"User wants to: {user_input}\n\nProvide the action analysis in the required format."
if task_type == "image":
# For image analysis, minimize accessibility data weight
if user_input == "ANALYZE_SINGLE_IMAGE_FILE":
prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details."
else:
prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n"
if user_input:
prompt += f"User question about images: {user_input}\n\n"
prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements."
else:
# Standard prompt with full accessibility data
prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
if task_type == "describe":
prompt += "Please describe what's on this screen."
elif task_type == "question":
prompt += f"User question: {user_input}"
elif task_type == "action":
prompt += f"User wants to: {user_input}\n\nProvide the action analysis in the required format."
return prompt
def _call_claude_code(self, prompt):
"""Call Claude Code CLI with the prompt."""
def _call_claude_code(self, prompt, image_path=None):
"""Call Claude Code CLI with the prompt and optional image."""
import subprocess
import tempfile
import os
try:
# Call Claude Code CLI with the prompt directly
# Build the command
cmd = ['claude', '--print', '--output-format', 'text']
# For accessibility analysis, skip permission checks to allow automatic access
cmd.append('--dangerously-skip-permissions')
# If we have an image path, include it in the prompt
if image_path:
prompt = f"Please analyze and describe the image at {image_path}. {prompt}"
# Add the prompt
cmd.append(prompt)
# Call Claude Code CLI
result = subprocess.run(
['claude', '--print', '--output-format', 'text', prompt],
cmd,
capture_output=True,
text=True,
timeout=60
timeout=120 # Longer timeout for image analysis
)
if result.returncode == 0:
@@ -297,7 +280,14 @@ class ClaudeCodeProvider(AIProvider):
def _get_system_prompt(self, task_type):
"""Get system prompt for Claude Code."""
base_prompt = """You are Claude Code helping a screen reader user navigate and interact with computer applications. You have expert understanding of terminal commands, programming, and accessibility.
if task_type == "image":
base_prompt = """You are Claude Code helping a screen reader user analyze image files. You have access to a single image file that the user has selected for analysis.
The user is using the Cthulhu screen reader and cannot see images visually. Provide detailed visual descriptions of the image content to make it accessible.
"""
else:
base_prompt = """You are Claude Code helping a screen reader user navigate and interact with computer applications. You have expert understanding of terminal commands, programming, and accessibility.
The user is using the Cthulhu screen reader and cannot see the screen visually. Provide expert technical assistance.
@@ -323,6 +313,23 @@ MANDATORY FORMAT:
elif task_type == "question":
return base_prompt + "Answer with expert technical knowledge about programming, terminals, and system operations."
elif task_type == "image":
return base_prompt + """Your task: Analyze and describe any images visible on the screen, focusing on visual content rather than UI elements.
IMPORTANT IMAGE ANALYSIS GUIDELINES:
- Focus ONLY on the visual content of images (photos, diagrams, graphics, artwork)
- IGNORE UI elements like buttons, menus, toolbars, window frames
- Describe what you see IN the images: objects, people, scenes, colors, text within images
- If multiple images are visible, describe each one separately
- For photographs: describe the scene, subjects, lighting, composition
- For diagrams/charts: describe the data, relationships, labels shown
- For screenshots within images: describe the content being shown
- Be detailed about visual elements that a screen reader user cannot access
If no images are clearly visible on screen, say so clearly.
Keep descriptions informative and well-structured."""
return base_prompt
@@ -361,18 +368,38 @@ class OllamaProvider(AIProvider):
logger.error(f"Ollama action error: {e}")
return f"Error suggesting actions: {e}"
def analyze_images(self, user_question, screenshot_data, accessibility_data):
    """Describe on-screen images via Ollama.

    Builds an "image" prompt and forwards it, together with the screenshot,
    to ``_make_request``. Any exception is logged and reported back as an
    error-message string.
    """
    try:
        image_prompt = self._build_prompt("image", user_question, accessibility_data)
        reply = self._make_request(image_prompt, screenshot_data)
    except Exception as exc:
        logger.error(f"Ollama image analysis error: {exc}")
        return f"Error analyzing images: {exc}"
    return reply
def _build_prompt(self, task_type, user_input, accessibility_data):
"""Build the complete prompt for Ollama."""
system_prompt = self._prepare_system_prompt(task_type)
prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
if task_type == "describe":
prompt += "Please describe what's on this screen."
elif task_type == "question":
prompt += f"User question: {user_input}"
elif task_type == "action":
prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this."
if task_type == "image":
# For image analysis, minimize accessibility data weight
if user_input == "ANALYZE_SINGLE_IMAGE_FILE":
prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details."
else:
prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n"
if user_input:
prompt += f"User question about images: {user_input}\n\n"
prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements."
else:
# Standard prompt with full accessibility data
prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
if task_type == "describe":
prompt += "Please describe what's on this screen."
elif task_type == "question":
prompt += f"User question: {user_input}"
elif task_type == "action":
prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this."
return prompt
@@ -386,14 +413,17 @@ class OllamaProvider(AIProvider):
"stream": False
}
# Note: Ollama vision support varies by model
# For now, we'll send text-only requests
# TODO: Add image support when Ollama vision models are more stable
# Add image data if available and model supports vision
if screenshot_data and "vision" in self.model.lower():
payload["images"] = [screenshot_data['data']]
# Use longer timeout for vision models as they're much slower
timeout = 180 if screenshot_data and "vision" in self.model.lower() else 60
response = requests.post(
f"{self.base_url}/api/generate",
json=payload,
timeout=60 # Ollama can be slower
timeout=timeout
)
if response.status_code == 200:
@@ -414,13 +444,128 @@ class OllamaProvider(AIProvider):
return error_msg
class GeminiProvider(AIProvider):
    """Google Gemini AI provider using Google's Gemini API.

    Each public method builds a text prompt, optionally attaches the
    screenshot as inline image data, and returns the model's reply as a
    string. Failures are logged and returned as an error-message string
    rather than raised.
    """

    def __init__(self, api_key, model="gemini-1.5-flash", **kwargs):
        super().__init__(api_key, model, **kwargs)
        self.base_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model}:generateContent"

    def describe_screen(self, screenshot_data, accessibility_data):
        """Generate a description using Gemini."""
        try:
            prompt = self._build_prompt("describe", None, accessibility_data)
            return self._make_request(prompt, screenshot_data)
        except Exception as e:
            logger.error(f"Gemini describe error: {e}")
            return f"Error getting screen description: {e}"

    def answer_question(self, question, screenshot_data, accessibility_data):
        """Answer a question using Gemini."""
        try:
            prompt = self._build_prompt("question", question, accessibility_data)
            return self._make_request(prompt, screenshot_data)
        except Exception as e:
            logger.error(f"Gemini question error: {e}")
            return f"Error answering question: {e}"

    def suggest_actions(self, request, screenshot_data, accessibility_data):
        """Suggest actions using Gemini."""
        try:
            prompt = self._build_prompt("action", request, accessibility_data)
            return self._make_request(prompt, screenshot_data)
        except Exception as e:
            logger.error(f"Gemini action error: {e}")
            return f"Error suggesting actions: {e}"

    def analyze_images(self, user_question, screenshot_data, accessibility_data):
        """Analyze images visible on screen using Gemini."""
        try:
            prompt = self._build_prompt("image", user_question, accessibility_data)
            return self._make_request(prompt, screenshot_data)
        except Exception as e:
            logger.error(f"Gemini image analysis error: {e}")
            return f"Error analyzing images: {e}"

    def _build_prompt(self, task_type, user_input, accessibility_data):
        """Build the complete prompt for Gemini.

        Image tasks omit the accessibility dump; every other task embeds
        it as JSON after the system prompt.
        """
        system_prompt = self._prepare_system_prompt(task_type)
        if task_type == "image":
            if user_input == "ANALYZE_SINGLE_IMAGE_FILE":
                prompt = f"{system_prompt}\n\nAnalyze and describe the single image file provided. Focus on visual content only - describe what you see in the image: objects, people, scenery, colors, text, composition, and any other visual details."
            else:
                prompt = f"{system_prompt}\n\nCurrent screen context (focus on images):\n"
                if user_input:
                    prompt += f"User question about images: {user_input}\n\n"
                prompt += "Analyze and describe any images visible on this screen. Focus on visual content, not UI elements."
        else:
            prompt = f"{system_prompt}\n\nCurrent accessibility information:\n```json\n{json.dumps(accessibility_data, indent=2)}\n```\n\n"
            if task_type == "describe":
                prompt += "Please describe what's on this screen."
            elif task_type == "question":
                prompt += f"User question: {user_input}"
            elif task_type == "action":
                prompt += f"User wants to: {user_input}\n\nPlease suggest specific steps to accomplish this."
        return prompt

    def _make_request(self, prompt, screenshot_data):
        """Make request to Gemini API.

        Args:
            prompt: Text part of the request.
            screenshot_data: Optional mapping with 'format' and base64
                'data'; attached as an inline image part when truthy.

        Returns:
            The reply text, "No response from Gemini" when the response
            carries no text, or an error-message string on failure.
        """
        try:
            parts = [{"text": prompt}]
            # Add image if available
            if screenshot_data:
                parts.append({
                    "inline_data": {
                        "mime_type": f"image/{screenshot_data['format']}",
                        "data": screenshot_data['data']
                    }
                })
            payload = {
                "contents": [{
                    "parts": parts
                }]
            }
            # Send the key as a header rather than a URL query parameter:
            # requests embeds the URL in its exception messages, which are
            # logged and returned to the user below.
            response = requests.post(
                self.base_url,
                headers={
                    "Content-Type": "application/json",
                    "x-goog-api-key": self.api_key
                },
                json=payload,
                timeout=30
            )
            if response.status_code != 200:
                error_msg = f"Gemini API error {response.status_code}: {response.text}"
                logger.error(error_msg)
                return error_msg

            result = response.json()
            candidates = result.get('candidates') or []
            if not candidates:
                return "No response from Gemini"
            # A candidate may lack 'content'/'parts' (e.g. when generation
            # was stopped early); treat that as an empty reply, not a crash.
            reply_parts = (candidates[0].get('content') or {}).get('parts') or []
            text = "".join(part.get('text', '') for part in reply_parts)
            return text or "No response from Gemini"
        except requests.RequestException as e:
            error_msg = f"Network error contacting Gemini: {e}"
            logger.error(error_msg)
            return error_msg
        except Exception as e:
            error_msg = f"Unexpected error with Gemini API: {e}"
            logger.error(error_msg)
            return error_msg
def create_provider(provider_type, **kwargs):
"""Factory function to create AI providers."""
if provider_type == "claude":
return ClaudeProvider(**kwargs)
elif provider_type == "claude_code":
if provider_type == "claude_code":
return ClaudeCodeProvider(**kwargs)
elif provider_type == "ollama":
return OllamaProvider(**kwargs)
elif provider_type == "gemini":
return GeminiProvider(**kwargs)
else:
raise ValueError(f"Unknown provider type: {provider_type}")
+333 -35
View File
@@ -103,16 +103,17 @@ class AIAssistant(Plugin):
config_valid = self._validate_configuration()
logger.info(f"AI Assistant configuration valid: {config_valid}")
print(f"DEBUG: AI Assistant configuration valid: {config_valid}")
if not config_valid:
logger.warning("AI Assistant configuration invalid, skipping activation")
print("DEBUG: AI Assistant configuration invalid, skipping activation")
return
# Initialize AI provider (may fail but we still want menu access)
if config_valid:
provider_init = self._initialize_ai_provider()
print(f"DEBUG: AI provider initialization: {provider_init}")
else:
logger.warning("AI Assistant configuration invalid, menu will show error messages")
print("DEBUG: AI Assistant configuration invalid, menu will show error messages")
provider_init = False
# Initialize AI provider
self._initialize_ai_provider()
print("DEBUG: AI provider initialized")
# Register keybindings only if configuration is valid
# Always register keybindings so menu is accessible even with config issues
self._register_keybindings()
print("DEBUG: AI keybindings registered")
@@ -139,12 +140,48 @@ class AIAssistant(Plugin):
self._enabled = False
def refresh_settings(self):
    """Refresh plugin settings and reinitialize provider. Called when settings change.

    Reloads the AI settings, re-validates them, and rebuilds the provider.
    The provider is cleared (set to None) when the new configuration is
    invalid or when initialization reports failure. Errors are logged and
    swallowed so a bad settings change cannot propagate to the caller.
    """
    try:
        logger.info("AI Assistant: Refreshing settings")
        print("DEBUG: AI Assistant refreshing settings")

        # Reload settings
        self._load_ai_settings()

        # Validate new configuration
        config_valid = self._validate_configuration()
        print(f"DEBUG: New configuration valid: {config_valid}")

        if not config_valid:
            logger.warning("New configuration invalid, clearing provider")
            print("DEBUG: New configuration invalid, clearing provider")
            self._ai_provider = None
            return

        # Reinitialize provider now that the configuration is known valid
        provider_init = self._initialize_ai_provider()
        print(f"DEBUG: Provider reinitialization: {provider_init}")
        if provider_init:
            logger.info(f"AI Assistant provider changed to: {self._provider_type}")
            print(f"DEBUG: Provider successfully changed to: {self._provider_type}")
        else:
            logger.warning("Failed to initialize new provider")
            print("DEBUG: Failed to initialize new provider")
            self._ai_provider = None
    except Exception as e:
        logger.error(f"Error refreshing AI Assistant settings: {e}")
        print(f"DEBUG: Error refreshing settings: {e}")
def _load_ai_settings(self):
"""Load AI Assistant settings from Cthulhu configuration."""
try:
# Get provider
provider = self._settings_manager.getSetting('aiProvider')
self._provider_type = provider or settings.AI_PROVIDER_CLAUDE
print(f"DEBUG: Raw provider setting: '{provider}'")
self._provider_type = provider or settings.AI_PROVIDER_CLAUDE_CODE
print(f"DEBUG: Final provider type: '{self._provider_type}'")
# Load API key from file
api_key_file = self._settings_manager.getSetting('aiApiKeyFile')
@@ -178,7 +215,7 @@ class AIAssistant(Plugin):
logger.warning("No AI provider configured")
return False
# Providers that don't need API keys
# Check provider-specific requirements
if self._provider_type == settings.AI_PROVIDER_OLLAMA:
logger.info("Checking Ollama availability")
return self._check_ollama_availability()
@@ -187,15 +224,16 @@ class AIAssistant(Plugin):
result = self._check_claude_code_availability()
logger.info(f"Claude Code availability check result: {result}")
return result
# Other providers need API keys
logger.info(f"Checking API key for provider {self._provider_type}")
if not self._api_key:
logger.warning(f"No API key configured for provider {self._provider_type}")
elif self._provider_type == settings.AI_PROVIDER_GEMINI:
logger.info("Checking Gemini API key")
if not self._api_key:
logger.warning("No API key configured for Gemini")
return False
logger.info("Gemini configuration validated")
return True
else:
logger.warning(f"Unknown provider type: {self._provider_type}")
return False
logger.info("Configuration validation passed")
return True
def _check_ollama_availability(self):
"""Check if Ollama is available and has vision models."""
@@ -244,12 +282,12 @@ class AIAssistant(Plugin):
def _initialize_ai_provider(self):
"""Initialize the AI provider based on settings."""
try:
if self._provider_type == settings.AI_PROVIDER_CLAUDE:
self._ai_provider = create_provider("claude", api_key=self._api_key)
elif self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE:
if self._provider_type == settings.AI_PROVIDER_CLAUDE_CODE:
self._ai_provider = create_provider("claude_code")
elif self._provider_type == settings.AI_PROVIDER_OLLAMA:
self._ai_provider = create_provider("ollama", model=self._ollama_model, base_url=self._ollama_endpoint)
elif self._provider_type == settings.AI_PROVIDER_GEMINI:
self._ai_provider = create_provider("gemini", api_key=self._api_key)
else:
logger.error(f"Unsupported provider type: {self._provider_type}")
return False
@@ -318,12 +356,100 @@ class AIAssistant(Plugin):
self._handle_ai_describe_with_data(self._pre_menu_screen_data)
elif action_id == "request_action":
self._handle_ai_activate_with_data(self._pre_menu_screen_data)
elif action_id == "analyze_images":
self._handle_ai_image_analysis_with_data(self._pre_menu_screen_data)
elif action_id == "browse_image_file":
self._handle_browse_image_file()
else:
logger.warning(f"Unknown AI menu action: {action_id}")
except Exception as e:
logger.error(f"Error handling menu selection {action_id}: {e}")
def _handle_ai_image_analysis_with_data(self, data):
    """Run image analysis on screen data captured before the menu opened.

    Returns True once the request has been handled (including the cases
    where it is refused or the provider call fails) and False only when an
    unexpected error escapes the handler itself.
    """
    try:
        logger.info("AI image analysis requested with pre-captured data")

        # Work out, in priority order, whether anything blocks the request.
        refusal = None
        if not self._enabled:
            refusal = "AI Assistant is not enabled"
        elif not self._ai_provider:
            refusal = "AI provider not available. Check configuration."
        elif not data:
            refusal = "No screen data available for image analysis"
        if refusal is not None:
            self._present_message(refusal)
            return True

        readable_provider = self._provider_type.replace('_', ' ').title()
        self._present_message(f"AI Assistant ({readable_provider}) analyzing images...")

        # Ask the provider for a general description (no user question).
        try:
            analysis = self._ai_provider.analyze_images(
                None,
                data.get("screenshot"),
                data.get("accessibility"),
            )
            self._show_description_dialog(analysis)
        except Exception as inner_exc:
            logger.error(f"Error getting AI image analysis: {inner_exc}")
            self._present_message(f"Error getting AI image analysis: {inner_exc}")
        return True
    except Exception as outer_exc:
        logger.error(f"Error in image analysis with data: {outer_exc}")
        self._present_message(f"Error analyzing images: {outer_exc}")
        return False
def _handle_browse_image_file(self):
    """Let the user pick an image file and have the AI provider describe it.

    Returns True once the request has been handled (including refusals,
    a cancelled chooser, or a failed analysis) and False only when an
    unexpected error escapes the handler itself.
    """
    try:
        logger.info("AI image file browsing requested")
        print("DEBUG: _handle_browse_image_file called")

        if not self._enabled:
            print("DEBUG: AI Assistant not enabled")
            self._present_message("AI Assistant is not enabled")
            return True
        if not self._ai_provider:
            print("DEBUG: AI provider not available")
            self._present_message("AI provider not available. Check configuration.")
            return True

        # Ask the user which file to analyze.
        print("DEBUG: About to show file chooser")
        chosen_path = self._show_image_file_chooser()
        print(f"DEBUG: File chooser returned: {chosen_path}")

        if not chosen_path:
            self._present_message("No image file selected")
            return True

        readable_provider = self._provider_type.replace('_', ' ').title()
        self._present_message(f"AI Assistant ({readable_provider}) analyzing selected image...")

        # Load the file and hand it to the provider.
        try:
            loaded_image = self._load_image_file(chosen_path)
            if not loaded_image:
                self._present_message("Could not load the selected image file")
            else:
                # The sentinel tells the provider this is a standalone file,
                # not a screenshot, so it uses the single-image prompt.
                description = self._ai_provider.analyze_images(
                    "ANALYZE_SINGLE_IMAGE_FILE", loaded_image, {}
                )
                self._show_description_dialog(description)
        except Exception as inner_exc:
            logger.error(f"Error analyzing image file: {inner_exc}")
            self._present_message(f"Error analyzing image file: {inner_exc}")
        return True
    except Exception as outer_exc:
        logger.error(f"Error in browse image file: {outer_exc}")
        self._present_message(f"Error browsing for image file: {outer_exc}")
        return False
def _handle_ai_describe_with_data(self, data):
"""Handle AI screen description request with pre-captured data."""
try:
@@ -1126,7 +1252,7 @@ class AIAssistant(Plugin):
# ============================================================================
def _show_action_dialog(self):
"""Show dialog for entering action requests."""
"""Show dialog for entering action requests or analyzing images."""
try:
# Create dialog without parent first
dialog = Gtk.Dialog(
@@ -1143,18 +1269,33 @@ class AIAssistant(Plugin):
dialog.set_modal(True)
dialog.set_type_hint(Gdk.WindowTypeHint.DIALOG)
dialog.set_default_size(600, 300)
dialog.set_default_size(600, 350)
content_area = dialog.get_content_area()
# Mode selection radio buttons
mode_label = Gtk.Label()
mode_label.set_markup("<b>Choose analysis mode:</b>")
mode_label.set_halign(Gtk.Align.START)
content_area.pack_start(mode_label, False, False, 10)
# Action mode radio button (default)
action_radio = Gtk.RadioButton(label="Request an action")
action_radio.set_active(True)
content_area.pack_start(action_radio, False, False, 5)
# Image analysis mode radio button
image_radio = Gtk.RadioButton.new_with_label_from_widget(action_radio, "Analyze image content")
content_area.pack_start(image_radio, False, False, 5)
# Instruction label
label = Gtk.Label()
label.set_markup("<b>Tell the AI what you want to do:</b>\n" +
instruction_label = Gtk.Label()
instruction_label.set_markup("<b>Tell the AI what you want to do:</b>\n" +
"Examples: 'Click the Continue button', 'Enter storm into username field', " +
"'Copy this text to clipboard'")
label.set_line_wrap(True)
label.set_halign(Gtk.Align.START)
content_area.pack_start(label, False, False, 10)
instruction_label.set_line_wrap(True)
instruction_label.set_halign(Gtk.Align.START)
content_area.pack_start(instruction_label, False, False, 10)
# Action entry
entry = Gtk.Entry()
@@ -1162,6 +1303,24 @@ class AIAssistant(Plugin):
entry.set_activates_default(True)
content_area.pack_start(entry, False, False, 10)
# Update UI based on radio button selection
def on_radio_toggled(radio_button):
if action_radio.get_active():
instruction_label.set_markup("<b>Tell the AI what you want to do:</b>\n" +
"Examples: 'Click the Continue button', 'Enter storm into username field', " +
"'Copy this text to clipboard'")
entry.set_placeholder_text("What would you like me to do?")
entry.set_sensitive(True)
else: # image mode
instruction_label.set_markup("<b>Image Analysis Mode:</b>\n" +
"The AI will focus on describing visual content in images on the screen, " +
"ignoring UI elements. Leave text field empty or add specific questions.")
entry.set_placeholder_text("Optional: Ask specific questions about images (leave empty for general description)")
entry.set_sensitive(True)
action_radio.connect("toggled", on_radio_toggled)
image_radio.connect("toggled", on_radio_toggled)
dialog.set_default_response(Gtk.ResponseType.OK)
dialog.show_all()
@@ -1170,14 +1329,20 @@ class AIAssistant(Plugin):
response = dialog.run()
if response == Gtk.ResponseType.OK:
action_request = entry.get_text().strip()
if action_request:
# Close this dialog and create a new confirmation dialog
if image_radio.get_active():
# Image analysis mode
user_question = entry.get_text().strip()
dialog.destroy()
self._show_action_confirmation_dialog(action_request)
self._handle_image_analysis(user_question)
else:
dialog.destroy()
self._present_message("No action request entered")
# Action request mode
action_request = entry.get_text().strip()
if action_request:
dialog.destroy()
self._show_action_confirmation_dialog(action_request)
else:
dialog.destroy()
self._present_message("No action request entered")
else:
dialog.destroy()
self._present_message("Action request cancelled")
@@ -1186,6 +1351,41 @@ class AIAssistant(Plugin):
logger.error(f"Error showing action dialog: {e}")
self._present_message(f"Error showing action dialog: {e}")
def _handle_image_analysis(self, user_question=None):
    """Ask the AI provider to analyze the images on screen and announce the result.

    Screen data is taken from ``self._current_screen_data`` when it is
    set; otherwise it is gathered with ``self._collect_ai_data()``. The
    provider's answer, or any error raised along the way, is reported
    through ``self._present_message``.

    Args:
        user_question: Optional question about the images. A falsy value
            (``None`` or an empty string) is forwarded to the provider
            as ``None``.
    """
    try:
        logger.info(f"Image analysis requested with question: '{user_question}'")
        self._present_message("AI Assistant analyzing images on screen...")

        # Reuse the data already held; only collect again when there is none.
        screen_data = self._current_screen_data or self._collect_ai_data()
        if not screen_data:
            self._present_message("Could not collect screen data for image analysis")
            return

        # A specific question is passed through; otherwise None is sent.
        question = user_question if user_question else None
        screenshot = screen_data.get("screenshot")
        accessibility = screen_data.get("accessibility")
        result = self._ai_provider.analyze_images(question, screenshot, accessibility)

        self._present_message(f"Image analysis complete: {result}")
    except Exception as err:
        logger.error(f"Error in image analysis: {err}")
        self._present_message(f"Error analyzing images: {err}")
def _show_action_confirmation_dialog(self, action_request):
"""Show a fresh confirmation dialog for the action."""
try:
@@ -1740,6 +1940,96 @@ class AIAssistant(Plugin):
except Exception as e:
logger.error(f"Error extracting text from action: {e}")
return None
def _show_image_file_chooser(self):
    """Run a file chooser dialog for selecting an image file.

    The dialog offers an "Image files" filter (PNG, JPEG, GIF, BMP, WebP)
    and an "All files" filter as a fallback.

    Returns:
        The path of the chosen file, or ``None`` when the dialog was
        cancelled or an error occurred.
    """
    dialog = None
    try:
        dialog = Gtk.FileChooserDialog(
            title="Select Image File",
            parent=None,
            action=Gtk.FileChooserAction.OPEN
        )

        # Add buttons
        dialog.add_button("Cancel", Gtk.ResponseType.CANCEL)
        dialog.add_button("Open", Gtk.ResponseType.OK)

        # Set up image file filters
        filter_images = Gtk.FileFilter()
        filter_images.set_name("Image files")
        for mime_type in ("image/png", "image/jpeg", "image/jpg",
                          "image/gif", "image/bmp", "image/webp"):
            filter_images.add_mime_type(mime_type)
        for pattern in ("*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.webp"):
            filter_images.add_pattern(pattern)
        dialog.add_filter(filter_images)

        # Add "All files" filter as backup
        filter_all = Gtk.FileFilter()
        filter_all.set_name("All files")
        filter_all.add_pattern("*")
        dialog.add_filter(filter_all)

        # Run the dialog (blocks until the user responds)
        response = dialog.run()

        filename = None
        if response == Gtk.ResponseType.OK:
            filename = dialog.get_filename()
            logger.info(f"Selected image file: {filename}")
        return filename

    except Exception as e:
        logger.error(f"Error showing image file chooser: {e}")
        return None
    finally:
        # Always tear the dialog down, including when an exception was
        # raised after it had been created; otherwise it would be leaked.
        if dialog is not None:
            dialog.destroy()
def _load_image_file(self, image_path):
    """Load an image file and convert it to base64 for AI analysis.

    The image format is taken from the file extension. When the extension
    is missing or not recognised (possible when a file is picked through
    an "All files" filter), the leading bytes of the file are compared
    against well-known image signatures before falling back to ``'png'``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the keys ``'format'``, ``'width'``, ``'height'`` and
        ``'data'`` (base64-encoded text), or ``None`` if the file could
        not be loaded.
    """
    try:
        import base64
        import os

        # Read the image file
        with open(image_path, 'rb') as image_file:
            image_data = image_file.read()

        # Convert to base64
        base64_data = base64.b64encode(image_data).decode('utf-8')

        # Determine format from file extension
        _, ext = os.path.splitext(image_path.lower())
        format_map = {
            '.png': 'png',
            '.jpg': 'jpeg',
            '.jpeg': 'jpeg',
            '.gif': 'gif',
            '.bmp': 'bmp',
            '.webp': 'webp'
        }
        image_format = format_map.get(ext)

        if image_format is None:
            # Unknown extension: identify the format from the file signature
            # so a mislabelled or extension-less image is not reported as PNG.
            signatures = (
                (b'\x89PNG\r\n\x1a\n', 'png'),
                (b'\xff\xd8\xff', 'jpeg'),
                (b'GIF87a', 'gif'),
                (b'GIF89a', 'gif'),
                (b'BM', 'bmp'),
            )
            for magic, detected in signatures:
                if image_data.startswith(magic):
                    image_format = detected
                    break
            else:
                # WebP is a RIFF container with "WEBP" at byte offset 8.
                if image_data[:4] == b'RIFF' and image_data[8:12] == b'WEBP':
                    image_format = 'webp'
                else:
                    image_format = 'png'  # Default to png

        logger.info(f"Loaded image file: {image_path} ({len(image_data)} bytes, format: {image_format})")

        # Return same structure as _capture_screenshot
        return {
            'format': image_format,
            'width': 0,  # We don't know dimensions, but AI doesn't need them
            'height': 0,
            'data': base64_data
        }

    except Exception as e:
        logger.error(f"Error loading image file {image_path}: {e}")
        return None
class AIAssistantMenu(Gtk.Dialog):
@@ -1772,11 +2062,15 @@ class AIAssistantMenu(Gtk.Dialog):
self.radio_ask = Gtk.RadioButton.new_with_label(None, "Ask Question")
self.radio_describe = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Describe Screen")
self.radio_action = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Request Action")
self.radio_image = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Analyze Images")
self.radio_browse = Gtk.RadioButton.new_with_label_from_widget(self.radio_ask, "Browse for Image File")
# Pack radio buttons
content_area.pack_start(self.radio_ask, False, False, 5)
content_area.pack_start(self.radio_describe, False, False, 5)
content_area.pack_start(self.radio_action, False, False, 5)
content_area.pack_start(self.radio_image, False, False, 5)
content_area.pack_start(self.radio_browse, False, False, 5)
# Set first option as selected by default
self.radio_ask.set_active(True)
@@ -1798,6 +2092,10 @@ class AIAssistantMenu(Gtk.Dialog):
action_id = "describe_screen"
elif self.radio_action.get_active():
action_id = "request_action"
elif self.radio_image.get_active():
action_id = "analyze_images"
elif self.radio_browse.get_active():
action_id = "browse_image_file"
else:
action_id = None
+1 -3
View File
@@ -197,10 +197,8 @@ CHAT_SPEAK_ALL = 0
CHAT_SPEAK_ALL_IF_FOCUSED = 1
CHAT_SPEAK_FOCUSED_CHANNEL = 2
# AI Assistant constants
AI_PROVIDER_CLAUDE = "claude"
# AI Assistant constants - simplified to providers that don't need complex API key management
AI_PROVIDER_CLAUDE_CODE = "claude_code"
AI_PROVIDER_CHATGPT = "chatgpt"
AI_PROVIDER_GEMINI = "gemini"
AI_PROVIDER_OLLAMA = "ollama"