From a3a63eee4db4a63e3c1b6ece34742727802ffd66 Mon Sep 17 00:00:00 2001 From: Devin Prater Date: Wed, 27 May 2026 12:44:26 -0400 Subject: [PATCH] Improve AI screenshot context and window search --- scripts/ai.py | 192 ++++++++++++++++++++++++++++++++++++----- scripts/window_list.sh | 2 +- 2 files changed, 170 insertions(+), 24 deletions(-) diff --git a/scripts/ai.py b/scripts/ai.py index bb5b322..925b405 100755 --- a/scripts/ai.py +++ b/scripts/ai.py @@ -251,6 +251,7 @@ class OllamaInterface: def __init__(self, host='http://localhost:11434'): self.host = host + self.model_details_cache = {} def get_models(self): """Get list of available Ollama models""" @@ -262,17 +263,73 @@ class OllamaInterface: except Exception as e: print(f"Error getting Ollama models: {e}") return [] + + def get_model_details(self, model_name): + """Get model metadata from Ollama, including capabilities when available""" + if not model_name: + return {} + + if model_name in self.model_details_cache: + return self.model_details_cache[model_name] + + try: + response = requests.post( + f'{self.host}/api/show', + json={'model': model_name}, + timeout=5 + ) + if response.status_code == 200: + details = response.json() + self.model_details_cache[model_name] = details + return details + except Exception as e: + print(f"Error getting Ollama model details for {model_name}: {e}") + + self.model_details_cache[model_name] = {} + return {} + + def model_name_looks_like_vision(self, model_name): + """Fallback detection for older Ollama versions without capabilities""" + if not model_name: + return False + + model_lower = model_name.lower() + vision_patterns = [ + 'vision', + 'llava', + 'bakllava', + 'moondream', + 'minicpm-v', + 'minicpm-vision', + 'qwen-vl', + 'qwen2-vl', + 'qwen2.5-vl', + 'qwen3-vl', + 'gemma3', + 'gemma4', + ] + return any(pattern in model_lower for pattern in vision_patterns) + + def model_details_indicate_vision(self, model_name): + """Check Ollama model metadata for native vision support""" + details = self.get_model_details(model_name) + capabilities = details.get('capabilities', []) + if isinstance(capabilities, list) and 'vision' in capabilities: + return True + + model_info = details.get('model_info', {}) + if isinstance(model_info, dict): + return any(str(key).endswith('.vision.image_size') for key in model_info) + + return False def get_vision_models(self): """Get list of models that can handle images""" all_models = self.get_models() - # Common vision model patterns - vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream'] vision_models = [] for model in all_models: - model_lower = model.lower() - if any(pattern in model_lower for pattern in vision_patterns): + if self.is_vision_model(model): vision_models.append(model) return vision_models @@ -281,9 +338,11 @@ class OllamaInterface: """Check if a model can handle images""" if not model_name: return False - model_lower = model_name.lower() - vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream'] - return any(pattern in model_lower for pattern in vision_patterns) + + return ( + self.model_details_indicate_vision(model_name) + or self.model_name_looks_like_vision(model_name) + ) def is_available(self): """Check if Ollama is running and available""" @@ -312,7 +371,7 @@ class OllamaInterface: # Check if the model can handle images if not self.is_vision_model(model): - return f"Error: Model '{model}' cannot process images. Please select a vision model like llava or llama3.2-vision in settings." + return f"Error: Model '{model}' does not advertise vision support. Please select a vision-capable model in settings." # Encode image to base64 try: @@ -482,30 +541,55 @@ class WindowContext: self.i3 = i3ipc.Connection() except (ConnectionError, FileNotFoundError, Exception) as e: self.i3 = None - - def get_focused_window_info(self): - """Get information about the currently focused window""" + + def get_focused_window_id(self): + """Get the X11 window id for the currently focused i3 container""" if not self.i3: - return "Unable to connect to i3" - + return None + try: tree = self.i3.get_tree() focused = tree.find_focused() - + if focused and focused.window: + return str(focused.window) + except Exception: + pass + + return None + + def get_window_info(self, window_id=None): + """Get information about a specific window, or the focused window""" + if not self.i3: + return "Unable to connect to i3" + + try: + tree = self.i3.get_tree() + if window_id: + try: + focused = tree.find_by_window(int(window_id)) + except (TypeError, ValueError): + focused = None + else: + focused = tree.find_focused() + if not focused: return "No focused window found" - + info = { 'name': focused.name or 'Unknown', 'class': getattr(focused, 'window_class', 'Unknown'), 'title': getattr(focused, 'window_title', 'Unknown'), 'workspace': focused.workspace().name if focused.workspace() else 'Unknown' } - + return f"Current application: {info['name']}\nWindow type: {info['class']}" except Exception as e: return f"Error getting window info: {str(e)}" + def get_focused_window_info(self): + """Get information about the currently focused window""" + return self.get_window_info() + class AiAssistant(Gtk.Window): """Main AI Assistant window with accessibility features""" @@ -518,6 +602,7 @@ class AiAssistant(Gtk.Window): self.codexInterface = CodexCliInterface() self.ollamaInterface = OllamaInterface(self.config.get('ollama_host')) self.windowContext = WindowContext() + self.launchWindowId = self.windowContext.get_focused_window_id() self.voiceRecognition = VoiceRecognition(self.config) # Voice mode state @@ -1359,6 +1444,71 @@ class AiAssistant(Gtk.Window): if SystemCommands.is_command_available('play'): subprocess.run(['play', '-qnG', 'synth', '0.05', 'sin', '1200'], capture_output=True) + + def run_on_main_thread(self, callback): + """Run a GTK operation on the main thread and wait briefly for it""" + done = threading.Event() + + def wrapper(): + try: + callback() + finally: + done.set() + return False + + GLib.idle_add(wrapper) + done.wait(timeout=2) + + def get_target_window_context(self): + """Get context for the window that was focused before the assistant opened""" + if self.launchWindowId: + context = self.windowContext.get_window_info(self.launchWindowId) + if not context.startswith("No focused window found"): + return context + + return self.windowContext.get_focused_window_info() + + def focus_launch_window_for_capture(self): + """Hide the assistant and focus the launch window before taking a screenshot""" + self.run_on_main_thread(self.hide) + time.sleep(0.2) + + if not self.launchWindowId: + return False + + try: + result = subprocess.run( + ['i3-msg', f'[id="{self.launchWindowId}"] focus'], + capture_output=True, + text=True, + timeout=3 + ) + time.sleep(0.3) + return result.returncode == 0 + except (subprocess.SubprocessError, FileNotFoundError, OSError): + return False + + def restore_assistant_after_capture(self): + """Show the assistant again after screenshot capture""" + def restore(): + self.show_all() + self.present() + + self.run_on_main_thread(restore) + + def capture_target_screenshot(self, screenshot_path): + """Capture the original focused window when possible, otherwise the screen""" + focused_launch_window = self.focus_launch_window_for_capture() + + try: + command = ['scrot'] + if focused_launch_window: + command.append('-u') + command.append(screenshot_path) + + return subprocess.run(command, capture_output=True, text=True, timeout=10) + finally: + self.restore_assistant_after_capture() def send_ai_request(self, message, context=None, image_path=None): """Send request to selected AI provider""" @@ -1459,7 +1609,7 @@ class AiAssistant(Gtk.Window): return def ask_with_context_in_thread(): - context = self.windowContext.get_focused_window_info() + context = self.get_target_window_context() response = self.send_ai_request(question, context) GLib.idle_add(self.set_response_text, response) @@ -1572,9 +1722,7 @@ class AiAssistant(Gtk.Window): screenshot_path = os.path.join(temp_dir, 'screenshot.png') try: - # Use scrot to take screenshot - result = subprocess.run(['scrot', screenshot_path], - capture_output=True, text=True, timeout=10) + result = self.capture_target_screenshot(screenshot_path) if result.returncode != 0: GLib.idle_add(self.set_response_text, "Error: Could not take screenshot") @@ -1633,9 +1781,7 @@ class AiAssistant(Gtk.Window): screenshot_path = os.path.join(temp_dir, 'screen_analysis.png') try: - # Take screenshot - scrot_result = subprocess.run(['scrot', screenshot_path], - capture_output=True, text=True, timeout=10) + scrot_result = self.capture_target_screenshot(screenshot_path) if scrot_result.returncode != 0: GLib.idle_add(self.set_response_text, "Error: Could not capture screen content") diff --git a/scripts/window_list.sh b/scripts/window_list.sh index 1d3a764..a834bba 100755 --- a/scripts/window_list.sh +++ b/scripts/window_list.sh @@ -20,7 +20,7 @@ for con in i3.get_tree(): if con.window and con.parent.type != "dockarea": print(con.window) print(con.name)') -id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 "${windowList[@]}")" +id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 --search-column 2 "${windowList[@]}")" if [[ -z "${id}" ]]; then exit 0 fi