Improve AI screenshot context and window search

2026-05-27 12:44:26 -04:00
parent 1a2252d8de
commit a3a63eee4d
2 changed files with 170 additions and 24 deletions
@@ -251,6 +251,7 @@ class OllamaInterface:
    def __init__(self, host='http://localhost:11434'):
        self.host = host
        self.model_details_cache = {}
    def get_models(self):
        """Get list of available Ollama models"""
@@ -263,16 +264,72 @@ class OllamaInterface:
            print(f"Error getting Ollama models: {e}")
        return []
    def get_model_details(self, model_name):
        """Get model metadata from Ollama, including capabilities when available"""
        if not model_name:
            return {}
        if model_name in self.model_details_cache:
            return self.model_details_cache[model_name]
        try:
            response = requests.post(
                f'{self.host}/api/show',
                json={'model': model_name},
                timeout=5
            )
            if response.status_code == 200:
                details = response.json()
                self.model_details_cache[model_name] = details
                return details
        except Exception as e:
            print(f"Error getting Ollama model details for {model_name}: {e}")
        self.model_details_cache[model_name] = {}
        return {}
    def model_name_looks_like_vision(self, model_name):
        """Guess vision support from the model name alone

        Used as a fallback when the server metadata does not say whether a
        model accepts images. The match is a case-insensitive substring
        test against a fixed list of known vision model name fragments.
        Returns False for an empty or missing name.
        """
        if not model_name:
            return False

        known_markers = (
            'vision',
            'llava',
            'bakllava',
            'moondream',
            'minicpm-v',
            'minicpm-vision',
            'qwen-vl',
            'qwen2-vl',
            'qwen2.5-vl',
            'qwen3-vl',
            'gemma3',
            'gemma4',
        )
        lowered = model_name.lower()
        for marker in known_markers:
            if marker in lowered:
                return True
        return False
    def model_details_indicate_vision(self, model_name):
        """Check Ollama model metadata for native vision support"""
        details = self.get_model_details(model_name)
        capabilities = details.get('capabilities', [])
        if isinstance(capabilities, list) and 'vision' in capabilities:
            return True
        model_info = details.get('model_info', {})
        if isinstance(model_info, dict):
            return any(str(key).endswith('.vision.image_size') for key in model_info)
        return False
    def get_vision_models(self):
        """Get list of models that can handle images

        Every model reported by get_models() is passed to
        is_vision_model(), so this list always agrees with the check
        applied before an image is sent. The server's ordering is kept.
        """
        return [
            model
            for model in self.get_models()
            if self.is_vision_model(model)
        ]
@@ -281,9 +338,11 @@ class OllamaInterface:
        """Check if a model can handle images"""
        if not model_name:
            return False
-        model_lower = model_name.lower()
+
-        vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream']
+        return (
-        return any(pattern in model_lower for pattern in vision_patterns)
+            self.model_details_indicate_vision(model_name)
            or self.model_name_looks_like_vision(model_name)
        )
    def is_available(self):
        """Check if Ollama is running and available"""
@@ -312,7 +371,7 @@ class OllamaInterface:
                # Check if the model can handle images
                if not self.is_vision_model(model):
-                    return f"Error: Model '{model}' cannot process images. Please select a vision model like llava or llama3.2-vision in settings."
+                    return f"Error: Model '{model}' does not advertise vision support. Please select a vision-capable model in settings."
                # Encode image to base64
                try:
@@ -483,14 +542,35 @@ class WindowContext:
        except (ConnectionError, FileNotFoundError, Exception) as e:
            self.i3 = None
-    def get_focused_window_info(self):
+    def get_focused_window_id(self):
-        """Get information about the currently focused window"""
+        """Get the X11 window id for the currently focused i3 container"""
        if not self.i3:
            return None
        try:
            tree = self.i3.get_tree()
            focused = tree.find_focused()
            if focused and focused.window:
                return str(focused.window)
        except Exception:
            pass
        return None
    def get_window_info(self, window_id=None):
        """Get information about a specific window, or the focused window"""
        if not self.i3:
            return "Unable to connect to i3"
        try:
            tree = self.i3.get_tree()
-            focused = tree.find_focused()
+            if window_id:
                try:
                    focused = tree.find_by_window(int(window_id))
                except (TypeError, ValueError):
                    focused = None
            else:
                focused = tree.find_focused()
            if not focused:
                return "No focused window found"
@@ -506,6 +586,10 @@ class WindowContext:
        except Exception as e:
            return f"Error getting window info: {str(e)}"
    def get_focused_window_info(self):
        """Describe whichever window i3 currently reports as focused

        Thin wrapper over get_window_info() with no explicit window id.
        """
        return self.get_window_info(window_id=None)
 class AiAssistant(Gtk.Window):
    """Main AI Assistant window with accessibility features"""
@@ -518,6 +602,7 @@ class AiAssistant(Gtk.Window):
        self.codexInterface = CodexCliInterface()
        self.ollamaInterface = OllamaInterface(self.config.get('ollama_host'))
        self.windowContext = WindowContext()
        self.launchWindowId = self.windowContext.get_focused_window_id()
        self.voiceRecognition = VoiceRecognition(self.config)
        # Voice mode state
@@ -1360,6 +1445,71 @@ class AiAssistant(Gtk.Window):
            subprocess.run(['play', '-qnG', 'synth', '0.05', 'sin', '1200'],
                          capture_output=True)
    def run_on_main_thread(self, callback):
        """Run a GTK operation on the main thread and wait briefly for it

        When already on the main thread the callback is invoked directly.
        Queuing it with GLib.idle_add and then blocking here would stall
        the loop that has to dispatch it, so the wait would always burn
        the full timeout and the callback would only run afterwards.

        From any other thread the callback is queued as an idle handler
        and this method waits up to 2 seconds for it to finish. After the
        timeout it returns even if the callback has not run yet.

        Args:
            callback: Zero-argument callable performing the GTK work.
        """
        # NOTE(review): assumes the GTK main loop runs on Python's main
        # thread — confirm against the application's entry point.
        if threading.current_thread() is threading.main_thread():
            callback()
            return

        done = threading.Event()

        def wrapper():
            try:
                callback()
            finally:
                # Release the waiting thread even if the callback raised.
                done.set()
            return False  # one-shot idle handler: do not reschedule

        GLib.idle_add(wrapper)
        done.wait(timeout=2)
    def get_target_window_context(self):
        """Describe the window that had focus when the assistant was launched

        Falls back to the currently focused window when no launch window
        id was recorded, or when the lookup for that id reports
        "No focused window found".
        """
        launch_id = self.launchWindowId
        if not launch_id:
            return self.windowContext.get_focused_window_info()

        launch_info = self.windowContext.get_window_info(launch_id)
        if launch_info.startswith("No focused window found"):
            return self.windowContext.get_focused_window_info()
        return launch_info
    def focus_launch_window_for_capture(self):
        """Hide the assistant, then ask i3 to focus the launch window

        Returns True only when a launch window id is recorded and i3-msg
        exits with status 0. Returns False when there is no recorded id
        or i3-msg cannot be run or times out. The assistant window is
        hidden in every case.
        """
        self.run_on_main_thread(self.hide)
        # Short pause after hiding — presumably to let the window disappear.
        time.sleep(0.2)

        if not self.launchWindowId:
            return False

        focus_command = ['i3-msg', f'[id="{self.launchWindowId}"] focus']
        try:
            outcome = subprocess.run(
                focus_command,
                capture_output=True,
                text=True,
                timeout=3
            )
        except (subprocess.SubprocessError, FileNotFoundError, OSError):
            return False
        else:
            # Short pause after the focus request — presumably to let it take effect.
            time.sleep(0.3)
            return outcome.returncode == 0
    def restore_assistant_after_capture(self):
        """Make the assistant window visible again once a capture is done

        The show/present calls are handed to run_on_main_thread() rather
        than being made directly from the calling thread.
        """
        def bring_back():
            # Re-show the window and its children, then present it.
            self.show_all()
            self.present()

        self.run_on_main_thread(bring_back)
    def capture_target_screenshot(self, screenshot_path):
        """Screenshot the launch window if it could be focused, else the screen

        Runs scrot with -u when focus_launch_window_for_capture() returned
        True, and without it otherwise. The assistant window is restored
        afterwards even when scrot fails or times out.

        Returns:
            The subprocess.CompletedProcess from the scrot invocation.
        """
        launch_window_focused = self.focus_launch_window_for_capture()
        try:
            if launch_window_focused:
                scrot_command = ['scrot', '-u', screenshot_path]
            else:
                scrot_command = ['scrot', screenshot_path]
            return subprocess.run(
                scrot_command,
                capture_output=True,
                text=True,
                timeout=10,
            )
        finally:
            # The assistant was hidden for the capture; always bring it back.
            self.restore_assistant_after_capture()
    def send_ai_request(self, message, context=None, image_path=None):
        """Send request to selected AI provider"""
        provider = self.config.get('provider')
@@ -1459,7 +1609,7 @@ class AiAssistant(Gtk.Window):
            return
        def ask_with_context_in_thread():
-            context = self.windowContext.get_focused_window_info()
+            context = self.get_target_window_context()
            response = self.send_ai_request(question, context)
            GLib.idle_add(self.set_response_text, response)
@@ -1572,9 +1722,7 @@ class AiAssistant(Gtk.Window):
            screenshot_path = os.path.join(temp_dir, 'screenshot.png')
            try:
-                # Use scrot to take screenshot
+                result = self.capture_target_screenshot(screenshot_path)
                result = subprocess.run(['scrot', screenshot_path],
                                      capture_output=True, text=True, timeout=10)
                if result.returncode != 0:
                    GLib.idle_add(self.set_response_text, "Error: Could not take screenshot")
@@ -1633,9 +1781,7 @@ class AiAssistant(Gtk.Window):
                    screenshot_path = os.path.join(temp_dir, 'screen_analysis.png')
                    try:
-                        # Take screenshot
+                        scrot_result = self.capture_target_screenshot(screenshot_path)
                        scrot_result = subprocess.run(['scrot', screenshot_path],
                                                    capture_output=True, text=True, timeout=10)
                        if scrot_result.returncode != 0:
                            GLib.idle_add(self.set_response_text, "Error: Could not capture screen content")
@@ -20,7 +20,7 @@ for con in i3.get_tree():
    if con.window and con.parent.type != "dockarea":
        print(con.window)
        print(con.name)')
-id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 "${windowList[@]}")"
+id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 --search-column 2 "${windowList[@]}")"
 if [[ -z "${id}" ]]; then
    exit 0
 fi