From a3a63eee4db4a63e3c1b6ece34742727802ffd66 Mon Sep 17 00:00:00 2001
From: Devin Prater <r.d.t.prater@gmail.com>
Date: Wed, 27 May 2026 12:44:26 -0400
Subject: [PATCH] Improve AI screenshot context and window search

---
 scripts/ai.py          | 192 ++++++++++++++++++++++++++++++++++++-----
 scripts/window_list.sh |   2 +-
 2 files changed, 170 insertions(+), 24 deletions(-)

diff --git a/scripts/ai.py b/scripts/ai.py
index bb5b322..925b405 100755
--- a/scripts/ai.py
+++ b/scripts/ai.py
@@ -251,6 +251,7 @@ class OllamaInterface:
     
     def __init__(self, host='http://localhost:11434'):
         self.host = host
+        self.model_details_cache = {}
     
     def get_models(self):
         """Get list of available Ollama models"""
@@ -262,17 +263,73 @@ class OllamaInterface:
         except Exception as e:
             print(f"Error getting Ollama models: {e}")
         return []
+
+    def get_model_details(self, model_name):
+        """Get model metadata from Ollama, including capabilities when available.
+
+        Returns the /api/show response as a dict, or {} when it is unavailable.
+        """
+        if not model_name:
+            return {}
+        if model_name in self.model_details_cache:
+            return self.model_details_cache[model_name]
+        details = {}
+        try:
+            response = requests.post(f'{self.host}/api/show',
+                                     json={'model': model_name}, timeout=5)
+            if response.status_code == 200:
+                payload = response.json()
+                # Callers use .get(); ignore any non-object JSON payload.
+                details = payload if isinstance(payload, dict) else {}
+        except Exception as e:
+            # Likely transient (server down, timeout): do not cache the failure.
+            print(f"Error getting Ollama model details for {model_name}: {e}")
+            return {}
+        self.model_details_cache[model_name] = details
+        return details
+
+    def model_name_looks_like_vision(self, model_name):
+        """Guess vision support from the model name alone.
+
+        Heuristic fallback for older Ollama versions without capabilities;
+        each known name fragment is matched as a case-insensitive substring.
+        """
+        if not model_name:
+            return False
+
+        known_fragments = (
+            'vision', 'llava', 'bakllava', 'moondream',
+            'minicpm-v', 'minicpm-vision',
+            'qwen-vl', 'qwen2-vl', 'qwen2.5-vl', 'qwen3-vl',
+            'gemma3', 'gemma4',
+        )
+
+        lowered_name = model_name.lower()
+        for fragment in known_fragments:
+            if fragment in lowered_name:
+                return True
+        return False
+
+    def model_details_indicate_vision(self, model_name):
+        """Return True when Ollama's metadata for the model signals vision support"""
+        metadata = self.get_model_details(model_name)
+        advertised = metadata.get('capabilities', [])
+        # First signal: an explicit 'vision' entry in the capabilities list.
+        if isinstance(advertised, list) and 'vision' in advertised:
+            return True
+        # Second signal: a model_info key ending in '.vision.image_size'.
+        info = metadata.get('model_info', {})
+        if not isinstance(info, dict):
+            return False
+        return any(str(name).endswith('.vision.image_size') for name in info)
     
     def get_vision_models(self):
         """Get list of models that can handle images"""
         all_models = self.get_models()
-        # Common vision model patterns
-        vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream']
         vision_models = []
         
         for model in all_models:
-            model_lower = model.lower()
-            if any(pattern in model_lower for pattern in vision_patterns):
+            if self.is_vision_model(model):
                 vision_models.append(model)
         
         return vision_models
@@ -281,9 +338,11 @@ class OllamaInterface:
         """Check if a model can handle images"""
         if not model_name:
             return False
-        model_lower = model_name.lower()
-        vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream']
-        return any(pattern in model_lower for pattern in vision_patterns)
+
+        if self.model_details_indicate_vision(model_name):
+            return True
+        reported = self.get_model_details(model_name).get('capabilities')  # authoritative if present
+        return not reported and self.model_name_looks_like_vision(model_name)
     
     def is_available(self):
         """Check if Ollama is running and available"""
@@ -312,7 +371,7 @@ class OllamaInterface:
                 
                 # Check if the model can handle images
                 if not self.is_vision_model(model):
-                    return f"Error: Model '{model}' cannot process images. Please select a vision model like llava or llama3.2-vision in settings."
+                    return f"Error: Model '{model}' does not advertise vision support. Please select a vision-capable model in settings."
                 
                 # Encode image to base64
                 try:
@@ -482,30 +541,55 @@ class WindowContext:
             self.i3 = i3ipc.Connection()
         except (ConnectionError, FileNotFoundError, Exception) as e:
             self.i3 = None
-    
-    def get_focused_window_info(self):
-        """Get information about the currently focused window"""
+
+    def get_focused_window_id(self):
+        """Get the X11 window id for the currently focused i3 container"""
         if not self.i3:
-            return "Unable to connect to i3"
-        
+            return None
+
         try:
             tree = self.i3.get_tree()
             focused = tree.find_focused()
-            
+            if focused and focused.window:
+                return str(focused.window)
+        except Exception:
+            pass
+
+        return None
+
+    def get_window_info(self, window_id=None):
+        """Get information about a specific window, or the focused window"""
+        if not self.i3:
+            return "Unable to connect to i3"
+
+        try:
+            tree = self.i3.get_tree()
+            if window_id:
+                try:
+                    focused = tree.find_by_window(int(window_id))
+                except (TypeError, ValueError):
+                    focused = None
+            else:
+                focused = tree.find_focused()
+
             if not focused:
                 return "No focused window found"
-            
+
             info = {
                 'name': focused.name or 'Unknown',
                 'class': getattr(focused, 'window_class', 'Unknown'),
                 'title': getattr(focused, 'window_title', 'Unknown'),
                 'workspace': focused.workspace().name if focused.workspace() else 'Unknown'
             }
-            
+
             return f"Current application: {info['name']}\nWindow type: {info['class']}"
         except Exception as e:
             return f"Error getting window info: {str(e)}"
 
+    def get_focused_window_info(self):
+        """Get information about the currently focused window (get_window_info with no id)"""
+        return self.get_window_info()
+
 class AiAssistant(Gtk.Window):
     """Main AI Assistant window with accessibility features"""
     
@@ -518,6 +602,7 @@ class AiAssistant(Gtk.Window):
         self.codexInterface = CodexCliInterface()
         self.ollamaInterface = OllamaInterface(self.config.get('ollama_host'))
         self.windowContext = WindowContext()
+        self.launchWindowId = self.windowContext.get_focused_window_id()
         self.voiceRecognition = VoiceRecognition(self.config)
         
         # Voice mode state
@@ -1359,6 +1444,71 @@ class AiAssistant(Gtk.Window):
         if SystemCommands.is_command_available('play'):
             subprocess.run(['play', '-qnG', 'synth', '0.05', 'sin', '1200'],
                           capture_output=True)
+
+    def run_on_main_thread(self, callback):
+        """Schedule callback via GLib.idle_add and wait up to 2 seconds for it to run"""
+        # NOTE(review): presumably called from worker threads only; on the loop's
+        # own thread the wait would stall the loop until the timeout - confirm.
+        finished = threading.Event()
+        def run_once():
+            try:
+                callback()
+            finally:
+                finished.set()  # release the waiter even if callback raised
+            return False  # a false return removes the idle source after one run
+        GLib.idle_add(run_once)
+        finished.wait(timeout=2)
+
+    def get_target_window_context(self):
+        """Describe the launch window, falling back to the focused one if it is not found"""
+        launch_id = self.launchWindowId
+        if not launch_id:
+            return self.windowContext.get_focused_window_info()
+        described = self.windowContext.get_window_info(launch_id)
+        missing = described.startswith("No focused window found")
+        return self.windowContext.get_focused_window_info() if missing else described
+
+    def focus_launch_window_for_capture(self):
+        """Hide the assistant, then ask i3 to focus the launch window.
+
+        Returns True only when a launch window id exists and i3-msg exits with 0.
+        """
+        self.run_on_main_thread(self.hide)
+        time.sleep(0.2)  # pause after hiding, before touching focus
+        launch_id = self.launchWindowId
+        if not launch_id:
+            return False
+
+        focus_command = ['i3-msg', f'[id="{launch_id}"] focus']
+        try:
+            outcome = subprocess.run(focus_command, capture_output=True,
+                                     text=True, timeout=3)
+        except (subprocess.SubprocessError, FileNotFoundError, OSError):
+            return False
+        time.sleep(0.3)  # pause after the focus request, before the capture
+        return outcome.returncode == 0
+
+    def restore_assistant_after_capture(self):
+        """Bring the assistant window back once the screenshot has been taken"""
+        def reveal_window():
+            # show_all() then present(), executed together in one scheduled call
+            self.show_all()
+            self.present()
+        self.run_on_main_thread(reveal_window)
+
+    def capture_target_screenshot(self, screenshot_path):
+        """Run scrot into screenshot_path and return its CompletedProcess.
+
+        Adds scrot's -u flag when the launch window could be focused first;
+        the assistant is restored afterwards even if scrot raises.
+        """
+        use_focused_window = self.focus_launch_window_for_capture()
+        window_flag = ['-u'] if use_focused_window else []
+        scrot_command = ['scrot', *window_flag, screenshot_path]
+        try:
+            return subprocess.run(scrot_command, capture_output=True, text=True, timeout=10)
+        finally:
+            self.restore_assistant_after_capture()
     
     def send_ai_request(self, message, context=None, image_path=None):
         """Send request to selected AI provider"""
@@ -1459,7 +1609,7 @@ class AiAssistant(Gtk.Window):
             return
         
         def ask_with_context_in_thread():
-            context = self.windowContext.get_focused_window_info()
+            context = self.get_target_window_context()
             response = self.send_ai_request(question, context)
             GLib.idle_add(self.set_response_text, response)
         
@@ -1572,9 +1722,7 @@ class AiAssistant(Gtk.Window):
             screenshot_path = os.path.join(temp_dir, 'screenshot.png')
 
             try:
-                # Use scrot to take screenshot
-                result = subprocess.run(['scrot', screenshot_path],
-                                      capture_output=True, text=True, timeout=10)
+                result = self.capture_target_screenshot(screenshot_path)
 
                 if result.returncode != 0:
                     GLib.idle_add(self.set_response_text, "Error: Could not take screenshot")
@@ -1633,9 +1781,7 @@ class AiAssistant(Gtk.Window):
                     screenshot_path = os.path.join(temp_dir, 'screen_analysis.png')
 
                     try:
-                        # Take screenshot
-                        scrot_result = subprocess.run(['scrot', screenshot_path],
-                                                    capture_output=True, text=True, timeout=10)
+                        scrot_result = self.capture_target_screenshot(screenshot_path)
 
                         if scrot_result.returncode != 0:
                             GLib.idle_add(self.set_response_text, "Error: Could not capture screen content")
diff --git a/scripts/window_list.sh b/scripts/window_list.sh
index 1d3a764..a834bba 100755
--- a/scripts/window_list.sh
+++ b/scripts/window_list.sh
@@ -20,7 +20,7 @@ for con in i3.get_tree():
     if con.window and con.parent.type != "dockarea":
         print(con.window)
         print(con.name)')
-id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 "${windowList[@]}")"
+id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 --search-column 2 "${windowList[@]}")"
 if [[ -z "${id}" ]]; then
     exit 0
 fi