Improve AI screenshot context and window search
This commit is contained in:
committed by
Storm Dragon
parent
1a2252d8de
commit
a3a63eee4d
+164
-18
@@ -251,6 +251,7 @@ class OllamaInterface:
|
|||||||
|
|
||||||
def __init__(self, host='http://localhost:11434'):
|
def __init__(self, host='http://localhost:11434'):
|
||||||
self.host = host
|
self.host = host
|
||||||
|
self.model_details_cache = {}
|
||||||
|
|
||||||
def get_models(self):
|
def get_models(self):
|
||||||
"""Get list of available Ollama models"""
|
"""Get list of available Ollama models"""
|
||||||
@@ -263,16 +264,72 @@ class OllamaInterface:
|
|||||||
print(f"Error getting Ollama models: {e}")
|
print(f"Error getting Ollama models: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def get_model_details(self, model_name):
    """Return the metadata Ollama reports for ``model_name``.

    The answer is remembered per model name in ``self.model_details_cache``,
    so each name triggers at most one ``/api/show`` request per instance.

    Args:
        model_name: Name of the model to look up.

    Returns:
        The decoded JSON body of a 200 response, or an empty dict when the
        name is falsy, the server answers with another status, or the
        request/decoding raises.
    """
    # A falsy name is answered immediately and never stored in the cache.
    if not model_name:
        return {}

    cache = self.model_details_cache
    if model_name in cache:
        return cache[model_name]

    # Start from the "nothing known" answer; only a 200 reply replaces it.
    details = {}
    try:
        response = requests.post(
            f'{self.host}/api/show',
            json={'model': model_name},
            timeout=5
        )
        if response.status_code == 200:
            details = response.json()
    except Exception as e:
        print(f"Error getting Ollama model details for {model_name}: {e}")

    # Failures are cached as well, so a failing lookup is not retried.
    cache[model_name] = details
    return details
|
||||||
|
|
||||||
|
def model_name_looks_like_vision(self, model_name):
    """Guess from the name alone whether a model accepts images.

    Used as a fallback when Ollama's metadata does not say whether the
    model supports vision.

    Args:
        model_name: Model name to inspect; may be empty or None.

    Returns:
        True when the lower-cased name contains one of the known vision
        markers, otherwise False (including for a falsy name).
    """
    if not model_name:
        return False

    known_markers = (
        'vision',
        'llava',
        'bakllava',
        'moondream',
        'minicpm-v',
        'minicpm-vision',
        'qwen-vl',
        'qwen2-vl',
        'qwen2.5-vl',
        'qwen3-vl',
        'gemma3',
        'gemma4',
    )

    # Substring match, so tags and namespaces around the name do not matter.
    lowered = model_name.lower()
    for marker in known_markers:
        if marker in lowered:
            return True
    return False
|
||||||
|
|
||||||
|
def model_details_indicate_vision(self, model_name):
    """Tell whether Ollama's metadata marks a model as vision-capable.

    Two signals are checked in order on the dict returned by
    ``self.get_model_details``: a ``capabilities`` list containing
    ``'vision'``, then any ``model_info`` key ending in
    ``.vision.image_size``.

    Args:
        model_name: Name of the model whose metadata is inspected.

    Returns:
        True when either signal is present, otherwise False.
    """
    details = self.get_model_details(model_name)

    # Explicit capability list, when the server provides one.
    advertised = details.get('capabilities', [])
    if isinstance(advertised, list) and 'vision' in advertised:
        return True

    # Otherwise look for a vision image-size entry in the model info.
    info = details.get('model_info', {})
    if not isinstance(info, dict):
        return False

    for key in info:
        if str(key).endswith('.vision.image_size'):
            return True
    return False
|
||||||
|
|
||||||
def get_vision_models(self):
|
def get_vision_models(self):
|
||||||
"""Get list of models that can handle images"""
|
"""Get list of models that can handle images"""
|
||||||
all_models = self.get_models()
|
all_models = self.get_models()
|
||||||
# Common vision model patterns
|
|
||||||
vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream']
|
|
||||||
vision_models = []
|
vision_models = []
|
||||||
|
|
||||||
for model in all_models:
|
for model in all_models:
|
||||||
model_lower = model.lower()
|
if self.is_vision_model(model):
|
||||||
if any(pattern in model_lower for pattern in vision_patterns):
|
|
||||||
vision_models.append(model)
|
vision_models.append(model)
|
||||||
|
|
||||||
return vision_models
|
return vision_models
|
||||||
@@ -281,9 +338,11 @@ class OllamaInterface:
|
|||||||
"""Check if a model can handle images"""
|
"""Check if a model can handle images"""
|
||||||
if not model_name:
|
if not model_name:
|
||||||
return False
|
return False
|
||||||
model_lower = model_name.lower()
|
|
||||||
vision_patterns = ['llava', 'llama3.2-vision', 'minicpm-v', 'bakllava', 'moondream']
|
return (
|
||||||
return any(pattern in model_lower for pattern in vision_patterns)
|
self.model_details_indicate_vision(model_name)
|
||||||
|
or self.model_name_looks_like_vision(model_name)
|
||||||
|
)
|
||||||
|
|
||||||
def is_available(self):
|
def is_available(self):
|
||||||
"""Check if Ollama is running and available"""
|
"""Check if Ollama is running and available"""
|
||||||
@@ -312,7 +371,7 @@ class OllamaInterface:
|
|||||||
|
|
||||||
# Check if the model can handle images
|
# Check if the model can handle images
|
||||||
if not self.is_vision_model(model):
|
if not self.is_vision_model(model):
|
||||||
return f"Error: Model '{model}' cannot process images. Please select a vision model like llava or llama3.2-vision in settings."
|
return f"Error: Model '{model}' does not advertise vision support. Please select a vision-capable model in settings."
|
||||||
|
|
||||||
# Encode image to base64
|
# Encode image to base64
|
||||||
try:
|
try:
|
||||||
@@ -483,14 +542,35 @@ class WindowContext:
|
|||||||
except (ConnectionError, FileNotFoundError, Exception) as e:
|
except (ConnectionError, FileNotFoundError, Exception) as e:
|
||||||
self.i3 = None
|
self.i3 = None
|
||||||
|
|
||||||
def get_focused_window_info(self):
|
def get_focused_window_id(self):
    """Get the X11 window id for the currently focused i3 container.

    Returns:
        The focused container's ``window`` value as a string, or None when
        there is no i3 connection, no focused container with a window, or
        the i3 query raises.
    """
    if not self.i3:
        return None

    try:
        focused = self.i3.get_tree().find_focused()
        if focused and focused.window:
            return str(focused.window)
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, but
        # still fall through to None so callers keep working.
        print(f"Error getting focused window id: {e}")

    return None
|
||||||
|
|
||||||
|
def get_window_info(self, window_id=None):
|
||||||
|
"""Get information about a specific window, or the focused window"""
|
||||||
if not self.i3:
|
if not self.i3:
|
||||||
return "Unable to connect to i3"
|
return "Unable to connect to i3"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tree = self.i3.get_tree()
|
tree = self.i3.get_tree()
|
||||||
focused = tree.find_focused()
|
if window_id:
|
||||||
|
try:
|
||||||
|
focused = tree.find_by_window(int(window_id))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
focused = None
|
||||||
|
else:
|
||||||
|
focused = tree.find_focused()
|
||||||
|
|
||||||
if not focused:
|
if not focused:
|
||||||
return "No focused window found"
|
return "No focused window found"
|
||||||
@@ -506,6 +586,10 @@ class WindowContext:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error getting window info: {str(e)}"
|
return f"Error getting window info: {str(e)}"
|
||||||
|
|
||||||
|
def get_focused_window_info(self):
    """Describe the window that currently has focus.

    Returns:
        Whatever ``get_window_info`` returns when no window id is given.
    """
    # No explicit id: get_window_info then looks up the focused container.
    return self.get_window_info(window_id=None)
|
||||||
|
|
||||||
class AiAssistant(Gtk.Window):
|
class AiAssistant(Gtk.Window):
|
||||||
"""Main AI Assistant window with accessibility features"""
|
"""Main AI Assistant window with accessibility features"""
|
||||||
|
|
||||||
@@ -518,6 +602,7 @@ class AiAssistant(Gtk.Window):
|
|||||||
self.codexInterface = CodexCliInterface()
|
self.codexInterface = CodexCliInterface()
|
||||||
self.ollamaInterface = OllamaInterface(self.config.get('ollama_host'))
|
self.ollamaInterface = OllamaInterface(self.config.get('ollama_host'))
|
||||||
self.windowContext = WindowContext()
|
self.windowContext = WindowContext()
|
||||||
|
self.launchWindowId = self.windowContext.get_focused_window_id()
|
||||||
self.voiceRecognition = VoiceRecognition(self.config)
|
self.voiceRecognition = VoiceRecognition(self.config)
|
||||||
|
|
||||||
# Voice mode state
|
# Voice mode state
|
||||||
@@ -1360,6 +1445,71 @@ class AiAssistant(Gtk.Window):
|
|||||||
subprocess.run(['play', '-qnG', 'synth', '0.05', 'sin', '1200'],
|
subprocess.run(['play', '-qnG', 'synth', '0.05', 'sin', '1200'],
|
||||||
capture_output=True)
|
capture_output=True)
|
||||||
|
|
||||||
|
def run_on_main_thread(self, callback):
    """Run a GTK operation on the main thread and wait briefly for it.

    From a worker thread the callback is queued with ``GLib.idle_add`` and
    this method blocks for up to two seconds until it has run. If the wait
    times out, the callback stays queued and this method returns anyway.

    From the main thread the callback is invoked directly, and any
    exception it raises propagates to the caller.

    Args:
        callback: Zero-argument callable performing the GTK work.
    """
    # Queueing an idle callback and then blocking the thread that must
    # dispatch it would stall for the full timeout and run the callback
    # only afterwards, so run it in place instead.
    # NOTE(review): assumes the GLib main loop runs on Python's main
    # thread - confirm against how the application starts Gtk.
    if threading.current_thread() is threading.main_thread():
        callback()
        return

    done = threading.Event()

    def wrapper():
        try:
            callback()
        finally:
            # Release the waiting thread even if the callback raised.
            done.set()
        # Returning False removes the idle source so it runs only once.
        return False

    GLib.idle_add(wrapper)
    done.wait(timeout=2)
|
||||||
|
|
||||||
|
def get_target_window_context(self):
    """Describe the window recorded in ``self.launchWindowId``.

    Falls back to the currently focused window when no launch window id
    was recorded, or when looking that id up reports
    "No focused window found".

    Returns:
        The text produced by ``self.windowContext`` for the chosen window.
    """
    launch_id = self.launchWindowId
    if not launch_id:
        return self.windowContext.get_focused_window_info()

    launch_info = self.windowContext.get_window_info(launch_id)
    if launch_info.startswith("No focused window found"):
        # The recorded window could not be found; use the focused one.
        return self.windowContext.get_focused_window_info()

    return launch_info
|
||||||
|
|
||||||
|
def focus_launch_window_for_capture(self):
    """Hide the assistant, then ask i3 to focus the launch window.

    The assistant window is hidden via ``run_on_main_thread`` in every
    case. Short sleeps follow the hide and the focus command before the
    method returns.

    Returns:
        True when ``i3-msg`` exits with status 0. False when no launch
        window id is recorded, the command fails or times out, or
        ``i3-msg`` cannot be started.
    """
    self.run_on_main_thread(self.hide)
    time.sleep(0.2)

    target_id = self.launchWindowId
    if not target_id:
        return False

    # i3 criteria syntax: select the container by its X11 window id.
    focus_command = ['i3-msg', f'[id="{target_id}"] focus']
    try:
        outcome = subprocess.run(
            focus_command,
            capture_output=True,
            text=True,
            timeout=3
        )
    except (subprocess.SubprocessError, FileNotFoundError, OSError):
        return False

    time.sleep(0.3)
    return outcome.returncode == 0
|
||||||
|
|
||||||
|
def restore_assistant_after_capture(self):
    """Bring the assistant window back once a screenshot has been taken.

    The window is shown and then presented, both inside a single callback
    handed to ``run_on_main_thread``.
    """
    def show_and_present():
        # Order matters: make the window visible first, then present it.
        self.show_all()
        self.present()

    self.run_on_main_thread(show_and_present)
|
||||||
|
|
||||||
|
def capture_target_screenshot(self, screenshot_path):
    """Take a screenshot with ``scrot`` and write it to ``screenshot_path``.

    When the launch window could be focused, ``scrot`` is run with ``-u``;
    otherwise it is run with the path alone. The assistant window is
    restored afterwards whether or not ``scrot`` succeeds.

    Args:
        screenshot_path: Destination file passed to ``scrot``.

    Returns:
        The ``subprocess.CompletedProcess`` from the ``scrot`` call.
    """
    launch_window_focused = self.focus_launch_window_for_capture()

    try:
        if launch_window_focused:
            scrot_command = ['scrot', '-u', screenshot_path]
        else:
            scrot_command = ['scrot', screenshot_path]

        return subprocess.run(scrot_command, capture_output=True, text=True, timeout=10)
    finally:
        # Always bring the assistant back, even if scrot raised.
        self.restore_assistant_after_capture()
|
||||||
|
|
||||||
def send_ai_request(self, message, context=None, image_path=None):
|
def send_ai_request(self, message, context=None, image_path=None):
|
||||||
"""Send request to selected AI provider"""
|
"""Send request to selected AI provider"""
|
||||||
provider = self.config.get('provider')
|
provider = self.config.get('provider')
|
||||||
@@ -1459,7 +1609,7 @@ class AiAssistant(Gtk.Window):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def ask_with_context_in_thread():
|
def ask_with_context_in_thread():
|
||||||
context = self.windowContext.get_focused_window_info()
|
context = self.get_target_window_context()
|
||||||
response = self.send_ai_request(question, context)
|
response = self.send_ai_request(question, context)
|
||||||
GLib.idle_add(self.set_response_text, response)
|
GLib.idle_add(self.set_response_text, response)
|
||||||
|
|
||||||
@@ -1572,9 +1722,7 @@ class AiAssistant(Gtk.Window):
|
|||||||
screenshot_path = os.path.join(temp_dir, 'screenshot.png')
|
screenshot_path = os.path.join(temp_dir, 'screenshot.png')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use scrot to take screenshot
|
result = self.capture_target_screenshot(screenshot_path)
|
||||||
result = subprocess.run(['scrot', screenshot_path],
|
|
||||||
capture_output=True, text=True, timeout=10)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
GLib.idle_add(self.set_response_text, "Error: Could not take screenshot")
|
GLib.idle_add(self.set_response_text, "Error: Could not take screenshot")
|
||||||
@@ -1633,9 +1781,7 @@ class AiAssistant(Gtk.Window):
|
|||||||
screenshot_path = os.path.join(temp_dir, 'screen_analysis.png')
|
screenshot_path = os.path.join(temp_dir, 'screen_analysis.png')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Take screenshot
|
scrot_result = self.capture_target_screenshot(screenshot_path)
|
||||||
scrot_result = subprocess.run(['scrot', screenshot_path],
|
|
||||||
capture_output=True, text=True, timeout=10)
|
|
||||||
|
|
||||||
if scrot_result.returncode != 0:
|
if scrot_result.returncode != 0:
|
||||||
GLib.idle_add(self.set_response_text, "Error: Could not capture screen content")
|
GLib.idle_add(self.set_response_text, "Error: Could not capture screen content")
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ for con in i3.get_tree():
|
|||||||
if con.window and con.parent.type != "dockarea":
|
if con.window and con.parent.type != "dockarea":
|
||||||
print(con.window)
|
print(con.window)
|
||||||
print(con.name)')
|
print(con.name)')
|
||||||
id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 "${windowList[@]}")"
|
id="$(yad --title "I38" --list --separator "" --column "id" --column "Select Window" --hide-column 1 --print-column 1 --search-column 2 "${windowList[@]}")"
|
||||||
if [[ -z "${id}" ]]; then
|
if [[ -z "${id}" ]]; then
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user