Removed ocrdesktop dependency. I38 now handles OCR with a much smaller, simpler and faster program.

2025-05-12 06:07:03 -04:00
parent 09e21e297c
commit 65e6dcf980
4 changed files with 156 additions and 6 deletions
--- a/I38.md
+++ b/I38.md
@@ -151,7 +151,7 @@ If you've enabled braille display support during setup, I38 will start XBrlAPI a

 ### OCR (Optical Character Recognition)

-If installed, you can use OCR to read text from images or inaccessible applications:
+If required dependencies are installed, you can use OCR to read text from images or inaccessible applications:

 - `MODKEY` + `F5`: Perform OCR on the entire screen and speak the content
 - In Ratpoison mode: `Print` or `MODKEY` + `r`: Perform OCR and save to clipboard
--- a/README.md
+++ b/README.md
@@ -25,15 +25,19 @@ An uppercase I looks like a 1, 3 from i3, and 8 because the song [We Are 138](ht
 - lxsession: [optional] For GUI power options like shutdown
 - magic-wormhole: [optional] for file sharing with magic-wormhole GUI
 - notification-daemon: To handle notifications
- ocrdesktop: For getting contents of the current window with OCR.
 - pamixer: for the mute-unmute script
 - pandoc or markdown: To generate html files.
 - pcmanfm: [optional] Graphical file manager.
 - playerctl: music controls
 - python-gobject: for applications menu.
 - python-i3ipc: for sounds etc.
+- python-pillow: For OCR
+- python-pytesseract: For OCR
 - remind: [optional] For reminder notifications, Requires notify-daemon and notify-send for automatic reminders.
+- scrot: For OCR
 - sox: for sounds.
+- tesseract: For OCR
+- tesseract-data-eng: For OCR
 - transfersh: [optional] for file sharing GUI
 - udiskie: [optional] for automatically mounting removable storage
 - x11bell: [optional] Bell support if you do not have a PC speaker. Available from https://github.com/jovanlanik/x11bell
--- a/i38.sh
+++ b/i38.sh
@@ -576,10 +576,8 @@ bindsym $mod+Shift+BackSpace mode "default"

 EOF

-# ocrdesktop through speech-dispatcher
-if command -v ocrdesktop &> /dev/null ; then
-    echo "bindsym ${mod}+F5 exec bash -c 'spd-say -Cw \"performing O C R\" && ocrdesktop -cnog | spd-say -e --'"  >> ${i3Path}/config
-fi
+# OCR through speech-dispatcher.
+# Only bind the key when the external tools scripts/ocr.py shells out to are installed,
+# so F5 is not bound to a script that cannot run.
+if command -v scrot &> /dev/null && command -v tesseract &> /dev/null ; then
+    echo "bindsym ${mod}+F5 exec ${i3Path}/scripts/ocr.py" >> ${i3Path}/config
+fi
 # Interrupt speech-dispatcher output
 echo "bindsym ${mod}+Shift+F5 exec spd-say -C" >> ${i3Path}/config

--- a/scripts/ocr.py
+++ b/scripts/ocr.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# This file is part of I38.
+
+# I38 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later version.
+
+# I38 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+# PURPOSE. See the GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License along with I38. If not, see <https://www.gnu.org/licenses/>.
+
+
+"""
+Simple OCR Screen Reader
+A lightweight tool that performs OCR on the screen and speaks the results
+"""
+
+import os
+import sys
+import time
+import subprocess
+from PIL import Image, ImageOps
+import pytesseract
+
+def capture_screen(max_retries=3, initial_delay=0.2):
+    """
+    Capture the screen using scrot and return it as a fully loaded image
+
+    The screenshot is written into a private temporary directory, so parallel
+    invocations cannot clobber each other and no stale or pre-planted file at a
+    predictable /tmp path can be picked up. The pixel data is copied into memory
+    before the directory is removed, so the returned image does not depend on
+    the deleted file or keep a file handle open.
+
+    Args:
+        max_retries: Maximum number of attempts to read the image
+        initial_delay: Delay in seconds before the second attempt
+                       (doubles after every further failed attempt)
+
+    Returns:
+        An in-memory PIL image of the screen
+
+    Raises:
+        subprocess.CalledProcessError: scrot exited with a non-zero status
+        OSError: scrot could not be started or the temporary directory failed
+        RuntimeError: no valid screenshot could be read after max_retries attempts
+    """
+    # Local import keeps this function self-contained with respect to the
+    # module-level import block.
+    import tempfile
+
+    failure = f"Screenshot file not created properly after {max_retries} attempts"
+    cause = None
+
+    try:
+        with tempfile.TemporaryDirectory(prefix="i38-ocr-") as temp_dir:
+            temp_file = os.path.join(temp_dir, "capture.png")
+
+            # subprocess.run() only returns once scrot has exited
+            subprocess.run(["scrot", temp_file], check=True)
+
+            delay = initial_delay
+            for attempt in range(max_retries):
+                # The first attempt is immediate; only back off after a failure
+                if attempt > 0:
+                    time.sleep(delay)
+                    delay *= 2
+
+                # File doesn't exist or is empty
+                if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
+                    failure = f"Screenshot file not created properly after {max_retries} attempts"
+                    cause = None
+                    continue
+
+                try:
+                    with Image.open(temp_file) as img:
+                        # copy() decodes the whole file, which both validates the
+                        # image data and detaches the result from the file on disk
+                        return img.copy()
+                except OSError as e:
+                    # Image exists but isn't valid (yet)
+                    failure = f"Image file exists but is not valid after {max_retries} attempts"
+                    cause = e
+
+            raise RuntimeError(failure) from cause
+
+    except Exception as e:
+        print(f"Error capturing screen: {e}")
+        raise
+
+def process_image(img, scale_factor=1.5):
+    """
+    Process the image to improve OCR accuracy
+
+    Args:
+        img: PIL image to prepare
+        scale_factor: Factor both dimensions are multiplied by (1 disables scaling)
+
+    Returns:
+        A grayscale, optionally rescaled, contrast-stretched PIL image
+    """
+    # Convert to grayscale first so the (comparatively expensive) resize only
+    # has to resample a single channel
+    img = ImageOps.grayscale(img)
+
+    # Scale the image to improve OCR
+    if scale_factor != 1:
+        # Image.Resampling was introduced in Pillow 9.1; older releases expose
+        # the filter constants directly on the Image module
+        resampling = getattr(Image, "Resampling", Image)
+        width, height = img.size
+        # Never ask for a zero-sized image, which would be useless for OCR
+        new_size = (max(1, int(width * scale_factor)), max(1, int(height * scale_factor)))
+        img = img.resize(new_size, resampling.BICUBIC)
+
+    # Improve contrast for better text recognition
+    img = ImageOps.autocontrast(img)
+
+    return img
+
+def perform_ocr(img, lang='eng'):
+    """Run tesseract on the image and return the recognised text"""
+    # Tesseract settings:
+    #   --oem 1  use the LSTM OCR engine
+    #   --psm 6  assume a single uniform block of text
+    tesseract_config = '--oem 1 --psm 6'
+    return pytesseract.image_to_string(img, lang=lang, config=tesseract_config)
+
+def speak_text(text):
+    """
+    Speak the text using speech-dispatcher
+
+    Blank lines are dropped and the remaining lines are joined with spaces.
+    When nothing is left, "No text detected" is spoken instead.
+
+    Args:
+        text: Raw text as returned by the OCR step
+    """
+    # Filter out empty lines and clean up the text
+    lines = [line.strip() for line in text.split('\n') if line.strip()]
+    cleaned_text = ' '.join(lines)
+
+    # "--" ends option parsing: OCR output frequently starts with "-" and would
+    # otherwise be interpreted by spd-say as command line options
+    message = cleaned_text if cleaned_text else "No text detected"
+    subprocess.run(["spd-say", "-Cw", "--", message])
+
+def main():
+    """
+    Announce, capture the screen, run OCR on it and speak the result
+
+    Any failure is reported on stderr and announced as "OCR failed" through
+    speech-dispatcher (best effort); the process then exits with status 1.
+    """
+    # Limit tesseract thread usage to improve performance on Pi, but respect a
+    # value the user has already set in the environment
+    os.environ.setdefault("OMP_THREAD_LIMIT", "4")
+
+    try:
+        # Announce start
+        subprocess.run(["spd-say", "-Cw", "performing OCR"])
+
+        # Capture screen
+        img = capture_screen()
+
+        # Process image
+        processed_img = process_image(img, scale_factor=1.5)
+
+        # Perform OCR
+        text = perform_ocr(processed_img)
+
+        # Speak the results
+        speak_text(text)
+
+    except Exception as e:
+        # Let the user know something went wrong
+        error_msg = f"Error during OCR: {str(e)}"
+        print(error_msg, file=sys.stderr)
+        try:
+            subprocess.run(["spd-say", "-Cw", "OCR failed"])
+        except OSError:
+            # spd-say itself could not be started; the message on stderr is
+            # all the feedback we can give
+            pass
+        # Signal the failure to whatever launched the script
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()