From 65e6dcf98088fe60d8847cc0c38aeca7614c1ecc Mon Sep 17 00:00:00 2001 From: Storm Dragon Date: Mon, 12 May 2025 06:07:03 -0400 Subject: [PATCH] Removed ocrdesktop dependency. I38 now handles OCR with a much smaller, simpler and faster program. --- I38.md | 2 +- README.md | 6 +- i38.sh | 6 +- scripts/ocr.py | 148 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 6 deletions(-) create mode 100755 scripts/ocr.py diff --git a/I38.md b/I38.md index b02ec1e..d2b7f50 100644 --- a/I38.md +++ b/I38.md @@ -151,7 +151,7 @@ If you've enabled braille display support during setup, I38 will start XBrlAPI a ### OCR (Optical Character Recognition) -If installed, you can use OCR to read text from images or inaccessible applications: +If required dependencies are installed, you can use OCR to read text from images or inaccessible applications: - `MODKEY` + `F5`: Perform OCR on the entire screen and speak the content - In Ratpoison mode: `Print` or `MODKEY` + `r`: Perform OCR and save to clipboard diff --git a/README.md b/README.md index d0f1830..aa182a6 100644 --- a/README.md +++ b/README.md @@ -25,15 +25,19 @@ An uppercase I looks like a 1, 3 from i3, and 8 because the song [We Are 138](ht - lxsession: [optional] For GUI power options like shutdown - magic-wormhole: [optional] for file sharing with magic-wormhole GUI - notification-daemon: To handle notifications -- ocrdesktop: For getting contents of the current window with OCR. - pamixer: for the mute-unmute script - pandoc or markdown: To generate html files. - pcmanfm: [optional] Graphical file manager. - playerctl: music controls - python-gobject: for applications menu. - python-i3ipc: for sounds etc. +- python-pillow: For OCR +- python-pytesseract: For OCR - remind: [optional] For reminder notifications, Requires notify-daemon and notify-send for automatic reminders. +- scrot: For OCR - sox: for sounds. 
+- tesseract: For OCR +- tesseract-data-eng: For OCR - transfersh: [optional] for file sharing GUI - udiskie: [optional] for automatically mounting removable storage - x11bell: [optional] Bell support if you do not have a PC speaker. Available from https://github.com/jovanlanik/x11bell diff --git a/i38.sh b/i38.sh index b651055..5fb7739 100755 --- a/i38.sh +++ b/i38.sh @@ -576,10 +576,8 @@ bindsym $mod+Shift+BackSpace mode "default" EOF -# ocrdesktop through speech-dispatcher -if command -v ocrdesktop &> /dev/null ; then - echo "bindsym ${mod}+F5 exec bash -c 'spd-say -Cw \"performing O C R\" && ocrdesktop -cnog | spd-say -e --'" >> ${i3Path}/config -fi +# ocr through speech-dispatcher + echo "bindsym ${mod}+F5 exec ${i3Path}/scripts/ocr.py" >> ${i3Path}/config # Interrupt speech-dispatcher output echo "bindsym ${mod}+Shift+F5 exec spd-say -C" >> ${i3Path}/config diff --git a/scripts/ocr.py b/scripts/ocr.py new file mode 100755 index 0000000..b69c10c --- /dev/null +++ b/scripts/ocr.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This file is part of I38. + +# I38 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, +# either version 3 of the License, or (at your option) any later version. + +# I38 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +# PURPOSE. See the GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License along with I38. If not, see https://www.gnu.org/licenses/. 
"""
Simple OCR Screen Reader
A lightweight tool that performs OCR on the screen and speaks the results
"""

import os
import subprocess
import sys
import tempfile
import time

# The OCR libraries are optional at import time: the key binding that starts
# this script is installed unconditionally, so a missing module must end in a
# spoken error message instead of a silent traceback.
try:
    from PIL import Image, ImageOps
    import pytesseract
except ImportError as import_error:
    Image = ImageOps = pytesseract = None
    _IMPORT_ERROR = import_error
else:
    _IMPORT_ERROR = None


def require_dependencies():
    """
    Ensure Pillow and pytesseract were imported successfully

    Raises:
        RuntimeError: If either module could not be imported
    """
    if _IMPORT_ERROR is not None:
        raise RuntimeError(
            f"Missing OCR dependency: {_IMPORT_ERROR}"
        ) from _IMPORT_ERROR


def speech_command(message):
    """
    Build the spd-say command line that speaks a message

    Args:
        message: The text to speak

    Returns:
        The argument list to pass to subprocess.run
    """
    # "--" ends option parsing, so recognised text that happens to start
    # with "-" is spoken instead of being interpreted as spd-say options.
    return ["spd-say", "-Cw", "--", message]


def say(message):
    """Speak a message using speech-dispatcher"""
    subprocess.run(speech_command(message))


def clean_text(text):
    """
    Collapse OCR output into a single line suitable for speech

    Args:
        text: Raw text as returned by the OCR engine

    Returns:
        The non-empty lines, stripped of surrounding whitespace and joined
        with single spaces; an empty string if nothing is left
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    return ' '.join(line for line in stripped_lines if line)


def capture_screen(max_retries=3, initial_delay=0.2):
    """
    Capture the screen using scrot with robust checking and retries

    Args:
        max_retries: Maximum number of attempts to read the image
        initial_delay: Delay in seconds before the first retry (doubles
            with every further retry)

    Returns:
        A fully loaded PIL image that no longer depends on any file

    Raises:
        ValueError: If max_retries is smaller than 1
        RuntimeError: If dependencies are missing or the screenshot could
            not be read after max_retries attempts
        subprocess.CalledProcessError: If scrot exits with an error
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")
    require_dependencies()

    try:
        # A private temporary directory avoids a predictable, shared /tmp
        # path and guarantees the target file does not exist beforehand.
        # The directory and its content are removed when the block exits.
        with tempfile.TemporaryDirectory(prefix="i38-ocr-") as temp_dir:
            temp_file = os.path.join(temp_dir, "capture.png")

            # Capture the screen; run() returns only after scrot has exited
            subprocess.run(["scrot", temp_file], check=True)

            delay = initial_delay
            last_error = None
            for attempt in range(max_retries):
                if attempt:
                    # Only wait between attempts, backing off exponentially
                    time.sleep(delay)
                    delay *= 2
                try:
                    with Image.open(temp_file) as img:
                        # copy() decodes the whole image into memory, which
                        # both validates the data and detaches the result
                        # from the file that is about to be deleted.
                        return img.copy()
                except OSError as error:
                    # Missing, empty, unidentified or truncated file
                    last_error = error

            raise RuntimeError(
                f"Screenshot could not be read after {max_retries} attempts"
            ) from last_error
    except Exception as e:
        print(f"Error capturing screen: {e}", file=sys.stderr)
        raise


def process_image(img, scale_factor=1.5):
    """
    Process the image to improve OCR accuracy

    Args:
        img: The PIL image to process
        scale_factor: Factor by which the image is enlarged; 1 disables
            scaling

    Returns:
        A scaled, grayscale, contrast-stretched PIL image

    Raises:
        ValueError: If scale_factor is not positive
        RuntimeError: If dependencies are missing
    """
    require_dependencies()
    if scale_factor <= 0:
        raise ValueError("scale_factor must be positive")

    # Convert to grayscale first so the resize below only has to process a
    # single channel instead of every colour channel.
    img = ImageOps.grayscale(img)

    # Scale the image to improve OCR
    if scale_factor != 1:
        width, height = img.size
        new_size = (max(1, int(width * scale_factor)),
                    max(1, int(height * scale_factor)))
        # Image.Resampling only exists in newer Pillow releases; older ones
        # expose the same filter directly on the Image module.
        bicubic = getattr(Image, "Resampling", Image).BICUBIC
        img = img.resize(new_size, bicubic)

    # Improve contrast for better text recognition
    return ImageOps.autocontrast(img)


def perform_ocr(img, lang='eng'):
    """
    Perform OCR on the image

    Args:
        img: The PIL image to recognise
        lang: Tesseract language code to use

    Returns:
        The recognised text

    Raises:
        RuntimeError: If dependencies are missing
    """
    require_dependencies()
    # Use tesseract with optimized settings
    # --oem 1: Use LSTM OCR Engine
    # --psm 6: Assume a single uniform block of text
    return pytesseract.image_to_string(img, lang=lang, config='--oem 1 --psm 6')


def speak_text(text):
    """
    Speak the text using speech-dispatcher

    Args:
        text: Raw OCR text; empty lines and surrounding whitespace are
            dropped before speaking
    """
    say(clean_text(text) or "No text detected")


def main():
    """
    Capture the screen, run OCR on it and speak the result

    Returns:
        0 on success, 1 if any step failed
    """
    # Limit tesseract thread usage to improve performance on Pi, unless the
    # user already configured a limit in the environment.
    os.environ.setdefault("OMP_THREAD_LIMIT", "4")

    try:
        # Fail early, with spoken feedback, when a module is not installed
        require_dependencies()

        # Announce start
        say("performing OCR")

        # Capture screen
        img = capture_screen()

        # Process image
        processed_img = process_image(img, scale_factor=1.5)

        # Perform OCR
        text = perform_ocr(processed_img)

        # Speak the results
        speak_text(text)

    except Exception as e:
        # Let the user know something went wrong
        print(f"Error during OCR: {e}", file=sys.stderr)
        try:
            say("OCR failed")
        except OSError:
            # If even speech fails, at least we tried
            pass
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())