I38/scripts/ocr.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This file is part of I38.

# I38 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.

# I38 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with I38. If not, see <https://www.gnu.org/licenses/>.


"""
Simple OCR Screen Reader
A lightweight tool that performs OCR on the screen and speaks the results
"""

import os
import subprocess
import sys
import tempfile
import time

from PIL import Image, ImageOps
import pytesseract

def capture_screen(max_retries=3, initial_delay=0.2):
    """
    Capture the screen using scrot and return it as an in-memory image

    The screenshot is written into a private temporary directory, decoded
    completely, and the directory is removed before this function returns,
    so the returned image never depends on a file that no longer exists.

    Args:
        max_retries: Maximum number of attempts to read the image
        initial_delay: Initial delay in seconds (doubles after each failed attempt)

    Returns:
        A PIL image whose pixel data is fully loaded into memory.

    Raises:
        subprocess.CalledProcessError: If scrot exits with a non-zero status.
        RuntimeError: If no valid screenshot could be read after
            max_retries attempts (including when max_retries is 0 or less).
    """
    try:
        # A freshly created private directory replaces the fixed /tmp path:
        # the file name cannot be predicted or pre-created by another user,
        # and two simultaneous runs cannot clobber each other's capture.
        # The directory and its contents are removed when the block exits.
        with tempfile.TemporaryDirectory(prefix="ocr_capture_") as temp_dir:
            temp_file = os.path.join(temp_dir, "capture.png")

            # Capture the screen
            subprocess.run(["scrot", temp_file], check=True)

            delay = initial_delay
            failure = "Screenshot file not created properly"
            last_error = None
            for _ in range(max_retries):
                time.sleep(delay)

                if os.path.exists(temp_file) and os.path.getsize(temp_file) > 0:
                    try:
                        with Image.open(temp_file) as img:
                            # Image.open() is lazy; copy() decodes every pixel,
                            # which both validates the file and detaches the
                            # result from it before the file is deleted.
                            return img.copy()
                    except OSError as e:
                        # File exists but is truncated or not a valid image yet
                        failure = "Image file exists but is not valid"
                        last_error = e
                else:
                    # File doesn't exist or is empty
                    failure = "Screenshot file not created properly"
                    last_error = None

                # Back off exponentially before the next attempt
                delay *= 2

            raise RuntimeError(f"{failure} after {max_retries} attempts") from last_error

    except Exception as e:
        print(f"Error capturing screen: {e}")
        raise

def process_image(img, scale_factor=1.5):
    """
    Process the image to improve OCR accuracy

    Args:
        img: PIL image to prepare for OCR
        scale_factor: Multiplier applied to both width and height; a value
            of 1 skips resizing

    Returns:
        A new grayscale, contrast-stretched PIL image.
    """
    # Scale the image to improve OCR
    if scale_factor != 1:
        width, height = img.size
        # Pillow 9.1+ groups the filters under Image.Resampling; older
        # releases only provide the module-level constants. Falling back to
        # the module keeps this working on both.
        resampling = getattr(Image, "Resampling", Image)
        new_size = (int(width * scale_factor), int(height * scale_factor))
        img = img.resize(new_size, resampling.BICUBIC)

    # Convert to grayscale for faster processing
    img = ImageOps.grayscale(img)

    # Improve contrast for better text recognition
    img = ImageOps.autocontrast(img)

    return img

def perform_ocr(img, lang='eng'):
    """
    Run Tesseract on the image and return the recognized text

    Args:
        img: Image to recognize, passed straight to pytesseract
        lang: Tesseract language code

    Returns:
        The text reported by pytesseract.image_to_string.
    """
    # Tesseract options:
    #   --oem 1  use the LSTM OCR engine
    #   --psm 6  assume a single uniform block of text
    tesseract_config = '--oem 1 --psm 6'
    return pytesseract.image_to_string(img, lang=lang, config=tesseract_config)

def speak_text(text):
    """
    Speak the text using speech-dispatcher

    Blank lines are dropped and the remaining lines are joined with single
    spaces. If nothing is left, "No text detected" is spoken instead.

    Args:
        text: Raw OCR output; None is treated like an empty string
    """
    # Filter out empty lines and clean up the text
    stripped_lines = (line.strip() for line in (text or "").split('\n'))
    cleaned_text = ' '.join(line for line in stripped_lines if line)

    message = cleaned_text if cleaned_text else "No text detected"

    # "--" ends option parsing, so OCR output that happens to begin with a
    # dash is spoken rather than being interpreted as an spd-say option.
    subprocess.run(["spd-say", "-Cw", "--", message])

def main():
    """
    Run the full pipeline: announce, capture, preprocess, OCR, speak

    Any failure is printed and announced as "OCR failed"; the exception is
    not propagated to the caller.
    """
    # Limit tesseract thread usage to improve performance on Pi
    os.environ["OMP_THREAD_LIMIT"] = "4"

    try:
        # Announce start
        subprocess.run(["spd-say", "-Cw", "performing OCR"])

        # Capture screen
        img = capture_screen()

        # Process image
        processed_img = process_image(img, scale_factor=1.5)

        # Perform OCR
        text = perform_ocr(processed_img)

        # Speak the results
        speak_text(text)

    except Exception as e:
        # Let the user know something went wrong
        error_msg = f"Error during OCR: {str(e)}"
        print(error_msg)
        try:
            subprocess.run(["spd-say", "-Cw", "OCR failed"])
        except Exception:
            # Best effort only: if speech itself is unavailable there is
            # nothing more to do. Catching Exception (not a bare except)
            # still lets Ctrl-C and SystemExit terminate the program.
            pass

# Run the OCR pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()