Removed ocrdesktop dependency. I38 now handles OCR with a much smaller, simpler and faster program.
This commit is contained in:
parent
09e21e297c
commit
65e6dcf980
2
I38.md
2
I38.md
@ -151,7 +151,7 @@ If you've enabled braille display support during setup, I38 will start XBrlAPI a
|
||||
|
||||
### OCR (Optical Character Recognition)
|
||||
|
||||
If installed, you can use OCR to read text from images or inaccessible applications:
|
||||
If required dependencies are installed, you can use OCR to read text from images or inaccessible applications:
|
||||
|
||||
- `MODKEY` + `F5`: Perform OCR on the entire screen and speak the content
|
||||
- In Ratpoison mode: `Print` or `MODKEY` + `r`: Perform OCR and save to clipboard
|
||||
|
@ -25,15 +25,19 @@ An uppercase I looks like a 1, 3 from i3, and 8 because the song [We Are 138](ht
|
||||
- lxsession: [optional] For GUI power options like shutdown
|
||||
- magic-wormhole: [optional] for file sharing with magic-wormhole GUI
|
||||
- notification-daemon: To handle notifications
|
||||
- ocrdesktop: For getting contents of the current window with OCR.
|
||||
- pamixer: for the mute-unmute script
|
||||
- pandoc or markdown: To generate html files.
|
||||
- pcmanfm: [optional] Graphical file manager.
|
||||
- playerctl: music controls
|
||||
- python-gobject: for applications menu.
|
||||
- python-i3ipc: for sounds etc.
|
||||
- python-pillow: For OCR
|
||||
- python-pytesseract: For OCR
|
||||
- remind: [optional] For reminder notifications. Requires notify-daemon and notify-send for automatic reminders.
|
||||
- scrot: For OCR
|
||||
- sox: for sounds.
|
||||
- tesseract: For OCR
|
||||
- tesseract-data-eng: For OCR
|
||||
- transfersh: [optional] for file sharing GUI
|
||||
- udiskie: [optional] for automatically mounting removable storage
|
||||
- x11bell: [optional] Bell support if you do not have a PC speaker. Available from https://github.com/jovanlanik/x11bell
|
||||
|
6
i38.sh
6
i38.sh
@ -576,10 +576,8 @@ bindsym $mod+Shift+BackSpace mode "default"
|
||||
|
||||
EOF
|
||||
|
||||
# ocrdesktop through speech-dispatcher
|
||||
if command -v ocrdesktop &> /dev/null ; then
|
||||
echo "bindsym ${mod}+F5 exec bash -c 'spd-say -Cw \"performing O C R\" && ocrdesktop -cnog | spd-say -e --'" >> ${i3Path}/config
|
||||
fi
|
||||
# ocr through speech-dispatcher
|
||||
echo "bindsym ${mod}+F5 exec ${i3Path}/scripts/ocr.py" >> ${i3Path}/config
|
||||
# Interrupt speech-dispatcher output
|
||||
echo "bindsym ${mod}+Shift+F5 exec spd-say -C" >> ${i3Path}/config
|
||||
|
||||
|
148
scripts/ocr.py
Executable file
148
scripts/ocr.py
Executable file
@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This file is part of I38.
|
||||
|
||||
# I38 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
|
||||
# either version 3 of the License, or (at your option) any later version.
|
||||
|
||||
# I38 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
# PURPOSE. See the GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License along with I38. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
"""
|
||||
Simple OCR Screen Reader
|
||||
A lightweight tool that performs OCR on the screen and speaks the results
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
from PIL import Image, ImageOps
|
||||
import pytesseract
|
||||
|
||||
def capture_screen(max_retries=3, initial_delay=0.2):
    """
    Capture the screen using scrot and return it as a fully loaded PIL image

    The screenshot is written into a private temporary directory that is
    removed again before this function returns, so the returned image never
    depends on a file that still has to exist on disk.

    Args:
        max_retries: Maximum number of attempts to read the image
        initial_delay: Initial delay in seconds (doubles after every failed attempt)

    Returns:
        The decoded screenshot as an in-memory PIL Image.

    Raises:
        subprocess.CalledProcessError: scrot exited with a non-zero status.
        OSError: scrot could not be started at all.
        RuntimeError: no valid screenshot could be read after max_retries attempts.
    """
    # Local import: only this function needs tempfile.
    import tempfile

    try:
        # A fresh private directory instead of a fixed /tmp path: a stale or
        # foreign file at a predictable location could otherwise be read in
        # place of the new capture (newer scrot releases do not overwrite an
        # existing output file unless --overwrite is given).
        with tempfile.TemporaryDirectory(prefix="i38_ocr_") as temp_dir:
            temp_file = os.path.join(temp_dir, "ocr_capture.png")

            # Capture the screen
            subprocess.run(["scrot", temp_file], check=True)

            # Wait and retry approach with validity checking
            delay = initial_delay
            for attempt in range(max_retries):
                time.sleep(delay)
                is_last_attempt = attempt >= max_retries - 1

                if os.path.exists(temp_file) and os.path.getsize(temp_file) > 0:
                    try:
                        with Image.open(temp_file) as screenshot:
                            # Image.open() only parses the header; load()
                            # decodes the pixel data, so a truncated file
                            # fails here instead of later during OCR.
                            screenshot.load()
                            # Hand back an independent in-memory copy so the
                            # result stays usable after the file is closed
                            # and the temporary directory is deleted.
                            return screenshot.copy()
                    except OSError as err:
                        # Image exists but isn't valid yet
                        if is_last_attempt:
                            raise RuntimeError(
                                f"Image file exists but is not valid after {max_retries} attempts"
                            ) from err
                elif is_last_attempt:
                    # File doesn't exist or is empty
                    raise RuntimeError(
                        f"Screenshot file not created properly after {max_retries} attempts"
                    )

                # Increase delay exponentially for next attempt
                delay *= 2

            # Only reached when max_retries <= 0: fail loudly instead of
            # silently returning None to the caller.
            raise RuntimeError(
                f"Screenshot file not created properly after {max_retries} attempts"
            )
    except Exception as e:
        print(f"Error capturing screen: {e}", file=sys.stderr)
        raise
|
||||
|
||||
def process_image(img, scale_factor=1.5):
    """
    Prepare a screenshot for OCR

    The image is optionally enlarged, then reduced to grayscale and
    contrast-stretched.

    Args:
        img: PIL image to prepare
        scale_factor: Multiplier applied to width and height; 1 skips resizing

    Returns:
        The processed PIL image.
    """
    needs_scaling = scale_factor != 1
    if needs_scaling:
        # Enlarge both dimensions by the same factor; bicubic resampling is
        # used for the upscale.
        orig_width, orig_height = img.size
        target_size = (
            int(orig_width * scale_factor),
            int(orig_height * scale_factor),
        )
        img = img.resize(target_size, Image.Resampling.BICUBIC)

    # Drop colour information first, then stretch the contrast of the
    # grayscale result.
    gray = ImageOps.grayscale(img)
    return ImageOps.autocontrast(gray)
|
||||
|
||||
def perform_ocr(img, lang='eng'):
    """
    Run Tesseract on an image and return the recognised text

    Args:
        img: PIL image to recognise
        lang: Tesseract language code passed through to pytesseract

    Returns:
        The text returned by pytesseract.image_to_string.
    """
    # Tesseract options:
    #   --oem 1: Use LSTM OCR Engine
    #   --psm 6: Assume a single uniform block of text
    tesseract_config = '--oem 1 --psm 6'
    return pytesseract.image_to_string(img, lang=lang, config=tesseract_config)
|
||||
|
||||
def speak_text(text):
    """
    Speak the text using speech-dispatcher

    Blank lines are dropped and the remaining lines are stripped and joined
    with single spaces. If nothing is left, "No text detected" is spoken
    instead.

    Args:
        text: Raw OCR output, possibly spanning several lines

    Raises:
        OSError: spd-say could not be started.
    """
    # Filter out empty lines and clean up the text
    stripped_lines = (line.strip() for line in text.split('\n'))
    cleaned_text = ' '.join(line for line in stripped_lines if line)

    message = cleaned_text if cleaned_text else "No text detected"

    # "--" ends option parsing: OCR output is arbitrary and may begin with a
    # dash, which spd-say would otherwise try to interpret as an option.
    subprocess.run(["spd-say", "-Cw", "--", message])
|
||||
|
||||
def main():
    """
    Run the full OCR pipeline: announce, capture, process, recognise, speak

    Any failure in the pipeline is reported on stderr and announced through
    speech-dispatcher as "OCR failed"; the exception is not propagated.
    """
    # Limit tesseract thread usage to improve performance on Pi
    os.environ["OMP_THREAD_LIMIT"] = "4"

    try:
        # Announce start
        subprocess.run(["spd-say", "-Cw", "performing OCR"])

        # Capture screen
        img = capture_screen()

        # Process image
        processed_img = process_image(img, scale_factor=1.5)

        # Perform OCR
        text = perform_ocr(processed_img)

        # Speak the results
        speak_text(text)

    except Exception as e:
        # Let the user know something went wrong
        error_msg = f"Error during OCR: {str(e)}"
        print(error_msg, file=sys.stderr)
        try:
            subprocess.run(["spd-say", "-Cw", "OCR failed"])
        except Exception:
            # Best effort only: if even speech fails there is nothing more
            # to report. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt and SystemExit working.
            pass


if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user