From cb553d3031694f82a96b9b88b0b6a4065622a7ad Mon Sep 17 00:00:00 2001 From: Storm Dragon Date: Thu, 21 May 2026 00:32:00 -0400 Subject: [PATCH] Improve Cthulhu Remote speech relay --- src/cthulhu/plugins/CthulhuRemote/README.md | 206 ++++++++++++++++++ .../plugins/CthulhuRemote/local_machine.py | 3 +- src/cthulhu/plugins/CthulhuRemote/plugin.py | 28 +++ src/cthulhu/speech.py | 42 +++- tests/test_cthulhu_remote_plugin.py | 63 ++++++ 5 files changed, 336 insertions(+), 6 deletions(-) create mode 100644 src/cthulhu/plugins/CthulhuRemote/README.md diff --git a/src/cthulhu/plugins/CthulhuRemote/README.md b/src/cthulhu/plugins/CthulhuRemote/README.md new file mode 100644 index 0000000..994accb --- /dev/null +++ b/src/cthulhu/plugins/CthulhuRemote/README.md @@ -0,0 +1,206 @@ +# Cthulhu Remote + +Cthulhu Remote is an NVDA Remote-style assistive technology relay plugin. It is +intended for remote help between screen reader users: one user can control +another user's desktop through keyboard input and receive the remote screen +reader's speech feedback. + +This is not graphical screen sharing. It does not forward a framebuffer, window +image, VNC/RDP session, or screenshots. The useful channel is screen reader +output and remote input. + +## Current Status + +### Working + +- Plugin installs and loads as `CthulhuRemote`. +- The plugin is registered as a D-Bus module when loaded. +- The transport connects outbound to NVDA Remote-compatible relay servers over + TLS. +- The default relay port is `6837`. +- Protocol version `2` is used. +- `cthulhuremote://` URLs are parsed. +- `nvdaremote://` URLs are accepted and rewritten to `cthulhuremote://`. +- `master` and `slave` connection modes are represented. +- Random connection keys can be generated. +- Invite URLs can be copied for the opposite role. +- The local clipboard can be pushed to connected peers. +- Incoming remote speech is spoken through Cthulhu. 
+- Incoming remote speech is suppressed from speech monitor callbacks to avoid + echo loops. +- In `slave` mode, local Cthulhu speech is forwarded to the relay as `speak` + messages. +- In `slave` mode, incoming remote key messages are mapped from common Windows + virtual-key codes or keysyms to AT-SPI key events. +- Disconnect, mute, and clipboard-push gestures are registered: + - `cthulhu+alt+page_down`: disconnect + - `cthulhu+alt+delete`: mute or unmute incoming remote output + - `cthulhu+control+shift+c`: push clipboard text + +### Partially Implemented + +- NVDA Remote compatibility is intentional at the relay-message level, but + cross-client interoperability with NVDA has not been verified. +- Remote key injection exists on the controlled/slave side, but master-side key + capture and forwarding are not implemented yet. +- Braille message types exist, but braille routing is not implemented. +- TLS certificate verification is enabled by default, and an insecure mode + exists, but there is no trust-on-first-use certificate workflow. +- Connection state is exposed, but there is no complete user-facing connection + dialog. + +### Not Implemented + +- Graphical screen sharing. +- Screenshot forwarding. +- VNC/RDP integration. +- Master-side keyboard forwarding. +- Remote braille display output. +- Remote braille input routing. +- Tone, wave, and other non-speech remote audio events. +- Ping/keepalive handling. +- Relay error presentation beyond basic logging/messages. +- Preferences UI for host, port, key, mode, certificate trust, and invite + management. + +## Intended User Model + +The typical assistive remote-support flow is: + +1. The person needing help runs Cthulhu and connects as `slave`. +2. The helper runs Cthulhu and connects as `master`. +3. The helper sends keyboard commands through the relay. +4. The controlled desktop receives those keys locally. +5. The controlled machine's Cthulhu speech is sent back to the helper. 
+ +Examples this is meant to support: + +- Teaching someone how to use a program. +- Helping with a stuck dialog or inaccessible workflow. +- Typing into the remote user's editor or terminal. +- Navigating menus and controls with speech feedback. + +Things it does not solve by itself: + +- Visual captchas that require seeing an image. +- Visual inspection of a remote screen. +- Mouse-driven visual troubleshooting without another channel. + +## Implementation Checklist + +### Core Protocol + +- [x] Define protocol constants and message types. +- [x] Serialize and parse newline-delimited JSON messages. +- [x] Connect to relay servers over TLS. +- [x] Send `protocol_version` and `join` messages after connecting. +- [x] Dispatch inbound messages on the GLib main loop. +- [ ] Add ping/keepalive handling. +- [ ] Present relay `error` and `nvda_not_connected` messages clearly. +- [ ] Verify exact payload compatibility with current NVDA Remote clients. + +### Connection Management + +- [x] Parse `cthulhuremote://` URLs. +- [x] Accept `nvdaremote://` URLs. +- [x] Support host, port, key, mode, and insecure TLS fields. +- [x] Track connection states. +- [x] Expose D-Bus commands for connect, disconnect, state, key generation, and + invite URL copying. +- [ ] Add an accessible GTK connection dialog. +- [ ] Add saved/recent relay configuration if desired. +- [ ] Add trust-on-first-use certificate handling or a clear certificate trust + workflow. +- [ ] Add better reconnect/backoff status reporting. + +### Speech + +- [x] Speak incoming remote `speak` messages locally. +- [x] Support muting incoming remote speech. +- [x] Prevent inbound remote speech from being echoed back to the relay. +- [x] Forward local speech to the relay in `slave` mode. +- [ ] Decide whether master mode should ever forward local speech. +- [ ] Preserve richer speech sequence details if needed instead of flattening to + plain text. 
+- [ ] Verify behavior with speech interruption and cancellation across two live + clients. + +### Keyboard Control + +- [x] Receive remote key messages in `slave` mode. +- [x] Map common Windows virtual-key codes to Linux keysyms. +- [x] Inject remote key presses/releases with AT-SPI. +- [ ] Capture local keyboard events in `master` mode. +- [ ] Forward master key events as remote `key` messages. +- [ ] Prevent forwarded keys from also acting on the master's local desktop, + unless intentionally passed through. +- [ ] Preserve modifier press/release ordering. +- [ ] Verify Xorg behavior. +- [ ] Verify Wayland behavior without weakening Xorg support. +- [ ] Add tests for key payload generation and modifier handling. + +### Clipboard + +- [x] Push local clipboard text to connected peers. +- [x] Receive remote clipboard text into the local clipboard. +- [ ] Add a command or dialog control for pull/request clipboard if protocol + support is available. +- [ ] Decide how to handle large clipboard payloads. +- [ ] Add tests for empty, plain text, and multiline clipboard text. + +### Braille + +- [ ] Route incoming remote `display` messages to Cthulhu braille output. +- [ ] Implement `set_display_size`. +- [ ] Implement `set_braille_info` if required for compatibility. +- [ ] Route local braille input as remote `braille_input` messages when acting + as master. +- [ ] Verify routing keys and cursor-routing behavior. +- [ ] Add tests around braille payload parsing and ignored/unsupported fields. + +### Audio And Miscellaneous Messages + +- [ ] Implement or intentionally ignore `tone`. +- [ ] Implement or intentionally ignore `wave`. +- [ ] Decide whether `index` is relevant to Cthulhu speech. +- [ ] Handle `send_SAS` with a clear Linux-specific message. +- [ ] Log unsupported message types at debug level without spamming users. + +### User Interface + +- [x] Register basic gestures for disconnect, mute, and clipboard push. 
+- [ ] Add connect/disconnect controls to plugin preferences. +- [ ] Add host, port, key, mode, and insecure TLS fields. +- [ ] Add generate-key and copy-invite controls. +- [ ] Add connection status text suitable for screen reader users. +- [ ] Ensure Tab and Shift+Tab navigate the entire dialog. +- [ ] Associate GTK labels with their controls. + +### Tests And Verification + +- [x] Test URL parsing. +- [x] Test serializer message type values. +- [x] Test common key mapping. +- [x] Test plugin loading through the plugin manager. +- [x] Test additive speech monitor callbacks. +- [x] Test remote speech echo suppression. +- [x] Test slave-mode local speech forwarding. +- [x] Test that master mode does not forward local speech. +- [ ] Add transport tests with a fake relay socket. +- [ ] Add connection-state transition tests. +- [ ] Add D-Bus introspection or command exposure tests for the plugin module. +- [ ] Test against a live NVDA Remote-compatible relay. +- [ ] Test Cthulhu-to-Cthulhu master/slave operation. +- [ ] Test NVDA master to Cthulhu slave. +- [ ] Test Cthulhu master to NVDA slave after master key forwarding exists. + +## Suggested Next Steps + +1. Add an accessible connection dialog so the plugin can be used without manual + D-Bus calls. +2. Implement ping/error handling to improve relay behavior and diagnostics. +3. Design master-side key forwarding carefully around Cthulhu's existing input + event manager. +4. Add fake-relay tests before broad live testing. +5. Perform live two-client testing and record exact compatibility gaps. 
+ diff --git a/src/cthulhu/plugins/CthulhuRemote/local_machine.py b/src/cthulhu/plugins/CthulhuRemote/local_machine.py index 39a48d3..eeb5021 100644 --- a/src/cthulhu/plugins/CthulhuRemote/local_machine.py +++ b/src/cthulhu/plugins/CthulhuRemote/local_machine.py @@ -77,7 +77,8 @@ class LocalMachine: return text = self._speech_sequence_to_text(sequence) if text: - speech.speak(text, interrupt=False) + with speech.suppress_monitor_callbacks(): + speech.speak(text, interrupt=False) def cancel_speech(self, **kwargs: Any) -> None: if not self.isMuted: diff --git a/src/cthulhu/plugins/CthulhuRemote/plugin.py b/src/cthulhu/plugins/CthulhuRemote/plugin.py index 8061345..26fb7b8 100644 --- a/src/cthulhu/plugins/CthulhuRemote/plugin.py +++ b/src/cthulhu/plugins/CthulhuRemote/plugin.py @@ -22,6 +22,7 @@ gi.require_version("Gtk", "3.0") from gi.repository import Gdk, Gtk from cthulhu import dbus_service +from cthulhu import speech from cthulhu.plugin import Plugin, cthulhu_hookimpl from cthulhu.plugins.CthulhuRemote.connection_info import ( @@ -49,6 +50,7 @@ class CthulhuRemote(Plugin): self._connectionState = ConnectionState.DISCONNECTED self._localMachine = LocalMachine(self._present_message) self._muted = False + self._speechMonitorRegistered = False @cthulhu_hookimpl def activate(self, plugin=None): @@ -145,6 +147,7 @@ class CthulhuRemote(Plugin): self._connectionState = ConnectionState.DISCONNECTING self._transport.close() self._transport = None + self._deregister_speech_monitor() self._connectionState = ConnectionState.DISCONNECTED if notify_user: self._present_message("Cthulhu Remote disconnected") @@ -220,11 +223,36 @@ class CthulhuRemote(Plugin): self._transport = transport self._connectionInfo = connectionInfo self._connectionState = ConnectionState.CONNECTING + self._register_speech_monitor(connectionInfo.mode) transport.start() if notify_user: self._present_message("Cthulhu Remote connecting") return True + def _register_speech_monitor(self, mode: ConnectionMode) 
-> None: + if mode != ConnectionMode.SLAVE or self._speechMonitorRegistered: + return + + speech.add_monitor_callback(self._send_local_speech) + self._speechMonitorRegistered = True + + def _deregister_speech_monitor(self) -> None: + if not self._speechMonitorRegistered: + return + + speech.remove_monitor_callback(self._send_local_speech) + self._speechMonitorRegistered = False + + def _send_local_speech(self, text: str) -> None: + if not text or not text.strip(): + return + if not self._connectionInfo or self._connectionInfo.mode != ConnectionMode.SLAVE: + return + if not self._transport or not self._transport.connected: + return + + self._transport.send(RemoteMessageType.speak, sequence=[text]) + def _register_transport_handlers(self, transport: RelayTransport, mode: ConnectionMode) -> None: transport.register_inbound(RemoteMessageType.speak, self._localMachine.speak) transport.register_inbound(RemoteMessageType.cancel, self._localMachine.cancel_speech) diff --git a/src/cthulhu/speech.py b/src/cthulhu/speech.py index 3c02a7b..60c3e7c 100644 --- a/src/cthulhu/speech.py +++ b/src/cthulhu/speech.py @@ -36,6 +36,7 @@ __license__ = "LGPL" import importlib import time +from contextlib import contextmanager from typing import TYPE_CHECKING, Optional, List, Dict, Any, Union, Callable from . import debug @@ -76,6 +77,8 @@ _timestamp: float = 0.0 # Optional callback for live monitoring of spoken text. 
_monitorWriteTextCallback: Optional[Callable[[str], None]] = None +_monitorWriteTextListeners: List[Callable[[str], None]] = [] +_monitorSuppressionDepth = 0 def _isSpeechDispatcherFactory(moduleName: Optional[str]) -> bool: if not moduleName: @@ -325,15 +328,44 @@ def set_monitor_callbacks(writeText: Optional[Callable[[str], None]] = None) -> global _monitorWriteTextCallback _monitorWriteTextCallback = writeText +def add_monitor_callback(writeText: Callable[[str], None]) -> None: + """Adds a runtime callback for live speech monitoring.""" + if writeText not in _monitorWriteTextListeners: + _monitorWriteTextListeners.append(writeText) + +def remove_monitor_callback(writeText: Callable[[str], None]) -> None: + """Removes a runtime callback for live speech monitoring.""" + if writeText in _monitorWriteTextListeners: + _monitorWriteTextListeners.remove(writeText) + +@contextmanager +def suppress_monitor_callbacks(): + """Temporarily suppresses live speech monitoring callbacks.""" + global _monitorSuppressionDepth + _monitorSuppressionDepth += 1 + try: + yield + finally: + _monitorSuppressionDepth = max(0, _monitorSuppressionDepth - 1) + def _write_to_monitor(text: str) -> None: """Writes text to the active speech monitor callback if set.""" - if _monitorWriteTextCallback is None: + if _monitorSuppressionDepth: return - try: - _monitorWriteTextCallback(text) - except Exception: - debug.printException(debug.LEVEL_INFO) + callbacks = [] + if _monitorWriteTextCallback is not None: + callbacks.append(_monitorWriteTextCallback) + callbacks.extend( + callback for callback in _monitorWriteTextListeners + if callback != _monitorWriteTextCallback + ) + + for callback in callbacks: + try: + callback(text) + except Exception: + debug.printException(debug.LEVEL_INFO) def __resolveACSS(acss: Optional[Any] = None) -> ACSS: if isinstance(acss, ACSS): diff --git a/tests/test_cthulhu_remote_plugin.py b/tests/test_cthulhu_remote_plugin.py index d037e7a..6e01677 100644 --- 
a/tests/test_cthulhu_remote_plugin.py +++ b/tests/test_cthulhu_remote_plugin.py @@ -11,8 +11,11 @@ input_event_manager_stub.get_manager = mock.Mock(return_value=mock.Mock()) sys.modules["cthulhu.input_event_manager"] = input_event_manager_stub from cthulhu.plugin_system_manager import PluginSystemManager +from cthulhu import cthulhu_state +from cthulhu import speech from cthulhu.plugins.CthulhuRemote.connection_info import ConnectionInfo, ConnectionMode from cthulhu.plugins.CthulhuRemote.local_machine import LocalMachine +from cthulhu.plugins.CthulhuRemote.plugin import CthulhuRemote from cthulhu.plugins.CthulhuRemote.protocol import RemoteMessageType from cthulhu.plugins.CthulhuRemote.serializer import JSONSerializer @@ -45,6 +48,66 @@ class CthulhuRemotePluginTests(unittest.TestCase): self.assertEqual(machine._resolve_keyval(0x70, None), machine._resolve_keyval(None, "F1")) self.assertIsNone(machine._resolve_keyval(0xFF, None)) + def test_speech_monitor_callbacks_are_additive(self): + primary = mock.Mock() + listener = mock.Mock() + speech.set_monitor_callbacks(writeText=primary) + speech.add_monitor_callback(listener) + self.addCleanup(speech.set_monitor_callbacks, None) + self.addCleanup(speech.remove_monitor_callback, listener) + + speech._write_to_monitor("status") + + primary.assert_called_once_with("status") + listener.assert_called_once_with("status") + + def test_remote_speech_does_not_echo_to_monitor_callbacks(self): + listener = mock.Mock() + speech.add_monitor_callback(listener) + self.addCleanup(speech.remove_monitor_callback, listener) + + with ( + mock.patch.object(speech, "_speechserver", None), + mock.patch.object(speech.speech_history, "add"), + mock.patch.object(cthulhu_state, "activeScript", None), + ): + LocalMachine(lambda message: None).speak(["remote", "status"]) + + listener.assert_not_called() + + def test_slave_mode_forwards_local_speech_to_relay(self): + plugin = CthulhuRemote() + plugin._connectionInfo = ConnectionInfo( + 
hostname="example.com", + port=1234, + key="abc", + mode=ConnectionMode.SLAVE, + ) + plugin._transport = mock.Mock() + plugin._transport.connected = True + + plugin._send_local_speech("focused button") + + plugin._transport.send.assert_called_once_with( + RemoteMessageType.speak, + sequence=["focused button"], + ) + + def test_master_mode_does_not_forward_local_speech_to_relay(self): + plugin = CthulhuRemote() + plugin._connectionInfo = ConnectionInfo( + hostname="example.com", + port=1234, + key="abc", + mode=ConnectionMode.MASTER, + ) + plugin._transport = mock.Mock() + plugin._transport.connected = True + + plugin._send_local_speech("local status") + + plugin._transport.send.assert_not_called() + @mock.patch("cthulhu.plugin_system_manager.dbus_service.get_remote_controller") def test_plugin_manager_can_load_cthulhu_remote(self, remote_controller): remote_controller.return_value = mock.Mock()