Files
bookstorm/src/tts_engine.py

181 lines
5.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TTS Engine Wrapper
Wrapper for piper-tts to generate speech from text.
Handles streaming audio generation for real-time playback.
"""
import subprocess
import wave
import io
import struct
import shutil
from .text_validator import is_valid_text
class TtsEngine:
    """Text-to-speech engine using piper-tts.

    Runs the ``piper-tts`` executable as a subprocess, feeds it text on
    stdin, and wraps the raw PCM it writes to stdout in a WAV container.
    """

    # Longer input is truncated before synthesis to limit memory use per
    # paragraph (the raw PCM for one paragraph is held fully in memory).
    MAX_TEXT_LENGTH = 5000

    # Seconds to wait for piper-tts before it is killed.
    TIMEOUT_SECONDS = 60

    @staticmethod
    def is_available():
        """
        Check if piper-tts is available on the system

        Returns:
            bool: True if a 'piper-tts' executable is found on PATH,
            False otherwise
        """
        return shutil.which('piper-tts') is not None

    def __init__(self, modelPath="/usr/share/piper-voices/en/en_US/hfc_male/medium/en_US-hfc_male-medium.onnx"):
        """
        Initialize TTS engine

        Args:
            modelPath: Path to piper-tts voice model
        """
        self.modelPath = modelPath
        # PCM format written into the WAV header by _raw_to_wav().
        # NOTE(review): presumably matches what the configured model emits
        # with --output-raw; confirm if a different voice model is used.
        self.sampleRate = 22050
        self.sampleWidth = 2  # 16-bit
        self.channels = 1

    def text_to_wav_data(self, text):
        """
        Convert text to WAV audio data

        Args:
            text: Text to convert to speech

        Returns:
            Bytes containing WAV audio data, or None when is_valid_text()
            rejects the text.

        Raises:
            RuntimeError: If piper-tts is missing, times out, exits with a
                non-zero status, or any other error occurs while running it.
        """
        if not is_valid_text(text):
            return None

        # Safety: limit text size to prevent excessive memory usage.
        if len(text) > self.MAX_TEXT_LENGTH:
            print(f"Warning: Paragraph too long ({len(text)} chars), truncating to {self.MAX_TEXT_LENGTH}")
            text = text[:self.MAX_TEXT_LENGTH] + "..."

        # Bound before the try block so the error handler can always
        # inspect it, even when Popen itself raises.
        process = None
        try:
            # Run piper-tts with raw (headerless PCM) output on stdout.
            process = subprocess.Popen(
                [
                    'piper-tts',
                    '--model', self.modelPath,
                    '--output-raw',
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Send the text and collect the audio; the timeout prevents
            # hanging forever if piper-tts gets stuck.
            try:
                rawAudio, stderr = process.communicate(
                    input=text.encode('utf-8'),
                    timeout=self.TIMEOUT_SECONDS,
                )
            except subprocess.TimeoutExpired as exc:
                process.kill()
                process.communicate()  # Reap the process and drain its pipes
                raise RuntimeError(
                    f"piper-tts timed out (>{self.TIMEOUT_SECONDS}s)"
                ) from exc

            # communicate() has already waited for the process, so the
            # return code is final here.
            if process.returncode != 0:
                errorMsg = stderr.decode('utf-8', errors='ignore')
                raise RuntimeError(f"piper-tts failed: {errorMsg}")

            # stderr is no longer needed; drop it before building the WAV copy.
            del stderr

            # Convert raw PCM to WAV format
            return self._raw_to_wav(rawAudio)
        except FileNotFoundError as exc:
            raise RuntimeError("piper-tts not found. Please install piper-tts.") from exc
        except Exception as exc:
            # Ensure the subprocess is terminated if something went wrong
            # while it was still running.
            if process is not None and process.poll() is None:
                process.kill()
                try:
                    process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    pass  # Process is truly stuck, nothing more we can do
            # The RuntimeErrors raised above also pass through here, so
            # their messages keep the "TTS generation failed: " prefix.
            raise RuntimeError(f"TTS generation failed: {str(exc)}") from exc

    def _raw_to_wav(self, rawData):
        """
        Convert raw PCM data to WAV format

        The header is written from self.channels, self.sampleWidth and
        self.sampleRate.

        Args:
            rawData: Raw PCM audio bytes

        Returns:
            WAV formatted bytes
        """
        # The context manager closes the BytesIO (freeing its buffer) even
        # if writing fails.
        with io.BytesIO() as wavBuffer:
            with wave.open(wavBuffer, 'wb') as wavFile:
                wavFile.setnchannels(self.channels)
                wavFile.setsampwidth(self.sampleWidth)
                wavFile.setframerate(self.sampleRate)
                wavFile.writeframes(rawData)
            # Read only after the wave writer is closed, so the header
            # sizes have been finalised.
            return wavBuffer.getvalue()

    def get_audio_params(self):
        """
        Get audio parameters for playback

        Returns:
            Dictionary with sampleRate, sampleWidth, channels
        """
        return {
            'sampleRate': self.sampleRate,
            'sampleWidth': self.sampleWidth,
            'channels': self.channels,
        }