Skip to content

Instantly share code, notes, and snippets.

@AcTePuKc
Last active February 28, 2025 14:23
Show Gist options
  • Save AcTePuKc/eb50cb0f7efa6bb2c8d4e22cdd01857a to your computer and use it in GitHub Desktop.
Save AcTePuKc/eb50cb0f7efa6bb2c8d4e22cdd01857a to your computer and use it in GitHub Desktop.
Spark-TTS-PySide_Gui - Spark-TTS Studio
# Save this file in the main (root) directory of the Spark-TTS repository so that "from cli.SparkTTS import SparkTTS" resolves.
'''
SparkTTS Studio - GUI Description (Concise)
SparkTTS Studio is a user-friendly interface for text-to-speech synthesis, featuring voice cloning capabilities.
Key Features:
Text Input: Large text area to enter text for speech synthesis.
Voice Sample Loading: Load WAV/MP3 files to enable voice cloning and mimic voice styles.
Gender Selection: Choose "Male," "Female," or "Pick Voice (Gender optional)" for voice synthesis.
Pitch & Speed Control: Sliders to adjust voice pitch and speaking speed.
Interactive Waveform: Visual display of generated audio with playback progress and clickable seeking.
Playback Controls: "Play/Pause," "Stop," and Volume slider for audio playback.
Save Audio: Save generated speech to WAV files.
Multi-Language GUI: Switch UI language between English, Bulgarian, Spanish, French, and Japanese.
Status Bar: Displays messages about model loading, generation progress, and errors.
System Tray: Option to minimize to system tray for background operation.
Workflow Highlights:
Enter Text: Input the text you want to convert to speech.
Optional Voice Cloning: Load a voice sample to clone a voice.
Set Voice Parameters: Adjust gender, pitch, and speed.
Generate Speech: Click "Generate Speech" to synthesize audio.
Playback & Seek: Use playback controls and click on the waveform to navigate audio.
Save Audio: Save the generated audio to a WAV file.
'''
import traceback
import sys
import os
import time
import torch
import shutil
import numpy as np
import soundfile as sf
from PySide6.QtWidgets import (
QApplication, QWidget, QVBoxLayout, QPushButton, QLabel,
QTextEdit, QSlider, QFileDialog, QComboBox, QHBoxLayout,
QGroupBox, QProgressBar, QSystemTrayIcon, QMenu, QSizePolicy
)
from PySide6.QtCore import Qt, QThread, Signal, QPoint, QTimer, QCoreApplication
from PySide6.QtMultimedia import QMediaPlayer, QAudioOutput
from PySide6.QtGui import QPainter, QColor, QPen, QLinearGradient, QIcon, QAction
from cli.SparkTTS import SparkTTS
# ------------------- Modern Style Sheet -------------------
# Qt style sheet for the application (dark backgrounds with #3DAEE9 accents).
# It is applied to the main window with setStyleSheet() in SparkTTSApp.init_ui
# and covers QWidget, QTextEdit, QPushButton, QSlider, QComboBox and QGroupBox.
STYLE_SHEET = """
QWidget {
background-color: #2D2D2D;
color: #FFFFFF;
font-family: 'Segoe UI';
font-size: 12px;
}
QTextEdit {
background-color: #404040;
border: 2px solid #505050;
border-radius: 5px;
padding: 8px;
selection-background-color: #3DAEE9;
}
QPushButton {
background-color: #3DAEE9;
border: none;
border-radius: 4px;
color: white;
padding: 8px 16px;
min-width: 80px;
}
QPushButton:hover {
background-color: #2D9CDB;
}
QPushButton:disabled {
background-color: #505050;
color: #808080;
}
QSlider::groove:horizontal {
height: 6px;
background: #404040;
border-radius: 3px;
}
QSlider::handle:horizontal {
background: #3DAEE9;
border: 2px solid #2D2D2D;
width: 16px;
margin: -6px 0;
border-radius: 8px;
}
QComboBox {
background-color: #404040;
border: 2px solid #505050;
border-radius: 4px;
padding: 4px;
min-width: 100px;
}
QGroupBox {
border: 2px solid #505050;
border-radius: 6px;
margin-top: 10px;
padding-top: 15px;
color: #FFFFFF; /* Added to ensure title text is white */
}
QGroupBox::title {
subcontrol-origin: margin;
left: 10px;
padding: 0 5px;
color: #FFFFFF; /* Added to ensure title text is white */
}
"""
# ------------------- Enhanced Waveform Widget -------------------
class WaveformWidget(QWidget):
    """Widget that paints an audio waveform with a progress overlay and playhead.

    A mouse press on the widget emits ``seek_position_signal`` with the
    position (in milliseconds) that corresponds to the click's horizontal
    location, based on the duration supplied via :meth:`set_audio_duration`.
    """

    # Emitted on mouse press with the requested seek position in milliseconds.
    seek_position_signal = Signal(int)

    def __init__(self, parent=None):
        """Create an empty waveform view; ``parent`` is the optional Qt parent."""
        super().__init__(parent)
        self.waveform_data = None          # samples to draw, or None when empty
        self.playback_progress = 0.0       # fraction (0..1) covered by the tinted overlay
        self.playhead_progress = 0.0       # fraction (0..1) where the playhead line is drawn
        self.audio_player_duration_ms = 0  # duration used to map clicks to milliseconds
        self.setMinimumHeight(100)
        self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Fixed)
        self.setMouseTracking(True)

    @staticmethod
    def _clamp_fraction(value):
        """Limit ``value`` to the closed range [0.0, 1.0]."""
        return max(0.0, min(1.0, value))

    def set_waveform(self, data):
        """Store the samples to display and schedule a repaint."""
        self.waveform_data = data
        self.update()

    def set_playback_progress_overlay(self, progress):
        """Set the overlay width as a fraction of the widget width and repaint."""
        self.playback_progress = self._clamp_fraction(progress)
        self.update()

    def set_playhead_progress(self, progress):
        """Set the playhead position as a fraction of the widget width and repaint.

        Values outside 0..1 are clamped so that a caller passing a raw
        millisecond position cannot push the playhead off the widget.
        """
        self.playhead_progress = self._clamp_fraction(progress)
        self.update()

    def set_audio_duration(self, duration_ms):
        """Remember the audio duration (milliseconds) used for click-to-seek."""
        self.audio_player_duration_ms = duration_ms

    def mousePressEvent(self, event):
        """Translate a click into a seek position and emit ``seek_position_signal``.

        Nothing is emitted when there is no waveform, no known duration, or
        the widget has no width (which would make the ratio undefined).
        """
        width = self.width()
        if self.waveform_data is None or self.audio_player_duration_ms <= 0 or width <= 0:
            return
        progress_ratio = self._clamp_fraction(event.position().x() / width)
        self.seek_position_signal.emit(int(self.audio_player_duration_ms * progress_ratio))

    def paintEvent(self, event):
        """Paint background, waveform, progress overlay and playhead."""
        painter = QPainter(self)
        try:
            self._draw_contents(painter, self.rect())
        finally:
            # End the painter explicitly instead of relying on garbage collection.
            painter.end()

    def _draw_contents(self, painter, rect):
        """Draw all layers of the widget into ``rect`` using ``painter``."""
        width = rect.width()
        height = rect.height()

        # Background gradient.
        gradient = QLinearGradient(0, 0, 0, height)
        gradient.setColorAt(0, QColor("#363636"))
        gradient.setColorAt(1, QColor("#2D2D2D"))
        painter.fillRect(rect, gradient)
        if width <= 0:
            return

        samples = self.waveform_data
        if samples is not None and len(samples) > 0:
            # Normalise by the peak amplitude; computed once per paint and
            # applied per drawn column rather than to a full copy of the data.
            peak = np.max(np.abs(samples))
            divisor = peak if peak > 0 else 1.0

            pen = QPen(QColor("#3DAEE9"))
            pen.setWidth(2)
            painter.setPen(pen)

            num_samples = len(samples)
            center_y = height / 2
            for x in range(width):
                # Map the column onto the *whole* sample range so the drawing
                # spans the full audio and lines up with the playhead.
                idx = (x * num_samples) // width
                amplitude = abs(float(samples[idx])) / divisor
                half_height = int(amplitude * center_y * 0.95)
                y1 = max(0, min(int(center_y - half_height), height))
                y2 = max(0, min(int(center_y + half_height), height))
                painter.drawLine(x, y1, x, y2)

        # Semi-transparent overlay covering the already-played fraction.
        painter.setCompositionMode(QPainter.CompositionMode_SourceOver)
        progress_width = int(width * self.playback_progress)
        progress_rect = rect.adjusted(0, 0, progress_width - width, 0)
        painter.fillRect(progress_rect, QColor(61, 174, 233, 80))

        # Playhead line (not drawn while at the very start).
        if self.playhead_progress > 0:
            playhead_x = int(width * self.playhead_progress)
            playhead_pen = QPen(QColor("white"))
            playhead_pen.setWidth(2)
            painter.setPen(playhead_pen)
            painter.drawLine(playhead_x, 0, playhead_x, height)
# ------------------- Main Application Class -------------------
class SparkTTSApp(QWidget):
def __init__(self, model, device):
    """Build the window, tray icon and audio player.

    Args:
        model: TTS model object; it is handed to the synthesis worker in
            ``run_synthesis``.
        device: Device description shown in the status label after start-up.
    """
    super().__init__()
    self.model = model
    self.device = device
    self.voice_sample = None
    self.current_audio_file = None
    self.audio_player_duration_ms = 0  # duration of the loaded audio in milliseconds

    self.init_ui()
    self.init_tray_icon()
    self.status_label.setText(f"Model loaded on {device}")

    self.audio_player = QMediaPlayer()
    self.audio_output = QAudioOutput()
    self.audio_player.setAudioOutput(self.audio_output)
    # positionChanged carries a position in milliseconds. It must go through
    # on_position_changed, which converts it to a 0..1 ratio before updating
    # the playhead; connecting it directly to update_waveform_playhead would
    # feed raw milliseconds in as a ratio.
    self.audio_player.positionChanged.connect(self.on_position_changed)
    self.audio_player.durationChanged.connect(self.on_duration_changed)

    self.timer = QTimer(self)
    self.timer.timeout.connect(self.update_time_display)

    self.setWindowIcon(QIcon("src/logo.webp"))
    # Clicking the waveform requests a seek to the clicked position.
    self.waveform.seek_position_signal.connect(self.seek_audio)
def init_ui(self):
    """Create all widgets of the main window and lay them out.

    The window is assembled top to bottom from a language bar, the text
    input group, the synthesis controls, the visualization/playback group,
    the bottom button panel and the status label. Every widget whose text
    may need translating is stored on ``self``, including the alias
    attributes the original code exposed. ``update_gui_language()`` is called
    last to apply the initially selected language.
    """
    self.setWindowTitle("SparkTTS Studio")
    self.setMinimumSize(800, 600)
    self.setStyleSheet(STYLE_SHEET)

    main_layout = QVBoxLayout()
    main_layout.setContentsMargins(20, 20, 20, 20)
    main_layout.setSpacing(15)

    main_layout.addLayout(self._build_language_bar())
    main_layout.addWidget(self._build_text_input_group())
    main_layout.addWidget(self._build_controls_group())
    main_layout.addWidget(self._build_visualization_group())
    main_layout.addLayout(self._build_bottom_panel())

    # Status line at the very bottom of the window.
    self.status_label = QLabel("Ready")
    self.status_label_bottom = self.status_label
    self.status_label.setAlignment(Qt.AlignCenter)
    main_layout.addWidget(self.status_label)

    self.setLayout(main_layout)
    self.update_gui_language()  # initial language setup


def _build_language_bar(self):
    """Return the right-aligned top bar holding the GUI language selector."""
    bar = QHBoxLayout()
    bar.addStretch()  # pushes the label and selector to the right

    # Keep a direct reference instead of fetching the label back out of the layout.
    self.language_label = QLabel("Language:")
    bar.addWidget(self.language_label)

    self.gui_language_selector = QComboBox()
    self.gui_language_selector.addItems(
        ["English", "Bulgarian", "Spanish", "French", "Japanese"])
    # Connected after addItems so populating the box does not trigger a refresh.
    self.gui_language_selector.currentIndexChanged.connect(
        self.update_gui_language)
    bar.addWidget(self.gui_language_selector)
    return bar


def _build_text_input_group(self):
    """Return the group box containing the text editor for the input text."""
    self.input_group = QGroupBox("Text Input")
    layout = QVBoxLayout()
    self.text_input = QTextEdit()
    self.text_input.setPlaceholderText("Enter text for speech synthesis...")
    layout.addWidget(self.text_input)
    self.input_group.setLayout(layout)
    return self.input_group


def _build_slider_group(self, title):
    """Return ``(group_box, slider)`` for a horizontal 0..4 slider preset to 2."""
    box = QGroupBox(title)
    layout = QVBoxLayout()
    slider = QSlider(Qt.Horizontal)
    slider.setRange(0, 4)
    slider.setValue(2)
    layout.addWidget(slider)
    box.setLayout(layout)
    return box, slider


def _build_controls_group(self):
    """Return the group box with voice-sample loading, gender, pitch and speed."""
    self.controls_group = QGroupBox("Synthesis Controls")
    controls_layout = QVBoxLayout()

    # Voice sample row: load button followed by the status label.
    voice_layout = QHBoxLayout()
    self.voice_label = QLabel("No voice sample loaded")
    self.voice_label_status = self.voice_label
    self.voice_btn = QPushButton("Load Voice Sample")
    self.voice_btn_load_voice = self.voice_btn
    self.voice_btn.clicked.connect(self.select_voice_sample)
    voice_layout.addWidget(self.voice_btn)
    voice_layout.addWidget(self.voice_label)
    controls_layout.addLayout(voice_layout)

    # Parameter row: gender selector, pitch slider, speed slider.
    params_layout = QHBoxLayout()

    self.gender_box = QGroupBox("Voice Parameters")
    gender_layout = QVBoxLayout()
    self.gender_selector_label = QLabel("Gender:")
    self.gender_selector_label_widget = self.gender_selector_label
    gender_layout.addWidget(self.gender_selector_label)
    # Single source for the selector entries; index 0 means "no gender chosen".
    self.gender_selector_items = [
        "Pick Voice (Gender optional)", "Male", "Female"]
    self.gender_selector = QComboBox()
    self.gender_selector.addItems(self.gender_selector_items)
    self.gender_selector.currentIndexChanged.connect(self.on_gender_changed)
    gender_layout.addWidget(self.gender_selector)
    self.gender_box.setLayout(gender_layout)
    params_layout.addWidget(self.gender_box)

    self.pitch_box, self.pitch_slider = self._build_slider_group("Pitch")
    params_layout.addWidget(self.pitch_box)

    self.speed_box, self.speed_slider = self._build_slider_group("Speed")
    params_layout.addWidget(self.speed_box)

    controls_layout.addLayout(params_layout)
    self.controls_group.setLayout(controls_layout)
    return self.controls_group


def _build_visualization_group(self):
    """Return the group box with waveform, time labels and playback controls."""
    self.vis_group = QGroupBox("Audio Visualization")
    vis_layout = QVBoxLayout()

    self.waveform = WaveformWidget()
    vis_layout.addWidget(self.waveform)

    # Elapsed time on the left, total time on the right.
    time_layout = QHBoxLayout()
    self.current_time = QLabel("00:00")
    self.total_time = QLabel("00:00")
    time_layout.addWidget(self.current_time)
    time_layout.addStretch()
    time_layout.addWidget(self.total_time)
    vis_layout.addLayout(time_layout)

    playback_layout = QHBoxLayout()
    self.play_btn = QPushButton("Play")
    self.play_btn_play = self.play_btn
    self.play_btn.clicked.connect(self.play_audio)

    self.pause_btn = QPushButton("Pause")
    self.pause_btn_pause = self.pause_btn
    self.pause_btn.clicked.connect(self.pause_audio)

    self.stop_btn = QPushButton("Stop")
    self.play_btn_stop = self.stop_btn
    self.stop_btn.clicked.connect(self.stop_audio)

    self.volume_label = QLabel("Volume:")
    self.volume_label_widget = self.volume_label
    self.volume_slider = QSlider(Qt.Horizontal)
    self.volume_slider.setRange(0, 100)
    self.volume_slider.setValue(100)
    # Connected after setValue: the audio output does not exist yet while
    # the UI is being built, so no volume change must fire here.
    self.volume_slider.valueChanged.connect(self.set_volume)

    for widget in (self.play_btn, self.pause_btn, self.stop_btn,
                   self.volume_label, self.volume_slider):
        playback_layout.addWidget(widget)
    vis_layout.addLayout(playback_layout)

    self.vis_group.setLayout(vis_layout)
    return self.vis_group


def _build_bottom_panel(self):
    """Return the bottom row: generate, save and exit buttons plus progress bar."""
    bottom_layout = QHBoxLayout()

    self.generate_btn = QPushButton("Generate Speech")
    self.generate_btn_generate = self.generate_btn
    self.generate_btn.clicked.connect(self.run_synthesis)
    # Disabled until a voice sample is loaded or a gender is selected.
    self.generate_btn.setEnabled(False)

    self.save_btn = QPushButton("Save Audio")
    self.save_btn_save = self.save_btn
    self.save_btn.clicked.connect(self.save_audio)

    self.exit_btn = QPushButton("Exit")
    self.exit_btn_main_window = self.exit_btn
    self.exit_btn.clicked.connect(self.quit_app)

    self.progress_bar = QProgressBar()
    self.progress_bar.setTextVisible(False)
    self.progress_bar.setFixedHeight(6)

    for widget in (self.generate_btn, self.save_btn, self.exit_btn,
                   self.progress_bar):
        bottom_layout.addWidget(widget)
    return bottom_layout
def init_tray_icon(self):
    """Create the system tray icon with a "Show" / "Exit" context menu."""
    self.tray_icon = QSystemTrayIcon(self)
    self.tray_icon.setIcon(QIcon("src/logo.webp"))

    # setContextMenu() does not take ownership of the menu, so a reference is
    # kept on the instance; a purely local menu could be destroyed as soon as
    # this method returns.
    self.tray_menu = QMenu()

    show_action = QAction("Show", self)
    self.show_action_tray = show_action  # kept for translation
    show_action.triggered.connect(self.show)

    exit_action = QAction("Exit", self)
    self.exit_action_tray = exit_action  # kept for translation
    # Closing the window only hides it (see closeEvent), so "Exit" must quit.
    exit_action.triggered.connect(self.quit_app)

    self.tray_menu.addAction(show_action)
    self.tray_menu.addAction(exit_action)
    self.tray_icon.setContextMenu(self.tray_menu)
    self.tray_icon.show()
def quit_app(self):
    """Terminate the application (used by the Exit button and the tray menu)."""
    # Hide the tray icon first; on some platforms it can otherwise remain
    # visible after the process has exited. getattr guards against a call
    # made before init_tray_icon() has run.
    tray_icon = getattr(self, "tray_icon", None)
    if tray_icon is not None:
        tray_icon.hide()
    QCoreApplication.quit()
def closeEvent(self, event):
    """Hide the window to the system tray instead of closing it.

    If the platform offers no system tray, hiding would leave the user with
    no way to bring the window back, so the application quits instead.
    """
    if not QSystemTrayIcon.isSystemTrayAvailable():
        event.accept()
        self.quit_app()
        return

    self.hide()
    # Ignore the close request so the window is not destroyed; a real exit
    # goes through quit_app().
    event.ignore()
    self.tray_icon.showMessage(
        "SparkTTS Studio",
        "The application is running in the system tray",
        QSystemTrayIcon.Information,
        2000
    )
def set_volume(self, value):
    """Apply the volume slider value (0..100) to the audio output as value/100."""
    level = value / 100
    self.audio_output.setVolume(level)
def update_time_display(self):
    """Refresh the elapsed/total time labels (mm:ss) while audio is playing.

    Does nothing when the player is not playing. If the player reports a
    non-numeric position or duration, the problem is printed and the labels
    are left untouched.
    """
    player = self.audio_player
    if not player.isPlaying():
        return

    position_ms = player.position()
    duration_ms = player.duration()
    if not isinstance(position_ms, (int, float)):
        print(f"Error: audio_player.position() returned unexpected type: {type(position_ms)}")
        return
    if not isinstance(duration_ms, (int, float)):
        print(f"Error: audio_player.duration() returned unexpected type: {type(duration_ms)}")
        return

    elapsed_min, elapsed_sec = divmod(position_ms // 1000, 60)
    self.current_time.setText(f"{elapsed_min:02}:{elapsed_sec:02}")

    total_seconds = duration_ms // 1000
    if total_seconds > 0:
        total_min, total_sec = divmod(total_seconds, 60)
        total_text = f"{total_min:02}:{total_sec:02}"
    else:
        # Unknown or invalid duration.
        total_text = "00:00"
    self.total_time.setText(total_text)
def on_generation_complete(self, result, elapsed):
    """Handle the synthesis worker's result.

    Args:
        result: Either an ``Exception`` describing the failure, or the audio
            data, which is written to a WAV file and shown in the waveform.
        elapsed: Generation time in seconds, shown in the status label.

    On failure the status label shows the error plus a hint and the full
    traceback is printed to the console. On success the audio is written to
    ``output_<timestamp>.wav`` at 16000 Hz. In both cases the progress bar
    is reset and the generate button state is re-validated.
    """
    if isinstance(result, Exception):
        details = str(result)
        if "prompt_speech_path" in details:
            # Error related to the voice sample.
            hint = "Please load a voice sample or select a gender (if no voice sample is loaded)."
        elif "Gender must be 'male' or 'female' or None" in details:
            # Error related to the gender parameter.
            hint = "Please select a valid gender (Male or Female) if not using 'Pick Voice (Gender optional)'."
        else:
            hint = "Please check your input and try again. See console for details."
        self.status_label.setText(f"Error during speech generation: {result}. {hint}")

        print("Full Error Details:")
        # The exception arrives as an object, not inside an ``except`` block,
        # so print_exc() would have nothing to report; format the exception
        # that was actually passed in.
        traceback.print_exception(type(result), result, result.__traceback__)
        self.progress_bar.setValue(0)
    else:
        filename = f"output_{int(time.time())}.wav"
        sf.write(filename, result, samplerate=16000)
        self.current_audio_file = filename
        self.status_label.setText(
            f"Generated in {elapsed:.1f}s | Saved to {filename}")
        self.waveform.set_waveform(result)
        self.progress_bar.setValue(0)

    self.generate_btn.setEnabled(True)
    # Re-validate: the button stays enabled only if the inputs are still valid.
    self.update_generate_button_state()
def on_position_changed(self, position):
    """React to a new playback position (milliseconds).

    While the duration is known (> 0), the position is converted into a
    fraction of the duration for the waveform playhead and the time labels
    are refreshed. With an unknown duration nothing is updated.
    """
    total_ms = self.audio_player.duration()
    if total_ms <= 0:
        return
    self.update_waveform_playhead(position / total_ms)
    self.update_time_display()
def update_waveform_playhead(self, progress):
    """Forward the playback progress value to the waveform widget's playhead."""
    waveform = self.waveform
    waveform.set_playhead_progress(progress)
def on_duration_changed(self, duration):
    """Record a new media duration (milliseconds) and update dependent state.

    A positive duration starts the 200 ms refresh timer and is shown as
    mm:ss; any other value resets the total-time label and the stored
    duration to zero. The waveform widget is told the resulting duration so
    it can map clicks to seek positions.
    """
    if duration > 0:
        self.timer.start(200)
        minutes, seconds = divmod(duration // 1000, 60)
        total_text = f"{minutes:02}:{seconds:02}"
        known_duration = duration
    else:
        total_text = "00:00"
        known_duration = 0

    self.total_time.setText(total_text)
    self.audio_player_duration_ms = known_duration
    self.waveform.set_audio_duration(known_duration)
def seek_audio(self, position_ms):
    """Jump to ``position_ms`` (milliseconds), starting playback if needed.

    When the player is not playing, playback is started *before* the seek is
    applied: ``play_audio()`` may assign the media source again, which would
    discard a position that had been set beforehand.
    """
    player = self.audio_player
    if not player.isPlaying():
        self.play_audio()
    player.setPosition(position_ms)
def update_word_count(self):
    """Show the number of whitespace-separated words in the text input."""
    # str.split() without arguments ignores leading/trailing whitespace and
    # yields an empty list for blank text, so the count is 0 in that case.
    words = self.text_input.toPlainText().split()
    word_count = len(words)
    self.word_count_label.setText(f"Word Count: {word_count}")
def validate_inputs(self):
    """Tell whether synthesis has a voice source.

    Returns:
        True when a voice sample is loaded, or when the gender selector is
        on any entry other than the first ("Pick Voice (Gender optional)");
        False otherwise.
    """
    has_voice_sample = self.voice_sample is not None
    # The selector is only consulted when no voice sample is loaded.
    return has_voice_sample or self.gender_selector.currentIndex() != 0
def update_generate_button_state(self):
    """Enable 'Generate Speech' only when the inputs validate.

    When they do not, the status label explains what is missing.
    """
    can_generate = self.validate_inputs()
    self.generate_btn.setEnabled(can_generate)
    if can_generate:
        return
    self.status_label.setText(
        "Load a voice sample or select gender to enable 'Generate Speech'.")
def on_gender_changed(self, index):
    """Slot for the gender selector: re-evaluate the generate button.

    ``index`` (the new selector index) is not used directly; the button
    state is recomputed from the current widget values.
    """
    self.update_generate_button_state()
def reset_voice_sample(self):
    """Forget the loaded voice sample and refresh the dependent widgets."""
    self.voice_sample = None
    self.voice_label.setText("No voice sample loaded")
    # Without a sample, validity now depends on the gender selection alone.
    self.update_generate_button_state()
def select_voice_sample(self):
    """Ask the user for a WAV/MP3 voice sample and remember the chosen path.

    A cancelled dialog leaves the current sample untouched. The generate
    button state is re-validated in either case.
    """
    chosen_path, _selected_filter = QFileDialog.getOpenFileName(
        self, "Select Voice Sample", "", "Audio Files (*.wav *.mp3)"
    )
    if chosen_path:
        self.voice_sample = chosen_path
        sample_name = os.path.basename(chosen_path)
        self.voice_label.setText(f"Loaded voice sample: {sample_name}")
    self.update_generate_button_state()
def save_audio(self):
    """Copy the most recently generated audio file to a user-chosen location.

    Shows "No audio to save!" when nothing has been generated (or the file
    no longer exists). A cancelled dialog does nothing. Copy failures such
    as an unwritable target, or choosing the source file itself, are
    reported in the status label instead of raising out of the slot.
    """
    source_path = self.current_audio_file
    if not (source_path and os.path.exists(source_path)):
        self.status_label.setText("No audio to save!")
        return

    default_filename = f"SparkTTS_output_{int(time.time())}.wav"
    save_path, _ = QFileDialog.getSaveFileName(
        self, "Save Audio", default_filename, "WAV Files (*.wav)"
    )
    if not save_path:
        return

    try:
        shutil.copy(source_path, save_path)
    except OSError as exc:  # includes shutil.SameFileError
        self.status_label.setText(f"Could not save audio: {exc}")
        return
    self.status_label.setText(
        f"Audio saved to: {os.path.basename(save_path)}")
def play_audio(self):
    """Toggle playback of the current audio file.

    While playing, playback is paused. Otherwise, if the current audio file
    exists, playback is started. The media source is only (re)assigned when
    the file differs from the one already loaded, so resuming after a pause
    continues from the paused position instead of restarting. The play
    button text is switched between "Play" and "Pause" accordingly.
    """
    player = self.audio_player
    if player.isPlaying():
        player.pause()
        self.play_btn.setText("Play")
        return

    audio_file = self.current_audio_file
    if not (audio_file and os.path.exists(audio_file)):
        return

    # Identify the loaded media by path and modification time so that a file
    # regenerated under the same name is reloaded as well.
    source_key = (audio_file, os.path.getmtime(audio_file))
    if getattr(self, "_loaded_audio_key", None) != source_key:
        player.setSource(audio_file)
        self._loaded_audio_key = source_key
    player.play()
    self.play_btn.setText("Pause")
def pause_audio(self):
    """Pause playback; when nothing is playing, delegate to ``play_audio``."""
    if not self.audio_player.isPlaying():
        self.play_audio()
        return

    self.audio_player.pause()
    for button in (self.play_btn, self.play_btn_play):
        button.setText("Play")
def stop_audio(self):
    """Stop playback and reset button text, waveform markers and elapsed time."""
    self.audio_player.stop()
    for button in (self.play_btn, self.play_btn_play):
        button.setText("Play")

    waveform = self.waveform
    waveform.set_playback_progress_overlay(0.0)
    waveform.set_playhead_progress(0.0)
    self.current_time.setText("00:00")
def run_synthesis(self):
    """Validate the inputs and start a background worker to synthesize speech.

    Empty text or missing voice source aborts with a status message. Text
    longer than 150 words is split into segments of at most 150 words (a
    list); shorter text is passed on as a single string. With a voice sample
    loaded, gender, pitch and speed are passed as ``None``; otherwise they
    are read from the corresponding widgets. The worker's progress and
    result signals are connected to ``on_generation_progress`` and
    ``on_generation_complete``.
    """
    text = self.text_input.toPlainText().strip()
    if not text:
        self.status_label.setText("Please enter some text!")
        return
    if not self.validate_inputs():
        self.status_label.setText(
            "Load a voice sample or select gender to generate speech.")
        return

    # Segmentation: limit each segment to 150 words.
    segmentation_threshold = 150
    words = text.split()
    if len(words) > segmentation_threshold:
        text_to_process = []
        for start in range(0, len(words), segmentation_threshold):
            chunk = words[start:start + segmentation_threshold]
            text_to_process.append(' '.join(chunk))
        self.status_label.setText("Text too long: processing segments...")
        segment_total = len(text_to_process)
    else:
        text_to_process = text
        segment_total = 1
    self.progress_bar.setMaximum(segment_total)
    self.progress_bar.setValue(0)

    # A loaded voice sample takes precedence over the manual voice parameters.
    if self.voice_sample is not None:
        prompt, gender, pitch, speed = self.voice_sample, None, None, None
    else:
        prompt = None
        selected = self.gender_selector.currentText().lower()
        gender = None if selected == "pick voice (gender optional)" else selected
        speed = self.speed_slider.value()
        pitch = self.pitch_slider.value()

    # Keep the button disabled for the duration of the generation.
    self.generate_btn.setEnabled(False)
    self.status_label.setText("Generating speech...")

    # The worker is stored on the instance so it stays referenced while running.
    worker = TTSWorker(
        self.model, text_to_process, prompt, gender, pitch, speed)
    self.worker = worker
    worker.progress_update.connect(self.on_generation_progress)
    worker.result_ready.connect(self.on_generation_complete)
    worker.start()
def on_generation_progress(self, current, total):
    """Show segment progress (``current`` of ``total``) in status label and bar."""
    message = f"Generating segment {current} / {total}..."
    self.status_label.setText(message)
    self.progress_bar.setValue(current)
# AI GENERATED LANGUAGE TRANSLATIONS
translations = { # --- PASTE THE TRANSLATIONS DICTIONARY HERE ---
"English": {
"SparkTTS Studio": "SparkTTS Studio",
"enter_text": "Enter text for speech synthesis...",
"language": "Language:",
"word_count": "Word Count:",
"load_voice": "Load Voice Sample",
"reset_voice": "Reset Voice Sample",
"generate_speech": "Generate Speech",
"gender": "Gender:",
"auto": "Pick Voice (Gender optional)", # Renamed "Auto"
"male": "Male",
"female": "Female",
"pitch_label": "Pitch",
"speed_label": "Speed",
"play_button": "Play",
"stop": "Stop",
"save_audio": "Save Audio",
"model_cuda": "Model loaded on CUDA",
"pitch": "Pitch",
"speed": "Speed",
"text_input_group": "Text Input",
"synthesis_controls_group": "Synthesis Controls",
"audio_visualization_group": "Audio Visualization",
"voice_parameters_group": "Voice Parameters",
"no_voice_sample_loaded": "No voice sample loaded",
"volume_label": "Volume:",
"ready_status": "Ready",
"tray_show": "Show",
"tray_exit": "Exit",
"tray_message_title": "SparkTTS Studio",
"tray_message_text": "The application is running in the system tray",
"error_voice_sample_missing": "Please load a voice sample or select a gender (if no voice sample is loaded).",
"error_gender_invalid": "Please select a valid gender (Male or Female) if not using 'Pick Voice (Gender optional)'.",
"error_generic": "Please check your input and try again. See console for details.",
"status_generating_segment": "Generating segment {current} / {total}...",
"status_generating_speech": "Generating speech...",
"status_load_voice_sample_enable_generate": "Load a voice sample or select gender to enable 'Generate Speech'.",
"status_no_audio_to_save": "No audio to save!",
"status_please_enter_text": "Please enter some text!",
"status_text_too_long_segments": "Text too long: processing segments...",
"status_voice_sample_cleared": "Voice sample cleared.",
"status_loaded_voice_sample": "Loaded voice sample: {filename}",
"status_audio_saved_to": "Audio saved to: {filename}",
"play_button_play": "Play",
"play_button_pause": "Pause",
"pause_button": "Pause",
"stop_button": "Stop",
"generate_button": "Generate Speech",
"save_audio_button": "Save Audio",
"reset_voice_sample_status": "Voice sample cleared.",
"exit_button": "Exit",
},
"Bulgarian": {
"SparkTTS Studio": "SparkTTS Студио",
"enter_text": "Въведете текст за синтез на реч...",
"language": "Език:",
"word_count": "Брой думи:",
"load_voice": "Зареди гласов файл",
"reset_voice": "Изчисти гласов файл",
"generate_speech": "Генерирай реч",
"gender": "Пол:",
"auto": "Избери глас",
"male": "Мъжски",
"female": "Женоски",
"pitch_label": "Височина на тона",
"speed_label": "Скорост",
"play": "Пусни",
"stop": "Спри",
"save_audio": "Запази аудио",
"model_cuda": "Моделът е зареден на CUDA",
"pitch": "Височина на тона",
"speed": "Скорост",
"text_input_group": "Въвеждане на текст",
"synthesis_controls_group": "Контрол на синтеза",
"audio_visualization_group": "Визуализация на аудио",
"voice_parameters_group": "Гласови параметри",
"no_voice_sample_loaded": "Не е зареден гласов файл",
"volume_label": "Сила на звука:",
"ready_status": "Готов",
"tray_show": "Покажи",
"tray_exit": "Изход",
"tray_message_title": "SparkTTS Studio",
"tray_message_text": "Приложението работи в системния трей",
"error_voice_sample_missing": "Моля, заредете гласов файл или изберете пол (ако не е зареден гласов файл).",
"error_gender_invalid": "Моля, изберете валиден пол (Мъж или Жена), ако не използвате 'Избери глас (Пол по избор)'.",
"error_generic": "Моля, проверете въведените данни и опитайте отново. Вижте конзолата за подробности.",
"status_generating_segment": "Генериране на сегмент {current} / {total}...",
"status_generating_speech": "Генериране на реч...",
"status_load_voice_sample_enable_generate": "Заредете гласов файл или изберете пол, за да активирате 'Генерирай реч'.",
"status_no_audio_to_save": "Няма аудио за запазване!",
"status_please_enter_text": "Моля, въведете текст!",
"status_text_too_long_segments": "Текстът е твърде дълъг: обработка на сегменти...",
"status_voice_sample_cleared": "Гласовият файл е изчистен.",
"status_loaded_voice_sample": "Зареден гласов файл: {filename}",
"status_audio_saved_to": "Аудиото е запазено в: {filename}",
"play_button_play": "Пусни",
"play_button_pause": "Пауза",
"stop_button": "Спри",
"generate_button": "Генерирай реч",
"save_audio_button": "Запази аудио",
"reset_voice_sample_status": "Гласовият файл е изчистен.",
"exit_button": "Затвори", # Added translation for Exit button
"pause_button": "Пауза", # Added translation for Pause button
},
"Spanish": {
"SparkTTS Studio": "SparkTTS Studio",
"enter_text": "Introduzca texto para la síntesis de voz...",
"language": "Idioma:",
"word_count": "Recuento de palabras:",
"load_voice": "Cargar muestra de voz",
"reset_voice": "Restablecer muestra de voz",
"generate_speech": "Generar voz",
"gender": "Género:",
# Renamed "Auto" - Example translation, verify!
"auto": "Elegir voz (Género opcional)",
"male": "Masculino",
"female": "Femenino",
"pitch_label": "Tono",
"speed_label": "Velocidad",
"play": "Reproducir",
"stop": "Detener",
"save_audio": "Guardar audio",
"model_cuda": "Modelo cargado en CUDA",
"pitch": "Tono",
"speed": "Velocidad",
"text_input_group": "Entrada de texto",
"synthesis_controls_group": "Controles de síntesis",
"audio_visualization_group": "Visualización de audio",
"voice_parameters_group": "Parámetros de voz",
"no_voice_sample_loaded": "No se ha cargado ninguna muestra de voz",
"volume_label": "Volumen:",
"ready_status": "Listo",
"tray_show": "Mostrar",
"tray_exit": "Salir",
"tray_message_title": "SparkTTS Studio",
"tray_message_text": "La aplicación se está ejecutando en la bandeja del sistema",
"error_voice_sample_missing": "Por favor, cargue una muestra de voz o seleccione un género (si no se carga ninguna muestra de voz).",
"error_gender_invalid": "Por favor, seleccione un género válido (Masculino o Femenino) si no utiliza 'Elegir voz (Género opcional)'.",
"error_generic": "Por favor, revise su entrada e inténtelo de nuevo. Consulte la consola para obtener más detalles.",
"status_generating_segment": "Generando segmento {current} / {total}...",
"status_generating_speech": "Generando voz...",
"status_load_voice_sample_enable_generate": "Cargue una muestra de voz o seleccione un género para activar 'Generar voz'.",
"status_no_audio_to_save": "¡No hay audio para guardar!",
"status_please_enter_text": "¡Por favor, introduzca algún texto!",
"status_text_too_long_segments": "Texto demasiado largo: procesando segmentos...",
"status_voice_sample_cleared": "Muestra de voz borrada.",
"status_loaded_voice_sample": "Muestra de voz cargada: {filename}",
"status_audio_saved_to": "Audio guardado en: {filename}",
"play_button_play": "Reproducir",
"play_button_pause": "Pausa",
"stop_button": "Detener",
"generate_button": "Generar voz",
"save_audio_button": "Guardar audio",
"reset_voice_sample_status": "Muestra de voz borrada.",
"exit_button": "Salir", # Added translation for Exit button
"pause_button": "Pausa", # Added translation for Pause button
},
"French": {
"SparkTTS Studio": "SparkTTS Studio",
"enter_text": "Entrez du texte pour la synthèse vocale...",
"language": "Langue:",
"word_count": "Nombre de mots:",
"load_voice": "Charger un échantillon de voix",
"reset_voice": "Réinitialiser l'échantillon vocal",
"generate_speech": "Générer la parole",
"gender": "Genre:",
# Renamed "Auto" - Example translation, verify!
"auto": "Choisir une voix (Genre optionnel)",
"male": "Masculin",
"female": "Féminin",
"pitch_label": "Hauteur",
"speed_label": "Vitesse",
"play": "Lecture",
"stop": "Arrêter",
"save_audio": "Enregistrer l'audio",
"model_cuda": "Modèle chargé sur CUDA",
"pitch": "Hauteur",
"speed": "Vitesse",
"text_input_group": "Saisie de texte",
"synthesis_controls_group": "Contrôles de synthèse",
"audio_visualization_group": "Visualisation audio",
"voice_parameters_group": "Paramètres vocaux",
"no_voice_sample_loaded": "Aucun échantillon vocal chargé",
"volume_label": "Volume:",
"ready_status": "Prêt",
"tray_show": "Afficher",
"tray_exit": "Quitter",
"tray_message_title": "SparkTTS Studio",
"tray_message_text": "L'application fonctionne dans la barre des tâches",
"error_voice_sample_missing": "Veuillez charger un échantillon vocal ou sélectionner un genre (si aucun échantillon vocal n'est chargé).",
"error_gender_invalid": "Veuillez sélectionner un genre valide (Masculin ou Féminin) si vous n'utilisez pas 'Choisir une voix (Genre optionnel)'.",
"error_generic": "Veuillez vérifier votre saisie et réessayer. Consultez la console pour plus de détails.",
"status_generating_segment": "Génération du segment {current} / {total}...",
"status_generating_speech": "Génération de la parole...",
"status_load_voice_sample_enable_generate": "Chargez un échantillon vocal ou sélectionnez un genre pour activer 'Générer la parole'.",
"status_no_audio_to_save": "Aucun audio à enregistrer !",
"status_please_enter_text": "Veuillez saisir du texte !",
"status_text_too_long_segments": "Texte trop long : traitement des segments...",
"status_voice_sample_cleared": "Échantillon vocal effacé.",
"status_loaded_voice_sample": "Échantillon vocal chargé : {filename}",
"status_audio_saved_to": "Audio enregistré dans : {filename}",
"play_button_play": "Lecture",
"play_button_pause": "Pause",
"stop_button": "Arrêter",
"generate_button": "Générer la parole",
"save_audio_button": "Enregistrer l'audio",
"reset_voice_sample_status": "Échantillon vocal effacé.",
"exit_button": "Quitter", # Added translation for Exit button
"pause_button": "Pause", # Added translation for Pause button
},
"Japanese": {
"SparkTTS Studio": "SparkTTS Studio",
"enter_text": "音声合成のためのテキストを入力してください…",
"language": "言語:",
"word_count": "単語数:",
"load_voice": "音声サンプルを読み込む",
"reset_voice": "音声サンプルをリセット",
"generate_speech": "音声を生成",
"gender": "性別:",
# Renamed "Auto" - Example translation, verify!
"auto": "音声を選択 (性別はオプション)",
"male": "男性",
"female": "女性",
"pitch_label": "ピッチ",
"speed_label": "速度",
"play": "再生",
"stop": "停止",
"save_audio": "音声を保存",
"model_cuda": "CUDAでモデルが読み込まれました",
"pitch": "ピッチ",
"speed": "速度",
"text_input_group": "テキスト入力",
"synthesis_controls_group": "合成コントロール",
"audio_visualization_group": "オーディオ可視化",
"voice_parameters_group": "音声パラメータ",
"no_voice_sample_loaded": "音声サンプルはロードされていません",
"volume_label": "音量:",
"ready_status": "準備完了",
"tray_show": "表示",
"tray_exit": "終了",
"tray_message_title": "SparkTTS Studio",
"tray_message_text": "アプリケーションはシステムトレイで実行されています",
"error_voice_sample_missing": "音声サンプルをロードするか、性別を選択してください(音声サンプルがロードされていない場合)。",
"error_gender_invalid": "'音声を選択(性別はオプション)'を使用しない場合は、有効な性別(男性または女性)を選択してください。",
"error_generic": "入力内容を確認して、もう一度お試しください。詳細については、コンソールを参照してください。",
"status_generating_segment": "セグメント {current} / {total} を生成中...",
"status_generating_speech": "音声を生成中...",
"status_load_voice_sample_enable_generate": "音声サンプルをロードするか、性別を選択して「音声を生成」を有効にしてください。",
"status_no_audio_to_save": "保存するオーディオはありません!",
"status_please_enter_text": "テキストを入力してください!",
"status_text_too_long_segments": "テキストが長すぎます: セグメントを処理中...",
"status_voice_sample_cleared": "音声サンプルをクリアしました。",
"status_loaded_voice_sample": "音声サンプルをロードしました: {filename}",
"status_audio_saved_to": "オーディオを保存しました: {filename}",
"play_button_play": "再生",
"play_button_pause": "一時停止",
"stop_button": "停止",
"generate_button": "音声を生成",
"save_audio_button": "音声を保存",
"reset_voice_sample_status": "音声サンプルをクリアしました。",
"exit_button": "終了", # Added translation for Exit button
"pause_button": "一時停止", # Added translation for Pause button
}
}
def update_gui_language(self):
    """Re-label the GUI widgets for the currently selected language.

    The language name is read from ``self.gui_language_selector``. An
    unknown language falls back to the English table, and any key that is
    missing from the selected table is taken from English, so a partially
    translated language cannot abort the update half-way with ``KeyError``.

    The Play button receives the translated ``play_button_play`` text; it
    is no longer overwritten with a hard-coded English "Play".
    """
    english = self.translations["English"]
    selected_lang = self.gui_language_selector.currentText()
    # English is the base layer; the selected language overrides it.
    t = {**english, **self.translations.get(selected_lang, english)}

    # Window title and language picker label.
    self.setWindowTitle(t["SparkTTS Studio"])
    self.language_label.setText(t["language"])

    # "Text Input" group.
    self.input_group.setTitle(t["text_input_group"])
    self.text_input.setPlaceholderText(t["enter_text"])

    # "Synthesis Controls" group.
    self.controls_group.setTitle(t["synthesis_controls_group"])
    self.voice_btn_load_voice.setText(t["load_voice"])
    # NOTE(review): this unconditionally shows the "no voice sample" text,
    # even when a sample may currently be loaded -- confirm this is wanted.
    self.voice_label_status.setText(t["no_voice_sample_loaded"])

    # "Voice Parameters" group; combo box items are updated by position
    # (0 = auto, 1 = male, 2 = female).
    self.gender_box.setTitle(t["voice_parameters_group"])
    self.gender_selector_label_widget.setText(t["gender"])
    for i, item_text in enumerate([t["auto"], t["male"], t["female"]]):
        self.gender_selector.setItemText(i, item_text)
    self.pitch_box.setTitle(t["pitch_label"])
    self.speed_box.setTitle(t["speed_label"])

    # "Audio Visualization" group and playback controls.
    self.vis_group.setTitle(t["audio_visualization_group"])
    self.volume_label_widget.setText(t["volume_label"])
    self.play_btn_play.setText(t["play_button_play"])
    self.play_btn_stop.setText(t["stop_button"])

    # Action buttons.
    self.generate_btn_generate.setText(t["generate_button"])
    self.save_btn_save.setText(t["save_audio_button"])
    self.exit_btn_main_window.setText(t["exit_button"])

    # NOTE(review): the bottom status label is reset to the "ready" text
    # regardless of what it showed before -- confirm this is wanted.
    self.status_label_bottom.setText(t["ready_status"])

    # The tray balloon message (tray_message_title / tray_message_text)
    # is not re-shown on a language change.

    # Refresh the Generate button state after the language change.
    self.update_generate_button_state()
# --- TTSWorker class (no changes needed) ---
class TTSWorker(QThread):
    """QThread subclass whose ``run`` performs SparkTTS inference.

    ``text`` may be a single string or a list of strings (segments). A list
    is synthesized segment by segment and the pieces are concatenated along
    axis 0 with ``np.concatenate``.

    Signals:
        result_ready(object, float): the final audio and the elapsed time
            in seconds, or ``(exception, 0)`` if anything in ``run`` raised.
        progress_update(int, int): ``(segments_done, total_segments)``;
            emitted as ``(1, 1)`` for non-list input.
    """

    result_ready = Signal(object, float)
    progress_update = Signal(int, int)

    def __init__(self, model, text, voice_sample, gender, pitch, speed):
        """Store the model and the synthesis parameters used by ``run``."""
        super().__init__()
        self.model, self.text = model, text
        self.voice_sample, self.gender = voice_sample, gender
        self.pitch, self.speed = pitch, speed

    def _infer(self, text):
        """Run one ``model.inference`` call under ``torch.no_grad()``."""
        with torch.no_grad():
            return self.model.inference(
                text,
                prompt_speech_path=self.voice_sample,
                gender=self.gender,
                pitch=self.pitch,
                speed=self.speed,
            )

    def run(self):
        """Synthesize ``self.text`` and report the outcome via signals."""
        started_at = time.time()
        try:
            if not isinstance(self.text, list):
                # Single piece of text: one inference call, one progress tick.
                audio = self._infer(self.text)
                self.progress_update.emit(1, 1)
            else:
                segment_count = len(self.text)
                chunks = []
                for done, segment in enumerate(self.text, start=1):
                    chunks.append(self._infer(segment))
                    self.progress_update.emit(done, segment_count)
                audio = np.concatenate(chunks, axis=0)
            self.result_ready.emit(audio, time.time() - started_at)
        except Exception as error:
            # Failures travel through the same signal as results; the
            # receiver has to tell an exception apart from audio.
            self.result_ready.emit(error, 0)
if __name__ == "__main__":
    # Create the Qt application and apply the stylesheet app-wide.
    app = QApplication(sys.argv)
    app.setStyleSheet(STYLE_SHEET)

    # Use CUDA when torch reports it as available, otherwise the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = SparkTTS("pretrained_models/Spark-TTS-0.5B", device=device)

    # The window receives the model and the upper-cased device type.
    window = SparkTTSApp(model, device.type.upper())
    window.show()

    # Run the event loop and use its return value as the process exit code.
    exit_code = app.exec()
    sys.exit(exit_code)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment