# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech
from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # 1 CPU, 1 worker, 4GB RAM is the minimum resource config for Kokoro
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "48k",  # Bitrate
        "-write_xing", "0",  # Disable the Xing header (its duration info is wrong for piped input)
        "-write_id3v2", "1",  # Add an ID3v2 header for file recognition
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            if ffmpeg_proc.wait() != 0:  # Ensure ffmpeg finished encoding cleanly
                return {"error": "Failed to convert audio to MP3"}
    except BrokenPipeError:
        # Popen never raises CalledProcessError; a failed ffmpeg surfaces as a broken pipe
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    del pipeline
    return {"output_url": public_url}
from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("am_echo")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu="A10G",
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value
    text = inputs.pop("text", None)
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice="am_echo")
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "192k",  # Bitrate
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # NOTE: result.audio is a torch tensor here; ffmpeg needs raw
                # bytes (this is the bug pointed out and fixed further below)
                ffmpeg_proc.stdin.write(result.audio)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    # Cleanup pipeline memory (optional)
    del pipeline
    return {"output_url": public_url}
With this code you need ffmpeg, but it's faster and better than pydub for exporting to MP3.
It also fixes the WAV buffer loss issues: the audio now streams directly into ffmpeg, so there is no need to concatenate WAV files.
The per-chunk loop over wave is removed and replaced with this streaming approach for better performance.
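To see the pipe technique in isolation, here is a minimal standalone sketch that feeds a synthetic sine tone (standing in for Kokoro's output) through the same raw-PCM-to-MP3 ffmpeg invocation:

import subprocess
import numpy as np

# One second of a 440 Hz tone at 24 kHz mono, scaled to 16-bit PCM
t = np.linspace(0, 1, 24000, endpoint=False)
pcm = (np.sin(2 * np.pi * 440 * t) * 32767).astype("int16").tobytes()

cmd = [
    "ffmpeg", "-y",
    "-f", "s16le", "-ar", "24000", "-ac", "1",  # raw PCM input format
    "-i", "pipe:0",  # read from stdin
    "-codec:a", "libmp3lame", "-b:a", "192k",  # encode to MP3
    "tone.mp3",
]
with subprocess.Popen(cmd, stdin=subprocess.PIPE) as proc:
    proc.stdin.write(pcm)
    proc.stdin.close()
    proc.wait()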
from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "48k",  # Bitrate
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    # Cleanup pipeline memory (optional)
    del pipeline
    return {"output_url": public_url}
ffmpeg needs raw bytes, not a tensor.
Thank you @chandradeepc @AcTePuKc @etrotta for the input! Updated the code.
My only question is: is it working as intended? Thanks to the previous commenter for fixing it, of course :)
Yes, tested it just now; it works as expected.
Fixed a bug with MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources Kokoro requires; that minimum config costs $0.8/hr (a sketch of the minimal decorator follows the code below).
from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "48k",  # Bitrate
        "-write_xing", "0",  # Disable the Xing header (its duration info is wrong for piped input)
        "-write_id3v2", "1",  # Add an ID3v2 header for file recognition
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    # Cleanup pipeline memory (optional)
    del pipeline
    return {"output_url": public_url}
Thanks for the input, @chandradeepc! Updated the gist to reflect that as an option.
I would replace this for readability (or at least just use for _, _, audio in generator:). You could also use the standard library wave package instead of soundfile if you want to keep the image size as low as possible.
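A sketch of that refactor applied to the encoding loop; KPipeline yields (graphemes, phonemes, audio) tuples, so the first two fields can simply be discarded:

with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
    # Unpack the (graphemes, phonemes, audio) tuples the pipeline yields
    for _, _, audio in generator:
        audio_bytes = (
            (audio.cpu().numpy() * 32767)
            .clip(-32768, 32767)
            .astype("int16")
            .tobytes()
        )
        ffmpeg_proc.stdin.write(audio_bytes)
    ffmpeg_proc.stdin.close()
    ffmpeg_proc.wait()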