
@efemaer
Last active March 20, 2025 15:22
# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech
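# 3. Call the deployed endpoint (sketch only; `beam deploy` prints the real
#    URL and auth token, the values below are placeholders):
#
#    curl -X POST <YOUR_ENDPOINT_URL> \
#      -H 'Authorization: Bearer <YOUR_BEAM_AUTH_TOKEN>' \
#      -H 'Content-Type: application/json' \
#      -d '{"text": "Hello from Kokoro!", "voice": "af_heart"}'
#
#    The response is {"output_url": "..."} with a time-limited MP3 link.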

from beam import endpoint, env, Image, Output


if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    # "a" selects Kokoro's American English pipeline; run inference on the first GPU
    pipeline = KPipeline("a", device="cuda:0")
    # Preload the voices the endpoint exposes so requests don't pay the load cost
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # Minimum resources required for Kokoro: 1 CPU, 1 worker, 4GB RAM
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)  # yields chunks; each result.audio is a float tensor

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Add proper MP3 header
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
        for result in generator:
            # Convert tensor to bytes and scale to 16-bit PCM format
            audio_bytes = (
                (result.audio.cpu().numpy() * 32767)
                .clip(-32768, 32767)
                .astype("int16")
                .tobytes()
            )
            ffmpeg_proc.stdin.write(audio_bytes)

        ffmpeg_proc.stdin.close()
        ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    # Popen never raises CalledProcessError, so check the exit code directly
    if ffmpeg_proc.returncode != 0:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    # Drop the local reference; the model itself stays loaded via on_start for reuse
    del pipeline

    return {"output_url": public_url}
@etrotta

etrotta commented Feb 18, 2025

I would replace this for readability:

- for _, (_, _, audio) in enumerate(generator):
+ for result in generator:
+     audio = result.audio 

(or at least just for _, _, audio in generator:)

You could also use the standard library wave package instead of soundfile if you want to keep the image size as low as possible.
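For reference, a minimal sketch of that wave-based approach, assuming the same 24 kHz mono int16 chunks as the ffmpeg version (generator as in the gist above; the output path is just an example):

import wave

# Write Kokoro's 24 kHz mono output using only the standard library,
# avoiding the soundfile dependency entirely.
with wave.open("/tmp/kokoro_tts_out.wav", "wb") as wav_file:
    wav_file.setnchannels(1)      # mono
    wav_file.setsampwidth(2)      # 16-bit PCM
    wav_file.setframerate(24000)  # Kokoro's native sample rate
    for result in generator:
        audio_bytes = (
            (result.audio.cpu().numpy() * 32767)
            .clip(-32768, 32767)
            .astype("int16")
            .tobytes()
        )
        wav_file.writeframes(audio_bytes)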

@AcTePuKc

AcTePuKc commented Feb 19, 2025

from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("am_echo")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])  
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu="A10G",
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice="am_echo")

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "192k",  # Bitrate
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                ffmpeg_proc.stdin.write(result.audio)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}


With this code you need ffmpeg, but it's faster and better than pydub for exporting to MP3.
Also fixed the WAV buffer loss issues: the audio now streams directly into ffmpeg, with no need to concatenate WAV files.
The WAV-writing loop is removed, and wave is replaced with this streaming approach for better performance.

@chandradeepc

chandradeepc commented Feb 19, 2025


from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@chandradeepc

FFmpeg needs raw bytes, not a tensor.

@efemaer
Author

efemaer commented Feb 19, 2025

Thank you @chandradeepc @AcTePuKc @etrotta for the input! Updated the code.

@AcTePuKc

My only question is: is it working as intended? Thanks to the previous commenter for fixing it, of course :)

@efemaer
Author

efemaer commented Feb 19, 2025

Yes, tested just now; it works as expected.

@chandradeepc

chandradeepc commented Feb 20, 2025

Fixed a bug with the MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources Kokoro requires; that minimum config costs $0.8/hr.



from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Add proper MP3 header
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@efemaer
Author

efemaer commented Feb 20, 2025

Thanks for the input, @chandradeepc! Updated the gist to reflect that as an option.
