
@efemaer
Last active March 20, 2025 15:22
# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech
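# 3. Call the deployed endpoint (sketch only; `beam deploy` prints the real
#    URL and auth token, the values below are placeholders):
#
#    curl -X POST <YOUR_ENDPOINT_URL> \
#      -H 'Authorization: Bearer <YOUR_BEAM_AUTH_TOKEN>' \
#      -H 'Content-Type: application/json' \
#      -d '{"text": "Hello from Kokoro!", "voice": "af_heart"}'
#
#    The response is {"output_url": "..."} with a time-limited MP3 link.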

from beam import endpoint, env, Image, Output


if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    # "a" selects Kokoro's American English pipeline; run inference on the first GPU
    pipeline = KPipeline("a", device="cuda:0")
    # Preload the voices the endpoint exposes so requests don't pay the load cost
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # Minimum resources required for Kokoro: 1 CPU, 1 worker, 4GB RAM
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)  # yields chunks; each result.audio is a float tensor

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Add proper MP3 header
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
        for result in generator:
            # Convert tensor to bytes and scale to 16-bit PCM format
            audio_bytes = (
                (result.audio.cpu().numpy() * 32767)
                .clip(-32768, 32767)
                .astype("int16")
                .tobytes()
            )
            ffmpeg_proc.stdin.write(audio_bytes)

        ffmpeg_proc.stdin.close()
        ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    # Popen never raises CalledProcessError, so check the exit code directly
    if ffmpeg_proc.returncode != 0:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    # Drop the local reference; the model itself stays loaded via on_start for reuse
    del pipeline

    return {"output_url": public_url}
@etrotta

etrotta commented Feb 18, 2025

I would replace this for readability:

- for _, (_, _, audio) in enumerate(generator):
+ for result in generator:
+     audio = result.audio 

(or at least just for _, _, audio in generator:)

You could also use the standard library wave package instead of soundfile if you want to keep the image size as low as possible.
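For reference, a minimal sketch of that wave-based approach, assuming the same 24 kHz mono int16 chunks as the ffmpeg version (generator as in the gist above; the output path is just an example):

import wave

# Write Kokoro's 24 kHz mono output using only the standard library,
# avoiding the soundfile dependency entirely.
with wave.open("/tmp/kokoro_tts_out.wav", "wb") as wav_file:
    wav_file.setnchannels(1)      # mono
    wav_file.setsampwidth(2)      # 16-bit PCM
    wav_file.setframerate(24000)  # Kokoro's native sample rate
    for result in generator:
        audio_bytes = (
            (result.audio.cpu().numpy() * 32767)
            .clip(-32768, 32767)
            .astype("int16")
            .tobytes()
        )
        wav_file.writeframes(audio_bytes)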

@AcTePuKc

AcTePuKc commented Feb 19, 2025

from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("am_echo")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])  
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu="A10G",
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice="am_echo")

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f", "s16le",  # Raw PCM 16-bit little-endian
        "-ar", "24000",  # Sample rate
        "-ac", "1",  # Mono audio
        "-i", "pipe:0",  # Read from stdin (raw audio)
        "-codec:a", "libmp3lame",  # MP3 codec
        "-b:a", "192k",  # Bitrate
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                ffmpeg_proc.stdin.write(result.audio)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}


With this code you need ffmpeg, but it's faster and better than pydub for exporting to MP3.
Also fixed the WAV buffer loss issues: the audio now streams directly into ffmpeg, with no need to concatenate WAV files.
The WAV-writing loop is removed, and wave is replaced with this streaming approach for better performance.

@chandradeepc

chandradeepc commented Feb 19, 2025


from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=1,
    memory="8Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@chandradeepc

FFmpeg needs raw bytes, not a tensor.

@efemaer
Author

efemaer commented Feb 19, 2025

Thank you @chandradeepc @AcTePuKc @etrotta for the input! Updated the code.

@AcTePuKc

My only question is: is it working as intended? Thanks to the previous commenter for fixing it, of course :)

@efemaer
Author

efemaer commented Feb 19, 2025

Yes, tested just now; it works as expected.

@chandradeepc

chandradeepc commented Feb 20, 2025

Fixed a bug with the MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources Kokoro requires; that minimum config costs $0.8/hr.



from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline


kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Add proper MP3 header
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert tensor to bytes and scale to 16-bit PCM format
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

    except subprocess.CalledProcessError:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@efemaer
Author

efemaer commented Feb 20, 2025

Thanks for the input, @chandradeepc! Updated the gist to reflect that as an option.
