Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Last active February 19, 2025 15:50
Show Gist options
  • Save TomAugspurger/ac306010252be09b9a5091952d34d268 to your computer and use it in GitHub Desktop.
Save TomAugspurger/ac306010252be09b9a5091952d34d268 to your computer and use it in GitHub Desktop.
kvikio / moto slow benchmark
"""
Sample demonstrating slowness of cudf.read_parquet with kvikio from S3.
# Instructions
Download the dev wheels (skip if you have them built locally; requires VPN access)
curl -LO https://downloads.rapids.ai/ci/cudf/pull-request/17957/793d0b2/cudf_wheel_python_cudf_cu12_py312_x86_64.tar.gz
curl -LO https://downloads.rapids.ai/ci/cudf/pull-request/17957/793d0b2/cudf_wheel_cpp_libcudf_cu12_x86_64.tar.gz
tar xvf cudf_wheel*.tar.gz
# Install the dev dependencies
(remove the cudf / libcudf wheels if you've built them locally)
uv pip install \
--extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
--prerelease allow \
moto[s3,server] \
s3fs \
kvikio-cu12 \
cudf_cu12-25.4.0a201-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl \
libcudf_cu12-25.4.0a201-py3-none-manylinux_2_28_x86_64.whl
Run the tests
```
CUDF_KVIKIO_REMOTE_IO=OFF AWS_ACCESS_KEY_ID=testing AWS_SECRET_ACCESS_KEY=testing AWS_DEFAULT_REGION=us-east-1 AWS_ENDPOINT_URL="http://localhost:5000" python io-test.py
```
which takes ~0.8s. With kvikio remote I/O enabled:
```
CUDF_KVIKIO_REMOTE_IO=ON AWS_ACCESS_KEY_ID=testing AWS_SECRET_ACCESS_KEY=testing AWS_DEFAULT_REGION=us-east-1 AWS_ENDPOINT_URL="http://localhost:5000" python io-test.py
```
which takes 6--7s.
"""
import pathlib
import time
import cudf
from moto.moto_server.werkzeug_app import (
DomainDispatcherApplication,
create_backend_app,
)
from moto.server import ThreadedMotoServer
from werkzeug.serving import WSGIRequestHandler, make_server
class SlowHandler(WSGIRequestHandler):
    """WSGI request handler that injects artificial latency.

    Every call to :meth:`handle` sleeps for ``delay_seconds`` before
    delegating to ``WSGIRequestHandler.handle``, which makes the local mock
    server respond slowly.
    """

    # Seconds to sleep before delegating to the stock handler. Subclasses may
    # override this to simulate a faster or slower server.
    delay_seconds: float = 0.1

    def handle(self):
        """Sleep for ``delay_seconds``, then run the regular handler."""
        time.sleep(self.delay_seconds)
        return super().handle()
class SlowMotoServer(ThreadedMotoServer):
    """``ThreadedMotoServer`` whose WSGI server handles requests with ``SlowHandler``."""

    def _server_entry(self) -> None:
        """Create the WSGI server, signal readiness, and serve until shut down.

        The server is stored on ``self._server`` before the ready event is
        set, so it is already available once the event fires.
        """
        dispatcher = DomainDispatcherApplication(create_backend_app)
        wsgi_server = make_server(
            self._ip_address,
            self._port,
            dispatcher,
            threaded=True,
            request_handler=SlowHandler,
        )
        self._server = wsgi_server
        self._server_ready_event.set()
        wsgi_server.serve_forever()
def make_data() -> list[str]:
    """Create the ``data`` bucket and upload twelve small parquet files.

    The bucket is created through a boto3 client pointed at
    ``http://127.0.0.1:5000``; the files are written with
    ``cudf.DataFrame.to_parquet`` using ``s3://data/<key>`` URLs.

    Returns:
        The object keys (``"00.parquet"`` through ``"11.parquet"``) in the
        order they were written. These are plain strings, not
        ``pathlib.Path`` objects.
    """
    import boto3

    client = boto3.client("s3", endpoint_url="http://127.0.0.1:5000")
    client.create_bucket(Bucket="data")

    # Every object holds the same ten-row frame, so build it only once.
    frame = cudf.DataFrame({"A": list(range(10))})

    paths = [f"{i:0>2d}.parquet" for i in range(12)]
    for key in paths:
        frame.to_parquet(f"s3://data/{key}")
    return paths
def main():
    """Start the slow mock S3 server, upload sample data, and time one read.

    Prints the wall-clock duration of a single ``cudf.read_parquet`` call
    over all uploaded files, formatted as ``"<seconds>s"`` with two decimals.
    The server is stopped afterwards, even if setup or the read raises.
    """
    server = SlowMotoServer()
    server.start()
    try:
        paths = make_data()
        urls = [f"s3://data/{p}" for p in paths]

        # Only the read itself is timed; server start-up and uploads are not.
        t0 = time.monotonic()
        cudf.read_parquet(urls)
        t1 = time.monotonic()
        print(f"{t1 - t0:0.2f}s")
    finally:
        # Shut the background server thread down instead of leaking it.
        server.stop()
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment