Last active
February 19, 2025 15:50
-
-
Save TomAugspurger/ac306010252be09b9a5091952d34d268 to your computer and use it in GitHub Desktop.
kvikio / moto slow benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Sample demonstrating slowness of cudf.read_parquet with kvikio from S3. | |
# Instructions | |
Download the dev wheels (skip if you have them built locally; requires VPN access) | |
curl -LO https://downloads.rapids.ai/ci/cudf/pull-request/17957/793d0b2/cudf_wheel_python_cudf_cu12_py312_x86_64.tar.gz | |
curl -LO https://downloads.rapids.ai/ci/cudf/pull-request/17957/793d0b2/cudf_wheel_cpp_libcudf_cu12_x86_64.tar.gz | |
tar xvf cudf_wheel*.tar.gz | |
# Install the dev dependencies | |
(remove the cudf / libcudf wheels if you've built them locally) | |
uv pip install \ | |
--extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \ | |
--prerelease allow \ | |
moto[s3,server] \ | |
s3fs \ | |
kvikio-cu12 \ | |
cudf_cu12-25.4.0a201-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl \ | |
libcudf_cu12-25.4.0a201-py3-none-manylinux_2_28_x86_64.whl | |
Run the tests | |
``` | |
CUDF_KVIKIO_REMOTE_IO=OFF AWS_ACCESS_KEY_ID=testing AWS_SECRET_ACCESS_KEY=testing AWS_DEFAULT_REGION=us-east-1 AWS_ENDPOINT_URL="http://localhost:5000" python io-test.py | |
``` | |
which takes ~0.8s. With kvikio | |
``` | |
CUDF_KVIKIO_REMOTE_IO=ON AWS_ACCESS_KEY_ID=testing AWS_SECRET_ACCESS_KEY=testing AWS_DEFAULT_REGION=us-east-1 AWS_ENDPOINT_URL="http://localhost:5000" python io-test.py | |
``` | |
which takes 6--7s. | |
""" | |
import pathlib | |
import time | |
import cudf | |
from moto.moto_server.werkzeug_app import ( | |
DomainDispatcherApplication, | |
create_backend_app, | |
) | |
from moto.server import ThreadedMotoServer | |
from werkzeug.serving import WSGIRequestHandler, make_server | |
class SlowHandler(WSGIRequestHandler):
    """WSGI request handler that sleeps before handling, to mimic latency.

    The pause happens once per call to ``handle``, before control is passed
    to the parent ``WSGIRequestHandler.handle``.
    """

    # Seconds to sleep before delegating to the parent handler. Override on a
    # subclass to benchmark with a different simulated latency.
    delay_seconds: float = 0.1

    def handle(self):
        """Sleep for ``delay_seconds`` and then run the parent handler."""
        time.sleep(self.delay_seconds)
        return super().handle()
class SlowMotoServer(ThreadedMotoServer):
    """Threaded moto server whose requests go through ``SlowHandler``."""

    def _server_entry(self) -> None:
        """Build the WSGI server, signal readiness, and serve until shut down."""
        dispatcher = DomainDispatcherApplication(create_backend_app)
        slow_server = make_server(
            self._ip_address,
            self._port,
            dispatcher,
            threaded=True,
            request_handler=SlowHandler,
        )
        self._server = slow_server
        # Signal readiness only after the server object exists.
        self._server_ready_event.set()
        self._server.serve_forever()
def make_data() -> list[str]:
    """Create the ``data`` bucket and upload twelve small parquet files.

    The bucket is created through a boto3 client pointed at
    ``http://127.0.0.1:5000``; the files are written with
    ``cudf.DataFrame.to_parquet`` to ``s3://data/<key>``.

    Returns:
        The object keys that were written (``"00.parquet"`` through
        ``"11.parquet"``), as plain strings relative to the bucket.
    """
    import boto3

    client = boto3.client("s3", endpoint_url="http://127.0.0.1:5000")
    client.create_bucket(Bucket="data")

    # Every file has identical contents, so build the frame once.
    frame = cudf.DataFrame({"A": list(range(10))})

    paths = []
    for i in range(12):
        key = f"{i:0>2d}.parquet"
        frame.to_parquet(f"s3://data/{key}")
        paths.append(key)
    return paths
def main():
    """Start the slow moto server, upload data, and time ``cudf.read_parquet``.

    Only the ``cudf.read_parquet`` call is inside the timed region; the
    elapsed time is printed in seconds with two decimals.
    """
    server = SlowMotoServer()
    server.start()
    try:
        paths = make_data()
        urls = [f"s3://data/{p}" for p in paths]

        t0 = time.monotonic()
        cudf.read_parquet(urls)
        t1 = time.monotonic()
        print(f"{t1 - t0:0.2f}s")
    finally:
        # Shut the server down even if data setup or the read fails.
        server.stop()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment