
@mariosasko
Created June 4, 2025 00:20
Buffer protocol tokenizers benchmark
import time
import numpy as np
import pyarrow as pa
import torch
import tqdm
from tokenizers import Tokenizer
from datasets import load_dataset

# dataset = load_dataset("parquet", data_files="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/main/data/CC-MAIN-2013-20/train-00000-of-00014.parquet", split="train").select(range(100000))
# dataset = load_dataset("AI-MO/NuminaMath-1.5", split="train")
dataset = load_dataset("amphora/QwQ-LongCoT-130K", split="train")

tokenizer = Tokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

batch_size = 50
total_time_buffer = 0
total_time_no_buffer = 0
total_length = 0

tqdm_total = (len(dataset) + batch_size - 1) // batch_size  # number of batches (ceil division)
for batch in tqdm.tqdm(dataset.iter(batch_size=batch_size), total=tqdm_total):
    encodings = tokenizer.encode_batch(batch["qwq"])  # Update the column name depending on the dataset
    total_length += sum(len(encoding.ids) for encoding in encodings)
    # Uncomment to test PyArrow conversion
    # lengths = []
    # batch_ids = []
    # batch_attention_masks = []
    # start_time_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = memoryview(encoding.ids_buffer)
    #     length = len(ids)
    #     ids = pa.py_buffer(memoryview(encoding.ids_buffer))
    #     attention_mask = pa.py_buffer(memoryview(encoding.attention_mask_buffer))
    #     lengths.append(length)
    #     batch_ids.append(pa.Array.from_buffers(pa.uint32(), length, [None, ids]))
    #     batch_attention_masks.append(pa.Array.from_buffers(pa.uint32(), length, [None, attention_mask]))
    # offsets = np.concatenate(([0], np.cumsum(lengths, dtype=np.int32)))
    # batch_ids = pa.ListArray.from_arrays(offsets, pa.concat_arrays(batch_ids))
    # batch_attention_masks = pa.ListArray.from_arrays(offsets, pa.concat_arrays(batch_attention_masks))
    # total_time_buffer += time.perf_counter() - start_time_buffer

    # batch_ids = []
    # batch_attention_masks = []
    # start_time_no_buffer = time.perf_counter()
    # for encoding in encodings:
    #     batch_ids.append(encoding.ids)
    #     batch_attention_masks.append(encoding.attention_mask)
    # batch_ids = pa.array(batch_ids, type=pa.list_(pa.uint32()))
    # batch_attention_masks = pa.array(batch_attention_masks, type=pa.list_(pa.uint32()))
    # total_time_no_buffer += time.perf_counter() - start_time_no_buffer
    # Uncomment to test PyTorch conversion
    # lengths = []
    # batch_ids = []
    # batch_attention_masks = []
    # batch_special_token_masks = []
    # batch_type_ids = []
    # start_time_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = torch.frombuffer(memoryview(encoding.ids_buffer), dtype=torch.uint32)
    #     attention_mask = torch.frombuffer(memoryview(encoding.attention_mask_buffer), dtype=torch.uint32)
    #     special_token_mask = torch.frombuffer(memoryview(encoding.special_tokens_mask_buffer), dtype=torch.uint32)
    #     type_ids = torch.frombuffer(memoryview(encoding.type_ids_buffer), dtype=torch.uint32)
    #     batch_ids.append(ids)
    #     batch_attention_masks.append(attention_mask)
    #     batch_special_token_masks.append(special_token_mask)
    #     batch_type_ids.append(type_ids)
    # total_time_buffer += time.perf_counter() - start_time_buffer

    # batch_ids = []
    # batch_attention_masks = []
    # batch_special_token_masks = []
    # batch_type_ids = []
    # start_time_no_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = torch.tensor(encoding.ids, dtype=torch.uint32)
    #     attention_mask = torch.tensor(encoding.attention_mask, dtype=torch.uint32)
    #     special_token_mask = torch.tensor(encoding.special_tokens_mask, dtype=torch.uint32)
    #     type_ids = torch.tensor(encoding.type_ids, dtype=torch.uint32)
    #     batch_ids.append(ids)
    #     batch_attention_masks.append(attention_mask)
    #     batch_special_token_masks.append(special_token_mask)
    #     batch_type_ids.append(type_ids)
    # total_time_no_buffer += time.perf_counter() - start_time_no_buffer
    # NumPy conversion (enabled by default)
    lengths = []
    batch_ids = []
    batch_attention_masks = []
    batch_special_token_masks = []
    batch_type_ids = []
    start_time_buffer = time.perf_counter()
    # Buffer-protocol path: np.frombuffer wraps the Rust-owned buffers directly (zero-copy)
    for encoding in encodings:
        ids = np.frombuffer(memoryview(encoding.ids_buffer), dtype=np.uint32)
        attention_mask = np.frombuffer(memoryview(encoding.attention_mask_buffer), dtype=np.uint32)
        special_token_mask = np.frombuffer(memoryview(encoding.special_tokens_mask_buffer), dtype=np.uint32)
        type_ids = np.frombuffer(memoryview(encoding.type_ids_buffer), dtype=np.uint32)
        batch_ids.append(ids)
        batch_attention_masks.append(attention_mask)
        batch_special_token_masks.append(special_token_mask)
        batch_type_ids.append(type_ids)
    total_time_buffer += time.perf_counter() - start_time_buffer

    batch_ids = []
    batch_attention_masks = []
    batch_special_token_masks = []
    batch_type_ids = []
    start_time_no_buffer = time.perf_counter()
    # List path: .ids etc. build Python lists of ints that np.array then copies element by element
    for encoding in encodings:
        ids = np.array(encoding.ids, dtype=np.uint32)
        attention_mask = np.array(encoding.attention_mask, dtype=np.uint32)
        special_token_mask = np.array(encoding.special_tokens_mask, dtype=np.uint32)
        type_ids = np.array(encoding.type_ids, dtype=np.uint32)
        batch_ids.append(ids)
        batch_attention_masks.append(attention_mask)
        batch_special_token_masks.append(special_token_mask)
        batch_type_ids.append(type_ids)
    total_time_no_buffer += time.perf_counter() - start_time_no_buffer

print("Time taken (buffer protocol)", total_time_buffer)
print("Time taken (no buffer protocol)", total_time_no_buffer)
print("Average sequence length", total_length / len(dataset))
@mariosasko (Author)

Results on my Mac M1 Max:

100%|███████████████████████████████████████████████████| 2663/2663 [02:16<00:00, 19.53it/s]
Time taken (buffer protocol) 0.6644201530143619
Time taken (no buffer protocol) 29.29986348748207
Average sequence length 2034.4527279830506
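
The gap is essentially the cost of per-token Python object conversion: encoding.ids materializes a Python list of ints that then has to be copied again into the array, while np.frombuffer just wraps the memory the Rust side already owns. A minimal sketch of the two paths being compared (ids_buffer is the experimental buffer-protocol attribute this benchmark assumes):

import numpy as np
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
encoding = tokenizer.encode("Hello world")

# List path: token ids become Python ints, then get copied into the array
ids_from_list = np.array(encoding.ids, dtype=np.uint32)

# Buffer path: zero-copy view over the Rust-owned buffer (requires the
# experimental ids_buffer attribute used in the script above)
ids_from_buffer = np.frombuffer(memoryview(encoding.ids_buffer), dtype=np.uint32)

assert np.array_equal(ids_from_list, ids_from_buffer)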
