Buffer protocol tokenizers benchmark
import time

import numpy as np
import pyarrow as pa
import torch
import tqdm
from tokenizers import Tokenizer
from datasets import load_dataset

# dataset = load_dataset("parquet", data_files="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/main/data/CC-MAIN-2013-20/train-00000-of-00014.parquet", split="train").select(range(100000))
# dataset = load_dataset("AI-MO/NuminaMath-1.5", split="train")
dataset = load_dataset("amphora/QwQ-LongCoT-130K", split="train")
tokenizer = Tokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

batch_size = 50

total_time_buffer = 0
total_time_no_buffer = 0
total_length = 0

tqdm_total = len(dataset) // batch_size if len(dataset) % batch_size == 0 else len(dataset) // batch_size + 1
for batch in tqdm.tqdm(dataset.iter(batch_size=batch_size), total=tqdm_total):
    encodings = tokenizer.encode_batch(batch["qwq"])  # Update the column name depending on the dataset
    total_length += sum(len(encoding.ids) for encoding in encodings)
    # Uncomment to test PyArrow conversion
    # lengths = []
    # batch_ids = []
    # batch_attention_masks = []
    # start_time_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = memoryview(encoding.ids_buffer)
    #     length = len(ids)
    #     ids = pa.py_buffer(memoryview(encoding.ids_buffer))
    #     attention_mask = pa.py_buffer(memoryview(encoding.attention_mask_buffer))
    #     lengths.append(length)
    #     batch_ids.append(pa.Array.from_buffers(pa.uint32(), length, [None, ids]))
    #     batch_attention_masks.append(pa.Array.from_buffers(pa.uint32(), length, [None, attention_mask]))
    # offsets = np.concatenate(([0], np.cumsum(lengths, dtype=np.int32)))
    # batch_ids = pa.ListArray.from_arrays(offsets, pa.concat_arrays(batch_ids))
    # batch_attention_masks = pa.ListArray.from_arrays(offsets, pa.concat_arrays(batch_attention_masks))
    # total_time_buffer += time.perf_counter() - start_time_buffer
    # batch_ids = []
    # batch_attention_masks = []
    # start_time_no_buffer = time.perf_counter()
    # for encoding in encodings:
    #     batch_ids.append(encoding.ids)
    #     batch_attention_masks.append(encoding.attention_mask)
    # batch_ids = pa.array(batch_ids, type=pa.list_(pa.uint32()))
    # batch_attention_masks = pa.array(batch_attention_masks, type=pa.list_(pa.uint32()))
    # total_time_no_buffer += time.perf_counter() - start_time_no_buffer
    # Uncomment to test PyTorch conversion
    # lengths = []
    # batch_ids = []
    # batch_attention_masks = []
    # batch_special_token_masks = []
    # batch_type_ids = []
    # start_time_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = torch.frombuffer(memoryview(encoding.ids_buffer), dtype=torch.uint32)
    #     attention_mask = torch.frombuffer(memoryview(encoding.attention_mask_buffer), dtype=torch.uint32)
    #     special_token_mask = torch.frombuffer(memoryview(encoding.special_tokens_mask_buffer), dtype=torch.uint32)
    #     type_ids = torch.frombuffer(memoryview(encoding.type_ids_buffer), dtype=torch.uint32)
    #     batch_ids.append(ids)
    #     batch_attention_masks.append(attention_mask)
    #     batch_special_token_masks.append(special_token_mask)
    #     batch_type_ids.append(type_ids)
    # total_time_buffer += time.perf_counter() - start_time_buffer
    # batch_ids = []
    # batch_attention_masks = []
    # batch_special_token_masks = []
    # batch_type_ids = []
    # start_time_no_buffer = time.perf_counter()
    # for encoding in encodings:
    #     ids = torch.tensor(encoding.ids, dtype=torch.uint32)
    #     attention_mask = torch.tensor(encoding.attention_mask, dtype=torch.uint32)
    #     special_token_mask = torch.tensor(encoding.special_tokens_mask, dtype=torch.uint32)
    #     type_ids = torch.tensor(encoding.type_ids, dtype=torch.uint32)
    #     batch_ids.append(ids)
    #     batch_attention_masks.append(attention_mask)
    #     batch_special_token_masks.append(special_token_mask)
    #     batch_type_ids.append(type_ids)
    # total_time_no_buffer += time.perf_counter() - start_time_no_buffer
    # Uncomment to test NumPy conversion
    lengths = []
    batch_ids = []
    batch_attention_masks = []
    batch_special_token_masks = []
    batch_type_ids = []
    start_time_buffer = time.perf_counter()
    for encoding in encodings:
        ids = np.frombuffer(memoryview(encoding.ids_buffer), dtype=np.uint32)
        attention_mask = np.frombuffer(memoryview(encoding.attention_mask_buffer), dtype=np.uint32)
        special_token_mask = np.frombuffer(memoryview(encoding.special_tokens_mask_buffer), dtype=np.uint32)
        type_ids = np.frombuffer(memoryview(encoding.type_ids_buffer), dtype=np.uint32)
        batch_ids.append(ids)
        batch_attention_masks.append(attention_mask)
        batch_special_token_masks.append(special_token_mask)
        batch_type_ids.append(type_ids)
    total_time_buffer += time.perf_counter() - start_time_buffer
    batch_ids = []
    batch_attention_masks = []
    batch_special_token_masks = []
    batch_type_ids = []
    start_time_no_buffer = time.perf_counter()
    for encoding in encodings:
        ids = np.array(encoding.ids, dtype=np.uint32)
        attention_mask = np.array(encoding.attention_mask, dtype=np.uint32)
        special_token_mask = np.array(encoding.special_tokens_mask, dtype=np.uint32)
        type_ids = np.array(encoding.type_ids, dtype=np.uint32)
        batch_ids.append(ids)
        batch_attention_masks.append(attention_mask)
        batch_special_token_masks.append(special_token_mask)
        batch_type_ids.append(type_ids)
    total_time_no_buffer += time.perf_counter() - start_time_no_buffer
print("Time taken (buffer protocol)", total_time_buffer) | |
print("Time taken (no buffer protocol)", total_time_no_buffer) | |
print("Average sequence length", total_length / len(dataset)) |
Results on my Mac M1 Max: