curious

youkaichao youkaichao

curious

Ph.D. candidate at Tsinghua University. Currently visiting UC Berkeley and working on @vllm-project.

youkaichao / test.py

Created February 6, 2025 04:34

gloo vs. nccl

	import torch
	import torch.distributed as dist

	use_nccl = False

	dist.init_process_group(backend="nccl" if use_nccl else "gloo")

	rank = dist.get_rank()
	torch.cuda.set_device(rank % 8)

youkaichao / test_pytorch.py

Created January 3, 2025 03:05

compare shm broadcast with PyTorch broadcast_object_list

	import torch.distributed as dist
	import torch
	import time

	dist.init_process_group(backend="nccl")

	rank = dist.get_rank()
	torch.cuda.set_device(rank)

	N_warmup = 10

youkaichao / embedding.py

Created November 6, 2024 20:28

inplace embedding

	import torch
	import torch.nn as nn

	# an Embedding module containing 10 tensors of size 3
	embedding = nn.Embedding(10, 3)
	embedding.weight.requires_grad_(False)
	# a batch of 4 indices
	input = torch.LongTensor([1, 2, 4, 5])
	output = embedding(input)

youkaichao / ipc.py

Created November 5, 2024 00:06

cuda ipc

	import os
	from typing import List
	# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
	import torch
	import torch.distributed as dist

	dist.init_process_group(backend="gloo")
	rank = local_rank = dist.get_rank()
	world_size = dist.get_world_size()
	torch.cuda.set_device(local_rank)

youkaichao / overhead.py

Created October 31, 2024 22:51

direct custom op

	import os
	from dataclasses import dataclass
	from typing import Optional, Tuple

	import torch
	from torch import nn

	def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
	out: torch.Tensor) -> None:
	out.copy_(q)

youkaichao / overhead.py

Created October 31, 2024 21:27

custom op overhead (no mutation)

	import os
	from dataclasses import dataclass
	from typing import Optional, Tuple

	import torch
	from torch import nn

	def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
	out = q.clone()
	out += k

youkaichao / overhead.py

Created October 31, 2024 21:15

custom op overhead

	import os
	from dataclasses import dataclass
	from typing import Optional, Tuple

	import torch
	from torch import nn

	def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
	out: torch.Tensor) -> None:
	out.copy_(q)

youkaichao / client.py

Created October 27, 2024 03:43

client.py

	import openai
	import asyncio

	async def get_choice_completion(prompt, choices):
	# Initialize an asynchronous OpenAI client
	async with openai.AsyncClient(base_url="http://127.0.0.1:8000/v1", api_key="abc") as client:
	choice_probs = {}

	# Calculate logprobs for each prompt + choice sequence
	for choice in choices:

youkaichao / data.txt

Created September 30, 2024 04:28

profiling

	unified benchmark script

	$ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100

	vLLM default
	$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 198.86

youkaichao / custom_op_inductor.py

Created September 12, 2024 05:38

	import torch
	from typing import Optional, Tuple, Union
	torch.cuda.is_available()

	def report_memory(prefix):
	    """Print used, free and total CUDA memory, labelled with *prefix*.

	    The free and total figures come from ``torch.cuda.mem_get_info()``;
	    used memory is derived as ``total - free``. Each value is divided by
	    1024 twice before printing, so the output is expressed in MB.

	    Args:
	        prefix: Text placed at the start of the printed line to identify
	            the point in the program where the snapshot was taken.
	    """
	    free, total = torch.cuda.mem_get_info()
	    # Everything that is not free is counted as used; this is not limited
	    # to memory held by this process's allocator.
	    used = total - free
	    print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB")

	output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda") # 64 MB