Created January 2, 2022 21:16
Bespoke MLP Implementation for Measuring Throughput
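The script below times a small residual MLP (two hidden projections plus a scalar output head) on random data and reports the achieved training throughput in teraFLOPs per second. It is configured entirely through environment variables that appear in the code: USE_FP16 turns on autocast for the forward pass, USE_FP16_WEIGHTS stores the parameters in fp16, and PROFILE=1 additionally dumps a profiler table for one step. A typical invocation would be something like USE_FP16=1 PROFILE=1 python mlp_throughput.py, where the file name is only a placeholder for wherever the gist is saved.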
import os
import time

import torch
from torch.nn import Linear, ReLU
from torch.profiler import profile, ProfilerActivity

from codeparrot.build_table import build_table

device = torch.device('cuda')

# gpt-2 v2
bs = 2 ** 13
d = 1600
d_ff = 4 * d

# Mixed precision is controlled through environment variables:
# USE_FP16=1 enables autocast for the forward pass, USE_FP16_WEIGHTS=1 stores the weights in fp16.
fp16_training = os.environ.get('USE_FP16') == '1'
weight_dtype = torch.float16 if os.environ.get('USE_FP16_WEIGHTS') == '1' else torch.float32

# One fixed random batch is reused for every step.
x = torch.rand((bs, d)).to(device)
class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = Linear(d, d_ff, dtype=weight_dtype)
        self.lin2 = Linear(d_ff, d, dtype=weight_dtype)
        self.out = Linear(d, 1, dtype=weight_dtype)
        self.relu = ReLU()

    def forward(self, x):
        return self.out(x + self.lin2(self.relu(self.lin1(x))))
model = MLP()
model.to(device)
model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
torch.cuda.synchronize()
optimizer = torch.optim.SGD(lr=0.000001, params=model.parameters())

# Rule-of-thumb estimate: ~6 FLOPs per parameter per example for one forward+backward step.
step_flops = 6 * model_size * bs
total_flops = 1000 * 10 ** 12
steps = int(total_flops / step_flops)
burnin_steps = steps // 2
assert steps > burnin_steps

print(model)
print(f'{steps} steps with batch size {bs}')
print(f"Model size: {model_size / 10 ** 9} billion parameters")
print(f"One step is {step_flops / 10 ** 12} teraFLOPs")
print(f'fp16 is {fp16_training}')
for i in range(steps):
    # Only time the second half of the run so that warm-up does not skew the throughput number.
    if i == burnin_steps:
        after_burnin = time.time()

    def compute_loss():
        with torch.cuda.amp.autocast(enabled=fp16_training):
            out = model(x)
            loss = (out ** 2).mean()
        return loss

    def train():
        optimizer.zero_grad()
        # With PROFILE=1, profile exactly one step right after burn-in.
        if os.environ.get("PROFILE") == '1' and i == burnin_steps:
            with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_flops=True, use_cuda=True) as prof:
                loss = compute_loss()
                loss.backward()
        else:
            loss = compute_loss()
            loss.backward()
            prof = None
        optimizer.step()
        return prof

    prof = train()
    torch.cuda.synchronize()
    # When profiling, stop as soon as the profiled step is done.
    if i == burnin_steps and prof:
        break
time_took = time.time() - after_burnin
flops = 6 * model_size * bs * (steps - burnin_steps)
print(f"{time_took} seconds")
print(f"{flops / time_took / 10 ** 12} teraFLOPs per second")

if prof:
    with open(f"profile_mlp_{int(time.time())}.txt", 'w') as prof_dst:
        print(
            build_table(
                prof.key_averages(), sort_by="cuda_time_total", max_src_column_width=150, top_level_events_only=True),
            file=prof_dst)
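For a quick sanity check of the numbers the script prints, the per-step cost follows the usual rule of thumb of roughly 6 FLOPs per parameter per example for a combined forward and backward pass. A minimal sketch of that arithmetic, reusing the constants from the script (this only re-derives the expected printout, it adds nothing new):

# Back-of-the-envelope check with the same constants as above.
d, d_ff, bs = 1600, 4 * 1600, 2 ** 13

# lin1 + lin2 + out, weights plus biases
params = (d * d_ff + d_ff) + (d_ff * d + d) + (d * 1 + 1)
step_flops = 6 * params * bs
total_flops = 1000 * 10 ** 12

print(params)                           # 20489601 parameters, i.e. ~0.02 billion
print(step_flops / 10 ** 12)            # ~1.007 teraFLOPs per step
print(int(total_flops / step_flops))    # ~992 steps in total, half of them burn-in

Dividing the post-burn-in FLOP count by the measured wall-clock time gives the reported teraFLOPs per second. Note that with PROFILE=1 the loop breaks immediately after the profiled step, so in that mode the timing covers only a single step.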