David Berard (davidberard98)

  • PyTorch
  • Menlo Park, CA
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];
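
The header above is the start of an NVPTX dump (PTX ISA 8.7, target sm_90a) for a Triton-generated _layer_norm_backward_kernel. As a hedged illustration of where such dumps come from (not the actual kernel in this gist): launching a @triton.jit kernel returns a handle that, in recent Triton versions, exposes the lowered artifacts, including the PTX.

import torch
import triton
import triton.language as tl

# Hypothetical toy kernel, unrelated to _layer_norm_backward_kernel; it only
# exists to show how to get at the generated PTX.
@triton.jit
def copy_kernel(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

src = torch.randn(1024, device="cuda")
dst = torch.empty_like(src)
handle = copy_kernel[(1,)](src, dst, src.numel(), BLOCK=1024)
# The "ptx" entry starts with the same "Generated by LLVM NVPTX Back-End"
# banner seen above (exact attribute layout may differ across Triton versions).
print(handle.asm["ptx"][:300])
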
#!/usr/bin/env python3
import argparse
import os
import sys
import stat
import subprocess
import re
def parse_glibcxx_version(version_string):
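
The preview stops at the def line, so the body below is only a plausible sketch of what parse_glibcxx_version might do, not the author's implementation: turn a symbol version tag such as "GLIBCXX_3.4.30" into a tuple of integers that can be compared numerically (it reuses the re import above).

def parse_glibcxx_version(version_string):
    # Hypothetical sketch: "GLIBCXX_3.4.30" -> (3, 4, 30); returns None if the
    # string is not a GLIBCXX version tag.
    match = re.match(r"GLIBCXX_(\d+(?:\.\d+)+)", version_string)
    if match is None:
        return None
    return tuple(int(part) for part in match.group(1).split("."))
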
metric_id,Samples (3.3),Samples (3.4),speedup (3.3),speedup (3.4),speedup ((new-old)/old),speedup (delta)
tritonbench_ragged_attention_bwd[hstu]-tflops-avg,0,1,0,98.806091308594,-1,-98.806091308594
"tritonbench_ragged_attention_bwd[x_(128, 4, 1024, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,132.42012023926,-1,-132.42012023926
"tritonbench_ragged_attention_bwd[x_(128, 4, 256, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,65.420997619629,-1,-65.420997619629
"tritonbench_ragged_attention_bwd[x_(128, 4, 512, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,98.577156066895,-1,-98.577156066895
tritonbench_ragged_attention_bwd[x_average-hstu]_tflops,0,1,0,98.806091308594,-1,-98.806091308594
"tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_speedup",1,1,0.60214412212372,0.79111462831497,-0.23886615090629,-0.18897050619125
"tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_tflops",1,1,73.069320678711,95.622283935547,-0.23585468081935,-22.552963256836
"tritonbench_int4_gemm_fwd[x_(16, 1, 8192,
metric_id,Samples (3.3),Samples (3.4),speedup (3.3),speedup (3.4),speedup difference ((new-old)/old),speedup (delta)
tritonbench_fused_linear_jsd_bwd-pass,1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_speedup",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-eager]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_speedup",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-eager]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_bwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compile
import argparse
import torch
import triton # @manual=//triton:triton
import triton.language as tl # @manual=//triton:triton
# best config selected: BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 128, GROUP_SIZE_M: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None;
def get_cuda_autotune_config():
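
The preview cuts off at get_cuda_autotune_config(), so the body below is only an illustrative sketch of the usual @triton.autotune candidate list; the first entry simply restates the "best config selected" comment above as a triton.Config, and the second is a made-up placeholder.

def get_cuda_autotune_config():
    # Sketch only, not the gist's actual list.
    return [
        triton.Config(
            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8},
            num_warps=8,
            num_stages=3,
        ),
        # Hypothetical alternative candidate.
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
            num_warps=4,
            num_stages=4,
        ),
    ]
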
#loc = loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0)
module {
tt.func public @triton_tem_fused_zeros_7(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6
# AOT ID: ['0_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
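
The "# AOT ID: ['0_backward']" header and these imports are the standard preamble of an Inductor-generated output-code file for a backward graph. A hedged way to see such generated files for your own model is the output_code logging artifact; the snippet below is a generic illustration, not tied to this gist's workload.

import torch

# Equivalent to running with TORCH_LOGS="output_code": print the Python that
# Inductor generates for each compiled graph.
torch._logging.set_logs(output_code=True)

def f(x):
    return (x.sin() * x).sum()

x = torch.randn(32, device="cuda", requires_grad=True)
torch.compile(f)(x).backward()  # the backward graph is compiled and dumped separately
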
AUTOTUNE addmm(4096x16, 4096x3, 3x16)
triton_mm_4 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
triton_mm_5 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
triton_mm_2 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_3 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_6 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
triton_mm_8 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
triton_mm_10 0.0035 ms 96.4% ACC_TYPE='t
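
The AUTOTUNE table above comes from Inductor's max-autotune GEMM path: addmm(4096x16, 4096x3, 3x16) matches a Linear(3, 16) layer applied to a (4096, 3) input, and each triton_mm_* row is one Triton mm template candidate that was benchmarked (time, relative performance, and its block/stage/warp configuration). A minimal sketch that produces this kind of table, assuming a CUDA build of PyTorch:

import torch

# With mode="max-autotune", Inductor benchmarks Triton mm templates against
# each other for this addmm and logs a candidate table like the one above.
model = torch.nn.Linear(3, 16, device="cuda")
compiled = torch.compile(model, mode="max-autotune")
out = compiled(torch.randn(4096, 3, device="cuda"))
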
============================= test session starts ==============================
platform linux -- Python 3.12.9, pytest-8.3.5, pluggy-1.5.0
rootdir: /workspace/triton/python
configfile: pyproject.toml
collected 21530 items
unit/blackwell/test_tmem.py s [ 0%]
unit/cuda/test_experimental_tma.py .........F.......F................... [ 0%]
......................s..s................................s..s.......... [ 0%]
......................s..s.....s..s.....s..s.....s..s................... [ 0%]