Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active July 3, 2025 17:56
Show Gist options
  • Save davidberard98/ec81ec7a5c035db225053074e7e280d6 to your computer and use it in GitHub Desktop.
Save davidberard98/ec81ec7a5c035db225053074e7e280d6 to your computer and use it in GitHub Desktop.
metric_id Samples (3.3) Samples (3.4) speedup (3.3) speedup (3.4) speedup ((new-old)/old) speedup (delta)
tritonbench_ragged_attention_bwd[hstu]-tflops-avg 0 1 0 98.806091308594 -1 -98.806091308594
tritonbench_ragged_attention_bwd[x_(128, 4, 1024, 128, 128, 1.0, 20, 0)-hstu]_tflops 0 1 0 132.42012023926 -1 -132.42012023926
tritonbench_ragged_attention_bwd[x_(128, 4, 256, 128, 128, 1.0, 20, 0)-hstu]_tflops 0 1 0 65.420997619629 -1 -65.420997619629
tritonbench_ragged_attention_bwd[x_(128, 4, 512, 128, 128, 1.0, 20, 0)-hstu]_tflops 0 1 0 98.577156066895 -1 -98.577156066895
tritonbench_ragged_attention_bwd[x_average-hstu]_tflops 0 1 0 98.806091308594 -1 -98.806091308594
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_speedup 1 1 0.60214412212372 0.79111462831497 -0.23886615090629 -0.18897050619125
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_tflops 1 1 73.069320678711 95.622283935547 -0.23585468081935 -22.552963256836
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 3584)-triton]_speedup 1 1 0.68327975273132 0.82453906536102 -0.1713191267267 -0.1412593126297
tritonbench_int4_gemm_fwd[x_(4, 1, 7168, 8192)-triton]_tflops 1 1 3.7420506477356 4.4838314056396 -0.16543457833206 -0.74178075790405
tritonbench_flex_attention_bwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_tflops 1 1 348.47463989258 416.44131469727 -0.16320828987416 -67.966674804688
tritonbench_flex_attention_bwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 1 1 11.71953868866 14.004383087158 -0.16315209204707 -2.2848443984985
tritonbench_int4_gemm_fwd[x_(64, 1, 1280, 8192)-triton]_speedup 1 1 0.50401866436005 0.60148519277573 -0.16204310527728 -0.09746652841568
tritonbench_int4_gemm_fwd[x_(64, 1, 1280, 8192)-triton]_tflops 1 1 10.874524116516 12.977425575256 -0.16204303746883 -2.1029014587402
tritonbench_flex_attention_bwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_tflops 1 1 328.60260009766 391.91949462891 -0.16155586899601 -63.31689453125
tritonbench_int4_gemm_fwd[x_(16, 1, 7168, 8192)-triton]_speedup 1 1 0.59562009572983 0.71029460430145 -0.16144640248873 -0.11467450857162
tritonbench_int4_gemm_fwd[x_(16, 1, 7168, 8192)-triton]_tflops 1 1 14.952956199646 17.831842422485 -0.16144636962523 -2.8788862228394
tritonbench_flex_attention_bwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 1 1 11.371434211731 13.560061454773 -0.16140245752881 -2.188627243042
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 3584)-triton]_tflops 1 1 15.734259605408 18.66505241394 -0.15702033637708 -2.9307928085327
tritonbench_gemm_fwd[x_(256, 256, 256)-triton_tutorial_matmul]_speedup 1 1 0.87671238183975 1.034653544426 -0.15265125552132 -0.15794116258621
tritonbench_flex_attention_fwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 1 1 25.154708862305 29.665010452271 -0.15204112593261 -4.5103015899658
tritonbench_flex_attention_fwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_tflops 1 1 401.48468017578 473.37594604492 -0.15186928374751 -71.891265869141
tritonbench_int4_gemm_fwd[x_(4, 1, 7168, 8192)-triton]_speedup 1 1 0.27478969097137 0.32345753908157 -0.15046131943125 -0.048667848110199
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 3584)-triton]_tflops 1 1 3.9804947376251 4.6692314147949 -0.14750536351389 -0.6887366771698
tritonbench_flex_attention_bwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_tflops 1 1 290.67184448242 338.86947631836 -0.14223066757024 -48.197631835938
tritonbench_flex_attention_bwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 1 1 10.773795127869 12.553303718567 -0.1417561966629 -1.7795085906982
tritonbench_flex_attention_fwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_tflops 1 1 370.63174438477 431.04516601562 -0.14015566440355 -60.413421630859
tritonbench_flex_attention_fwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 1 1 23.245239257812 27.030340194702 -0.14003156858646 -3.7851009368896
tritonbench_flex_attention_bwd[x_average-compiled]_tflops 1 1 168.55917358398 194.89813232422 -0.13514218133409 -26.338958740234
tritonbench_flex_attention_bwd[compiled]-tflops-avg 1 1 216.71894836426 250.58331298828 -0.13514213783903 -33.864364624023
tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 1 1 20.280294418335 23.408653259277 -0.13364112861566 -3.1283588409424
tritonbench_flex_attention_fwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 1 1 27.321741104126 31.118991851807 -0.1220235785839 -3.7972507476807
tritonbench_flex_attention_bwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 1 1 12.472326278687 14.193314552307 -0.12125344416755 -1.7209882736206
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 4096, 2304)-_triton]_speedup 1 1 0.34353530406952 0.38920053839684 -0.11733086114273 -0.045665234327316
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 3584)-triton]_speedup 1 1 0.35412150621414 0.40076339244843 -0.11638260158776 -0.046641886234283
tritonbench_flex_attention_bwd[compiled]-speedup-avg 1 1 14.457441329956 16.330188751221 -0.1146800842167 -1.8727474212646
tritonbench_flex_attention_bwd[x_average-compiled]_speedup 1 1 11.244676589966 12.701257705688 -0.11468006944464 -1.4565811157227
tritonbench_flex_attention_fwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_tflops 1 1 315.00180053711 354.81768798828 -0.11221505803985 -39.815887451172
tritonbench_flex_attention_bwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_tflops 1 1 227.87390136719 256.38534545898 -0.11120543586746 -28.511444091797
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 4096, 2304)-_triton]_tflops 1 1 1.4736390113831 1.6544852256775 -0.1093066359782 -0.18084621429443
tritonbench_low_mem_dropout_fwd[x_8192-triton_dropout]_speedup 1 1 1.1655173301697 1.3066666126251 -0.10802241451006 -0.14114928245544
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 1024)-triton]_speedup 1 1 0.43308272957802 0.48513305187225 -0.10729081865966 -0.052050322294235
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 1024)-triton]_speedup 1 1 0.7222221493721 0.80787402391434 -0.10602132511605 -0.085651874542236
tritonbench_low_mem_dropout_fwd[x_2048-torch_dropout]_tflops 1 1 0.00067368417512625 0.00075294118141755 -0.10526321078903 -0.0000792570062913
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 4096, 2048)-_triton]_speedup 1 1 0.34268927574158 0.38288614153862 -0.10498386187474 -0.040196865797043
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 1024)-triton]_speedup 1 1 0.44493392109871 0.49592167139053 -0.10281412011872 -0.050987750291824
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 4096, 2048)-_triton]_tflops 1 1 21.902370452881 24.332437515259 -0.099869446324646 -2.4300670623779
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 1024)-triton]_tflops 1 1 3.0795183181763 3.4211287498474 -0.099853135222221 -0.34161043167114
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 2304, 2304)-_triton]_speedup 1 1 0.32127264142036 0.35587656497955 -0.097235746785341 -0.034603923559189
tritonbench_flex_attention_fwd[compiled]-tflops-avg 1 1 218.37632751465 241.76184082031 -0.096729546839632 -23.385513305664
tritonbench_flex_attention_fwd[x_average-compiled]_tflops 1 1 169.84826660156 188.03698730469 -0.096729483724671 -18.188720703125
tritonbench_int4_gemm_fwd[x_(16, 1, 1280, 8192)-triton]_tflops 1 1 3.3046832084656 3.6497597694397 -0.094547746364988 -0.34507656097412
tritonbench_int4_gemm_fwd[x_(4, 1, 1280, 8192)-triton]_tflops 1 1 0.82695269584656 0.91307556629181 -0.094321733736688 -0.086122870445251
tritonbench_int4_gemm_fwd[x_(16, 1, 1280, 8192)-triton]_speedup 1 1 0.25590923428535 0.28228333592415 -0.093431309193118 -0.026374101638794
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 13312, 2048)-_triton]_speedup 1 1 0.44009348750114 0.484345048666 -0.091363711235895 -0.044251561164856
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 13312, 2048)-_triton]_tflops 1 1 3.9834856987 4.3551077842712 -0.085330169534133 -0.37162208557129
tritonbench_gemm_fwd[x_(640, 640, 640)-triton_tutorial_matmul]_speedup 1 1 0.77635782957077 0.84444439411163 -0.080628831235822 -0.068086564540863
tritonbench_low_mem_dropout_fwd[x_524288-triton_dropout]_speedup 1 1 1.0486725568771 1.1298701763153 -0.071864556778523 -0.081197619438171
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 1024)-triton]_tflops 1 1 12.264046669006 13.210406303406 -0.071637435871706 -0.94635963439941
tritonbench_int4_gemm_fwd[x_(1, 1, 1280, 8192)-triton]_tflops 1 1 0.21305592358112 0.22946777939796 -0.07152139555235 -0.016411855816841
tritonbench_gemm_fwd[x_(896, 896, 896)-triton_tutorial_matmul]_speedup 1 1 0.64908254146576 0.69907408952713 -0.071511087036835 -0.049991548061371
tritonbench_softmax_fwd[x_2304.0-triton_softmax]_speedup 1 1 3.6175634860992 3.8809525966644 -0.06786712901146 -0.26338911056519
tritonbench_gemm_fwd[x_(1024, 1024, 1024)-triton_tutorial_matmul]_speedup 1 1 0.64439654350281 0.69063180685043 -0.066946327824775 -0.046235263347626
tritonbench_int4_gemm_fwd[x_(1, 1, 7168, 8192)-triton]_speedup 1 1 0.27390822768211 0.29354265332222 -0.066887811423281 -0.019634425640106
tritonbench_int4_gemm_fwd[x_(1, 1, 7168, 8192)-triton]_tflops 1 1 1.0143769979477 1.0870900154114 -0.066887761301136 -0.072713017463684
tritonbench_rms_norm_bwd[x_(2048, 2048)-liger_rms]_speedup 1 1 0.40108770132065 0.42877045273781 -0.064563104198059 -0.02768275141716
tritonbench_gemm_fwd[x_(3968, 3968, 3968)-triton_tutorial_matmul]_speedup 1 1 0.9412767291069 1.0061401128769 -0.064467545762113 -0.064863383769989
tritonbench_low_mem_dropout_fwd[x_32-torch_dropout]_tflops 1 1 0.000011695906323439 0.000012500000593718 -0.064327538566939 -8.0409427027917e-7
tritonbench_rms_norm_fwd[x_(2048, 1024)-liger_rms]_speedup 1 1 3.7409639358521 3.9904458522797 -0.062519809981907 -0.24948191642761
tritonbench_int4_gemm_fwd[x_(64, 1, 7168, 8192)-triton]_speedup 1 1 1.5582255125046 1.6620663404465 -0.062476945363084 -0.10384082794189
tritonbench_int4_gemm_fwd[x_(64, 1, 7168, 8192)-triton]_tflops 1 1 43.416084289551 46.309349060059 -0.062476904323478 -2.8932647705078
tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_tflops 1 1 28.159139633179 29.99144744873 -0.06109434426878 -1.8323078155518
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 1024)-triton]_speedup 1 1 1.6592245101929 1.765721321106 -0.060313487547617 -0.10649681091309
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 3584)-triton]_tflops 1 1 1.0588620901108 1.1250815391541 -0.058857466537994 -0.066219449043274
tritonbench_int4_gemm_fwd[x_(4, 1, 1280, 8192)-triton]_speedup 1 1 0.15772871673107 0.16753743588924 -0.058546432360687 -0.0098087191581726
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 4096, 6656)-_triton]_speedup 1 1 0.60163551568985 0.6378778219223 -0.056817003173481 -0.036242306232452
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 4096, 6656)-_triton]_tflops 1 1 15.924635887146 16.818614959717 -0.0531541434721 -0.8939790725708
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 2304, 2304)-_triton]_tflops 1 1 1.6557754278183 1.7427499294281 -0.049906472604675 -0.086974501609802
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 3584)-triton]_speedup 1 1 1.760048866272 1.8471986055374 -0.047179409406325 -0.087149739265442
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 1024)-triton]_tflops 1 1 39.429412841797 41.374145507812 -0.047003572935383 -1.9447326660156
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 3584)-triton]_tflops 1 1 47.681900024414 49.847415924072 -0.043442891863376 -2.1655158996582
tritonbench_embedding_fwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 1 1 0.9772726893425 1.0203803777695 -0.042246685026621 -0.043107688426971
tritonbench_int4_gemm_fwd[x_(1, 1, 1280, 8192)-triton]_speedup 1 1 0.15019506216049 0.15651260316372 -0.040364423538588 -0.0063175410032272
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 3584)-tinygemm]_tflops 1 1 3.0737152099609 3.2024571895599 -0.040200999413419 -0.128741979599
tritonbench_softmax_fwd[x_2624.0-triton_softmax]_speedup 1 1 3.7012090682983 3.8553569316864 -0.0399827736107 -0.15414786338806
tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-eager]_tflops 1 1 3.0194535255432 3.1452057361603 -0.039982189136723 -0.12575221061707
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 1024)-triton]_tflops 1 1 0.78840303421021 0.82048201560974 -0.039097726445225 -0.032078981399536
tritonbench_softmax_fwd[x_2560.0-triton_softmax]_speedup 1 1 3.7215909957886 3.8708415031433 -0.038557638496316 -0.14925050735474
tritonbench_int4_gemm_fwd[x_(4, 1, 1280, 8192)-tinygemm]_tflops 1 1 5.2428798675537 5.4499793052673 -0.038000041121893 -0.20709943771362
tritonbench_welford_fwd[x_1024-test_welford]_speedup 1 1 0.5749539732933 0.59726840257645 -0.037360806610368 -0.022314429283142
tritonbench_int4_gemm_fwd[x_average-triton]_speedup 1 1 1.4225736856461 1.4773001670837 -0.037044930107681 -0.054726481437683
tritonbench_int4_gemm_fwd[triton]-speedup-avg 1 1 1.4225736856461 1.4773001670837 -0.037044930107681 -0.054726481437683
tritonbench_rms_norm_bwd[x_(2048, 1024)-liger_rms]_speedup 1 1 0.27284902334213 0.28308320045471 -0.036152541359361 -0.010234177112579
tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-eager]_tflops 1 1 0.74164092540741 0.76901942491531 -0.035601830878224 -0.027378499507904
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 3584)-tinygemm]_tflops 1 1 11.240477561951 11.6508436203 -0.035222003807054 -0.41036605834961
tritonbench_rms_norm_fwd[x_(2048, 2048)-liger_rms]_speedup 1 1 3.4962546825409 3.6165702342987 -0.033267859868106 -0.12031555175781
tritonbench_gemm_fwd[x_(1920, 1920, 1920)-triton_tutorial_matmul]_speedup 1 1 0.92721170186996 0.95847743749619 -0.032620210349339 -0.031265735626221
tritonbench_int4_gemm_fwd[x_(1, 1, 1280, 8192)-tinygemm]_tflops 1 1 1.4185281991959 1.4661297798157 -0.032467508180481 -0.047601580619812
tritonbench_int4_gemm_fwd[x_average-triton]_tflops 1 1 38.422355651855 39.672805786133 -0.031519074829702 -1.2504501342773
tritonbench_int4_gemm_fwd[triton]-tflops-avg 1 1 38.422355651855 39.672805786133 -0.031519074829702 -1.2504501342773
tritonbench_softmax_fwd[x_2816.0-triton_softmax]_speedup 1 1 4.0876941680908 4.2162551879883 -0.030491754925964 -0.12856101989746
tritonbench_int4_gemm_fwd[x_(1, 4096, 1280, 8192)-triton]_tflops 1 1 62.10046005249 64.004638671875 -0.02975063462426 -1.9041786193848
tritonbench_int4_gemm_fwd[x_(1, 4096, 1280, 8192)-triton]_speedup 1 1 2.1022531986237 2.1666667461395 -0.029729328532244 -0.064413547515869
tritonbench_embedding_fwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1 1 1.0305602550507 1.0620155334473 -0.029618472993991 -0.031455278396606
tritonbench_addmm_fwd[x_(19747, 512, 1536)-aten_addmm]_tflops 1 1 400.54473876953 412.10546875 -0.028052842917942 -11.560729980469
tritonbench_cross_entropy_fwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1 1 0.97025346755981 0.9976818561554 -0.027492119282671 -0.027428388595581
tritonbench_grouped_gemm_fwd[x_128-triton]_speedup 1 1 0.15998917818069 0.16427387297153 -0.026082630873277 -0.0042846947908401
tritonbench_softmax_fwd[x_3456.0-triton_softmax]_speedup 1 1 4.5356521606445 4.6566371917725 -0.025981201915771 -0.12098503112793
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 3584)-triton]_speedup 1 1 2.1862864494324 2.241589307785 -0.024671271477159 -0.055302858352661
tritonbench_softmax_fwd[x_2944.0-triton_softmax]_speedup 1 1 4.2317380905151 4.338481426239 -0.024603847576319 -0.10674333572388
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 3584)-triton]_tflops 1 1 64.036811828613 65.645896911621 -0.024511586537908 -1.6090850830078
tritonbench_flex_attention_bwd[x_ (8, 16, 256, 16, 256, 128) | noop-eager]_tflops 1 1 5.4229383468628 5.5581302642822 -0.02432327257391 -0.13519191741943
tritonbench_fp8_gemm_blockwise_fwd[x_average-_triton]_speedup 1 1 0.71849012374878 0.73637920618057 -0.024293301985779 -0.017889082431793
tritonbench_fp8_gemm_blockwise_fwd[_triton]-speedup-avg 1 1 0.71849012374878 0.73637920618057 -0.024293301985779 -0.017889082431793
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 3584)-triton]_speedup 1 1 2.2025487422943 2.2549140453339 -0.023222749065718 -0.052365303039551
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 3584)-triton]_tflops 1 1 64.61498260498 66.149406433105 -0.023196335551048 -1.534423828125
tritonbench_flex_attention_bwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 1 1 23.340023040771 23.890344619751 -0.023035313543553 -0.55032157897949
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 3584)-triton]_speedup 1 1 2.2062826156616 2.2581899166107 -0.022986242462283 -0.051907300949097
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 3584)-triton]_tflops 1 1 64.738258361816 66.260307312012 -0.022970749939752 -1.5220489501953
tritonbench_fused_linear_jsd_fwd[x_(1024, 4096)-liger_lm_head_jsd]_speedup 1 1 0.1776294708252 0.18172663450241 -0.022545752241734 -0.0040971636772156
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 3584)-triton]_tflops 1 1 64.78099822998 66.269020080566 -0.022454260660213 -1.4880218505859
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 3584)-triton]_speedup 1 1 2.2076418399811 2.2583112716675 -0.022436867903063 -0.050669431686401
tritonbench_softmax_fwd[x_3392.0-triton_softmax]_speedup 1 1 4.5094847679138 4.6102938652039 -0.021866089285738 -0.10080909729004
tritonbench_int4_gemm_fwd[x_(16, 4096, 1280, 8192)-triton]_speedup 1 1 2.198029756546 2.2459466457367 -0.021334829694922 -0.047916889190674
tritonbench_int4_gemm_fwd[x_(1, 4096, 7168, 8192)-triton]_speedup 1 1 2.1919093132019 2.2393078804016 -0.021166614744912 -0.047398567199707
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 8192, 13312)-_triton]_tflops 1 1 1126.2078857422 1150.5166015625 -0.021128522428359 -24.308715820312
tritonbench_int4_gemm_fwd[x_(1, 4096, 7168, 8192)-triton]_tflops 1 1 64.89966583252 66.299942016602 -0.021120322906638 -1.400276184082
tritonbench_addmm_fwd[x_(35541, 512, 1536)-triton_addmm]_speedup 1 1 0.97830325365067 0.99926257133484 -0.020974785091946 -0.020959317684174
tritonbench_int4_gemm_fwd[x_(4, 4096, 1280, 8192)-triton]_tflops 1 1 64.70858001709 66.09228515625 -0.020935955473304 -1.3837051391602
tritonbench_int4_gemm_fwd[x_(16, 4096, 1280, 8192)-triton]_tflops 1 1 65.200729370117 66.590599060059 -0.020871860436154 -1.3898696899414
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 8192, 13312)-_triton]_speedup 1 1 1.1740809679031 1.1989696025848 -0.020758353362791 -0.024888634681702
tritonbench_int4_gemm_fwd[x_(16, 4096, 7168, 8192)-triton]_speedup 1 1 2.2063279151917 2.2530641555786 -0.020743413040966 -0.046736240386963
tritonbench_int4_gemm_fwd[x_(64, 4096, 1280, 8192)-triton]_speedup 1 1 2.2030389308929 2.2496812343597 -0.020732849949771 -0.046642303466797
tritonbench_int4_gemm_fwd[x_(16, 4096, 7168, 8192)-triton]_tflops 1 1 65.384864807129 66.768981933594 -0.020729942053654 -1.3841171264648
tritonbench_int4_gemm_fwd[x_(4, 4096, 7168, 8192)-triton]_speedup 1 1 2.2032639980316 2.2498755455017 -0.02071738926328 -0.046611547470093
tritonbench_int4_gemm_fwd[x_(64, 4096, 1280, 8192)-triton]_tflops 1 1 65.363006591797 66.745697021484 -0.02071579879138 -1.3826904296875
tritonbench_int4_gemm_fwd[x_(4, 4096, 7168, 8192)-triton]_tflops 1 1 65.285461425781 66.660614013672 -0.020629161735723 -1.3751525878906
tritonbench_int4_gemm_fwd[x_(64, 4096, 7168, 8192)-triton]_tflops 1 1 65.39958190918 66.768783569336 -0.020506613824025 -1.3692016601562
tritonbench_int4_gemm_fwd[x_(64, 4096, 7168, 8192)-triton]_speedup 1 1 2.2067773342133 2.2529644966125 -0.020500617061981 -0.046187162399292
tritonbench_int4_gemm_fwd[x_(4, 4096, 1280, 8192)-triton]_speedup 1 1 2.1848373413086 2.2299630641937 -0.020236085345856 -0.045125722885132
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 3584)-triton]_speedup 1 1 0.34448930621147 0.35131821036339 -0.019437945288555 -0.0068289041519165
tritonbench_gemm_fwd[x_(3200, 3200, 3200)-triton_tutorial_matmul]_speedup 1 1 0.77564442157745 0.79094076156616 -0.019339425570152 -0.015296339988708
tritonbench_embedding_fwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1 1 1.0112220048904 1.0306122303009 -0.018814278387517 -0.019390225410461
tritonbench_rope_fwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 1 1 2.828547000885 2.8826596736908 -0.018771786798024 -0.054112672805786
tritonbench_addmm_fwd[x_(34238, 512, 1536)-triton_addmm]_speedup 1 1 0.93957775831223 0.95723766088486 -0.018448817147778 -0.017659902572632
tritonbench_int4_gemm_fwd[x_(4, 1, 7168, 8192)-tinygemm]_tflops 1 1 13.617871284485 13.862194061279 -0.017625115888176 -0.24432277679443
tritonbench_layer_norm_bwd[x_1024-liger_layer_norm]_speedup 1 1 0.4849428832531 0.49359658360481 -0.017531929189047 -0.0086537003517151
tritonbench_softmax_fwd[x_2240.0-triton_softmax]_speedup 1 1 3.4922120571136 3.5535168647766 -0.017251869062627 -0.061304807662964
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 1024)-triton]_tflops 1 1 62.091121673584 63.153854370117 -0.016827677536592 -1.0627326965332
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 1024)-triton]_speedup 1 1 2.2468917369843 2.2852900028229 -0.016802360221763 -0.038398265838623
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 1024)-triton]_speedup 1 1 2.2665507793427 2.3050208091736 -0.01668966704241 -0.038470029830933
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 1024)-triton]_tflops 1 1 62.818809509277 63.872032165527 -0.016489574866203 -1.05322265625
tritonbench_addmm_fwd[x_(33961, 512, 1536)-triton_addmm]_speedup 1 1 0.95396971702576 0.96959215402603 -0.016112379762363 -0.015622437000275
tritonbench_rms_norm_fwd[x_(2048, 8192)-liger_rms]_speedup 1 1 4.0878081321716 4.1530089378357 -0.015699654549273 -0.065200805664062
tritonbench_rms_norm_fwd[liger_rms]-speedup-avg 1 1 3.863664150238 3.9247057437897 -0.015553164373718 -0.061041593551636
tritonbench_rms_norm_fwd[x_average-liger_rms]_speedup 1 1 3.863664150238 3.9247057437897 -0.015553164373718 -0.061041593551636
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 1024)-triton]_speedup 1 1 2.2667870521545 2.3020849227905 -0.015333001092418 -0.035297870635986
tritonbench_addmm_fwd[x_(34533, 512, 1536)-aten_addmm]_tflops 1 1 419.75738525391 426.29138183594 -0.015327536188723 -6.5339965820312
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 1024)-triton]_tflops 1 1 62.791000366211 63.758075714111 -0.015167887943117 -0.96707534790039
tritonbench_addmm_fwd[x_(33961, 512, 1536)-triton_addmm]_tflops 1 1 406.67181396484 412.80456542969 -0.014856307266031 -6.1327514648438
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 1024)-triton]_speedup 1 1 2.2711310386658 2.3050866127014 -0.014730715040617 -0.033955574035645
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 1024)-triton]_tflops 1 1 62.93949508667 63.874828338623 -0.014643221379705 -0.93533325195312
tritonbench_welford_fwd[x_2048-test_welford]_speedup 1 1 0.56446379423141 0.57284736633301 -0.014634914279626 -0.008383572101593
tritonbench_flex_attention_fwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_tflops 1 1 12.847255706787 13.036186218262 -0.014492774827806 -0.18893051147461
tritonbench_softmax_fwd[x_4160.0-triton_softmax]_speedup 1 1 4.6946330070496 4.762912273407 -0.014335612843144 -0.068279266357422
tritonbench_kl_div_fwd[x_(8, 512, 131072)-liger_kl_div]_speedup 1 1 4.5417404174805 4.6071724891663 -0.01420221878813 -0.065432071685791
tritonbench_swiglu_bwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1 1 0.91010457277298 0.92312401533127 -0.014103676583061 -0.013019442558289
tritonbench_addmm_fwd[x_(20224, 512, 1536)-triton_addmm]_speedup 1 1 0.94524836540222 0.95875877141953 -0.014091559232673 -0.013510406017303
tritonbench_addmm_fwd[x_(34308, 512, 1536)-triton_addmm]_speedup 1 1 0.92780488729477 0.9409584403038 -0.013978888381921 -0.013153553009033
tritonbench_jsd_fwd[x_(4, 2048, 4096)-liger_jsd]_speedup 1 1 4.3638157844543 4.4245738983154 -0.013731969508796 -0.060758113861084
tritonbench_flex_attention_fwd[x_ (8, 16, 128, 16, 128, 128) | noop-eager]_tflops 1 1 0.18656378984451 0.1891456246376 -0.013649984227959 -0.0025818347930908
tritonbench_softmax_fwd[x_2432.0-triton_softmax]_speedup 1 1 3.7875895500183 3.8399999141693 -0.013648532636058 -0.052410364151001
tritonbench_welford_fwd[x_1536-test_welford]_speedup 1 1 0.5701510310173 0.57784122228622 -0.013308485051473 -0.0076901912689209
tritonbench_gemm_fwd[x_(1536, 1536, 1536)-triton_tutorial_matmul]_speedup 1 1 0.71613836288452 0.72573834657669 -0.013227885418281 -0.0095999836921692
tritonbench_gemm_fwd[x_(384, 384, 384)-triton_tutorial_matmul]_speedup 1 1 0.88326847553253 0.89495801925659 -0.013061555371916 -0.01168954372406
tritonbench_gemm_fwd[x_(2176, 2176, 2176)-triton_tutorial_matmul]_speedup 1 1 0.65899455547333 0.66747277975082 -0.012701977570773 -0.0084782242774963
tritonbench_addmm_fwd[x_(34308, 512, 1536)-triton_addmm]_tflops 1 1 411.42822265625 416.71337890625 -0.012682953122052 -5.28515625
tritonbench_addmm_fwd[x_(34238, 512, 1536)-triton_addmm]_tflops 1 1 408.49649047852 413.71688842773 -0.012618285825986 -5.2203979492188
tritonbench_softmax_fwd[x_4224.0-triton_softmax]_speedup 1 1 4.7268881797791 4.7869353294373 -0.01254396508951 -0.060047149658203
tritonbench_gemm_fwd[x_average-triton_tutorial_matmul]_speedup 1 1 0.80918508768082 0.81946325302124 -0.012542557951842 -0.010278165340424
tritonbench_gemm_fwd[triton_tutorial_matmul]-speedup-avg 1 1 0.80918508768082 0.81946325302124 -0.012542557951842 -0.010278165340424
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 2304, 16384)-_triton]_speedup 1 1 0.77092003822327 0.78059870004654 -0.012399023752788 -0.0096786618232727
tritonbench_cross_entropy_fwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 1 1 1.540071606636 1.5593444108963 -0.012359555801515 -0.019272804260254
tritonbench_gemm_fwd[x_(3584, 3584, 3584)-triton_tutorial_matmul]_speedup 1 1 0.91369438171387 0.92469543218613 -0.011896944755368 -0.01100105047226
tritonbench_addmm_fwd[x_(20203, 512, 1536)-aten_addmm]_tflops 1 1 400.21801757812 404.7844543457 -0.011281156473658 -4.5664367675781
tritonbench_addmm_fwd[x_(34579, 512, 1536)-triton_addmm]_tflops 1 1 414.27392578125 418.86679077148 -0.010964977628747 -4.5928649902344
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 2304, 13312)-_triton]_speedup 1 1 1.0860670804977 1.0980268716812 -0.01089207513215 -0.011959791183472
tritonbench_swiglu_fwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1 1 1.2292054891586 1.2425323724747 -0.010725582376174 -0.01332688331604
tritonbench_gemm_fwd[x_(1280, 1280, 1280)-triton_tutorial_matmul]_speedup 1 1 0.71816635131836 0.72580647468567 -0.010526391860335 -0.0076401233673096
tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-eager]_tflops 1 1 1.3940353393555 1.4087373018265 -0.010436269737407 -0.014701962471008
tritonbench_gemm_fwd[x_(2304, 2304, 2304)-triton_tutorial_matmul]_speedup 1 1 0.68742924928665 0.69463467597961 -0.010372972934012 -0.0072054266929626
tritonbench_addmm_fwd[x_(33887, 512, 1536)-triton_addmm]_tflops 1 1 410.78796386719 415.0862121582 -0.010355073633179 -4.2982482910156
tritonbench_jsd_fwd[x_(4, 2048, 8192)-liger_jsd]_speedup 1 1 4.2700200080872 4.3146076202393 -0.010334105920303 -0.0445876121521
tritonbench_rope_bwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 1 1 3.6437911987305 3.6809666156769 -0.0100993627022 -0.037175416946411
tritonbench_gemm_fwd[x_(3840, 3840, 3840)-triton_tutorial_matmul]_speedup 1 1 0.83101361989975 0.83908796310425 -0.0096227613307988 -0.0080743432044983
tritonbench_embedding_fwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 1 1 1.0134600400925 1.0232744216919 -0.0095911530586283 -0.0098143815994263
tritonbench_addmm_fwd[x_(19747, 512, 1536)-triton_addmm]_tflops 1 1 382.10171508789 385.7451171875 -0.0094451023156786 -3.6434020996094
tritonbench_layer_norm_fwd[x_2560-liger_layer_norm]_speedup 1 1 1.2690168619156 1.2811017036438 -0.0094331634200766 -0.01208484172821
tritonbench_addmm_fwd[x_(35405, 512, 1536)-aten_addmm]_tflops 1 1 431.31640625 435.41595458984 -0.0094152460345774 -4.0995483398438
tritonbench_gemm_fwd[x_(2688, 2688, 2688)-triton_tutorial_matmul]_speedup 1 1 0.92427098751068 0.9330188035965 -0.0093758197070577 -0.0087478160858154
tritonbench_kl_div_fwd[x_(8, 512, 65536)-liger_kl_div]_speedup 1 1 4.5090565681458 4.5509958267212 -0.0092154025563357 -0.041939258575439
tritonbench_cross_entropy_bwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 1 1 1.4337615966797 1.4469134807587 -0.009089613341693 -0.013151884078979
tritonbench_addmm_fwd[x_(20120, 512, 1536)-triton_addmm]_tflops 1 1 390.24069213867 393.8137512207 -0.0090729667792347 -3.5730590820312
tritonbench_addmm_fwd[x_(33887, 512, 1536)-aten_addmm]_tflops 1 1 429.20040893555 433.10528564453 -0.0090159987384437 -3.9048767089844
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 13312, 13312)-_triton]_speedup 1 1 0.81117695569992 0.81849122047424 -0.0089362776183287 -0.0073142647743225
tritonbench_softmax_fwd[x_4800.0-triton_softmax]_speedup 1 1 4.7331051826477 4.7757005691528 -0.0089191912031208 -0.042595386505127
tritonbench_cross_entropy_fwd[liger_cross_entropy_loss]-speedup-avg 1 1 1.088454246521 1.0982412099838 -0.0089114880901016 -0.0097869634628296
tritonbench_cross_entropy_fwd[x_average-liger_cross_entropy_loss]_speedup 1 1 1.088454246521 1.0982412099838 -0.0089114880901016 -0.0097869634628296
tritonbench_softmax_fwd[x_4544.0-triton_softmax]_speedup 1 1 4.7233815193176 4.7651796340942 -0.0087715716900893 -0.041798114776611
tritonbench_grouped_gemm_fwd[x_1024-triton]_speedup 1 1 0.15219810605049 0.15353785455227 -0.0087258513914011 -0.0013397485017776
tritonbench_addmm_fwd[x_(35410, 512, 1536)-aten_addmm]_tflops 1 1 432.66375732422 436.45999145508 -0.0086977826265437 -3.7962341308594
tritonbench_addmm_fwd[x_(19735, 512, 1536)-triton_addmm]_tflops 1 1 383.07559204102 386.43188476562 -0.0086853410831899 -3.3562927246094
tritonbench_softmax_fwd[x_7104.0-triton_softmax]_speedup 1 1 4.7325577735901 4.7731070518494 -0.0084953632547517 -0.040549278259277
tritonbench_addmm_fwd[x_(35541, 512, 1536)-triton_addmm]_tflops 1 1 426.00680541992 429.56735229492 -0.0082886812882267 -3.560546875
tritonbench_softmax_fwd[x_5568.0-triton_softmax]_speedup 1 1 4.6962127685547 4.7350053787231 -0.008192727793462 -0.038792610168457
tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_tflops 1 1 51.764122009277 52.185131072998 -0.0080676057540563 -0.4210090637207
tritonbench_addmm_fwd[x_(19410, 512, 1536)-triton_addmm]_tflops 1 1 377.21380615234 380.21948242188 -0.0079051085188641 -3.0056762695312
tritonbench_softmax_fwd[x_3200.0-triton_softmax]_speedup 1 1 4.2482690811157 4.2807879447937 -0.0075964668414674 -0.032518863677979
tritonbench_addmm_fwd[x_(34579, 512, 1536)-triton_addmm]_speedup 1 1 0.96564328670502 0.97289973497391 -0.0074585777013137 -0.0072564482688904
tritonbench_kl_div_fwd[x_(8, 512, 16384)-liger_kl_div]_speedup 1 1 4.194197177887 4.2250413894653 -0.0073003335908793 -0.030844211578369
tritonbench_addmm_fwd[x_(20067, 512, 1536)-triton_addmm]_speedup 1 1 0.95089107751846 0.95787394046783 -0.0072899602487994 -0.0069828629493713
tritonbench_softmax_fwd[x_8192.0-triton_softmax]_speedup 1 1 4.6892170906067 4.7234978675842 -0.0072574981377247 -0.034280776977539
tritonbench_kl_div_fwd[x_(8, 512, 32768)-liger_kl_div]_speedup 1 1 4.4066281318665 4.4381508827209 -0.0071026766974552 -0.031522750854492
tritonbench_flex_attention_bwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_tflops 1 1 124.69131469727 125.57795715332 -0.0070604943427466 -0.88664245605469
tritonbench_layer_norm_fwd[x_4608-liger_layer_norm]_speedup 1 1 1.2801905870438 1.2891948223114 -0.0069843867752241 -0.0090042352676392
tritonbench_addmm_fwd[x_(20120, 512, 1536)-triton_addmm]_speedup 1 1 0.93451678752899 0.94108283519745 -0.0069771197846568 -0.006566047668457
tritonbench_addmm_fwd[x_(35410, 512, 1536)-triton_addmm]_tflops 1 1 424.95455932617 427.87881469727 -0.0068343074502596 -2.9242553710938
tritonbench_cross_entropy_fwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 1 1 1.3602063655853 1.3695088624954 -0.0067925788323449 -0.0093024969100952
tritonbench_softmax_fwd[x_6464.0-triton_softmax]_speedup 1 1 4.8188977241516 4.8515009880066 -0.0067202426497653 -0.03260326385498
tritonbench_softmax_fwd[x_3264.0-triton_softmax]_speedup 1 1 4.3702988624573 4.3996157646179 -0.0066635142087665 -0.029316902160645
tritonbench_addmm_fwd[x_(19735, 512, 1536)-triton_addmm]_speedup 1 1 0.94630879163742 0.95260852575302 -0.0066131405979395 -0.0062997341156006
tritonbench_cross_entropy_fwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 1 1 1.2801901102066 1.2886205911636 -0.0065422522461933 -0.0084304809570312
tritonbench_softmax_fwd[x_4736.0-triton_softmax]_speedup 1 1 4.6713008880615 4.7011957168579 -0.0063589840961497 -0.029894828796387
tritonbench_kl_div_fwd[liger_kl_div]-speedup-avg 1 1 4.1495904922485 4.1760897636414 -0.0063454745689461 -0.026499271392822
tritonbench_kl_div_fwd[x_average-liger_kl_div]_speedup 1 1 4.1495904922485 4.1760897636414 -0.0063454745689461 -0.026499271392822
tritonbench_addmm_fwd[x_(27456, 512, 1536)-triton_addmm]_tflops 1 1 406.73596191406 409.32583618164 -0.0063271702850168 -2.5898742675781
tritonbench_addmm_fwd[x_(34839, 512, 1536)-triton_addmm]_tflops 1 1 418.61288452148 421.18612670898 -0.0061095131684571 -2.5732421875
tritonbench_gemm_fwd[x_(3712, 3712, 3712)-triton_tutorial_matmul]_speedup 1 1 0.87552499771118 0.88076990842819 -0.0059549158830488 -0.0052449107170105
tritonbench_softmax_fwd[x_7424.0-triton_softmax]_speedup 1 1 4.7579255104065 4.7862777709961 -0.0059236554889081 -0.0283522605896
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 8192, 13312)-_triton]_speedup 1 1 0.52093267440796 0.52389752864838 -0.005659225475002 -0.0029648542404175
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 4096, 16384)-_triton]_speedup 1 1 1.2475334405899 1.2546100616455 -0.0056404944228819 -0.007076621055603
tritonbench_addmm_fwd[x_(36032, 512, 1536)-triton_addmm]_speedup 1 1 1.0044182538986 1.0101033449173 -0.0056282270990225 -0.0056850910186768
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 8192, 16384)-_cutlass]_tflops 1 1 3.601806640625 3.6220242977142 -0.0055818667759883 -0.020217657089233
tritonbench_layer_norm_fwd[x_1536-liger_layer_norm]_speedup 1 1 1.2841225862503 1.2913165092468 -0.0055709990114793 -0.007193922996521
tritonbench_addmm_fwd[x_(20068, 512, 1536)-triton_addmm]_tflops 1 1 389.53945922852 391.70440673828 -0.0055269929889049 -2.1649475097656
tritonbench_addmm_fwd[x_(34516, 512, 1536)-triton_addmm]_tflops 1 1 416.66650390625 418.92932128906 -0.0054014299496863 -2.2628173828125
tritonbench_fused_linear_jsd_fwd[x_(2048, 4096)-liger_lm_head_jsd]_speedup 1 1 0.29311120510101 0.2946372628212 -0.0051794457550009 -0.0015260577201843
tritonbench_addmm_fwd[x_(35656, 512, 1536)-triton_addmm]_tflops 1 1 427.69805908203 429.90051269531 -0.0051231704737282 -2.2024536132812
tritonbench_addmm_fwd[x_(34516, 512, 1536)-aten_addmm]_tflops 1 1 430.6223449707 432.81887817383 -0.0050749477758288 -2.196533203125
tritonbench_softmax_fwd[x_7040.0-triton_softmax]_speedup 1 1 4.7460007667542 4.7701406478882 -0.0050606225090491 -0.024139881134033
tritonbench_addmm_fwd[x_(35901, 512, 1536)-aten_addmm]_tflops 1 1 434.55938720703 436.70962524414 -0.0049237248570083 -2.1502380371094
tritonbench_gemm_fwd[x_(3328, 3328, 3328)-triton_tutorial_matmul]_speedup 1 1 0.8518425822258 0.85602420568466 -0.0048849359995817 -0.0041816234588623
tritonbench_softmax_fwd[x_5120.0-triton_softmax]_speedup 1 1 4.7266697883606 4.7497782707214 -0.0048651707603458 -0.02310848236084
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 13312, 13312)-_triton]_tflops 1 1 50.016750335693 50.25789642334 -0.0047981731192095 -0.24114608764648
tritonbench_softmax_fwd[x_6208.0-triton_softmax]_speedup 1 1 4.8063020706177 4.8293981552124 -0.0047823939655501 -0.023096084594727
tritonbench_addmm_fwd[x_(20203, 512, 1536)-triton_addmm]_tflops 1 1 388.78323364258 390.61779785156 -0.0046965709680272 -1.8345642089844
tritonbench_gemm_fwd[x_(3072, 3072, 3072)-triton_tutorial_matmul]_speedup 1 1 0.71186918020248 0.71522235870361 -0.0046883021207656 -0.0033531785011292
tritonbench_softmax_fwd[x_5632.0-triton_softmax]_speedup 1 1 4.7012286186218 4.7225432395935 -0.0045133776209775 -0.02131462097168
tritonbench_addmm_fwd[x_(35791, 512, 1536)-aten_addmm]_tflops 1 1 435.58709716797 437.53652954102 -0.0044554734094816 -1.9494323730469
tritonbench_addmm_fwd[x_(34533, 512, 1536)-triton_addmm]_tflops 1 1 417.38412475586 419.23913574219 -0.0044247085450268 -1.8550109863281
tritonbench_softmax_fwd[x_2880.0-triton_softmax]_speedup 1 1 4.2774724960327 4.2964482307434 -0.004416609648619 -0.018975734710693
tritonbench_softmax_fwd[x_6720.0-triton_softmax]_speedup 1 1 4.775661945343 4.7968378067017 -0.0044145460430323 -0.021175861358643
tritonbench_addmm_fwd[x_(20224, 512, 1536)-triton_addmm]_tflops 1 1 388.88293457031 390.56307983398 -0.0043018537860416 -1.6801452636719
tritonbench_addmm_fwd[x_(20068, 512, 1536)-triton_addmm]_speedup 1 1 0.93328076601028 0.93727666139603 -0.0042633040491913 -0.0039958953857422
tritonbench_addmm_fwd[x_(33894, 512, 1536)-triton_addmm]_speedup 1 1 0.94539171457291 0.94937020540237 -0.0041906632489921 -0.0039784908294678
tritonbench_addmm_fwd[x_(27456, 512, 1536)-triton_addmm]_speedup 1 1 0.97438991069794 0.97847175598145 -0.0041716536614938 -0.0040818452835083
tritonbench_addmm_fwd[x_(19410, 512, 1536)-triton_addmm]_speedup 1 1 0.94861668348312 0.95258963108063 -0.0041706811284485 -0.0039729475975037
tritonbench_addmm_fwd[x_(35917, 512, 1536)-triton_addmm]_speedup 1 1 1.0036559104919 1.0078086853027 -0.004120598354978 -0.004152774810791
tritonbench_addmm_fwd[x_(35656, 512, 1536)-triton_addmm]_speedup 1 1 0.96267384290695 0.96665042638779 -0.0041137761617659 -0.003976583480835
tritonbench_softmax_fwd[x_6400.0-triton_softmax]_speedup 1 1 4.8012089729309 4.8209953308105 -0.0041042059827741 -0.019786357879639
tritonbench_softmax_fwd[x_5504.0-triton_softmax]_speedup 1 1 4.7005157470703 4.7198567390442 -0.0040977921668431 -0.019340991973877
tritonbench_jsd_fwd[liger_jsd]-speedup-avg 1 1 1.8437962532043 1.8513215780258 -0.0040648393616721 -0.0075253248214722
tritonbench_jsd_fwd[x_average-liger_jsd]_speedup 1 1 1.8437962532043 1.8513215780258 -0.0040648393616721 -0.0075253248214722
tritonbench_softmax_fwd[x_7360.0-triton_softmax]_speedup 1 1 4.7780427932739 4.7974448204041 -0.0040442418529981 -0.019402027130127
tritonbench_softmax_fwd[x_5056.0-triton_softmax]_speedup 1 1 4.6975698471069 4.7166066169739 -0.0040361156680811 -0.019036769866943
tritonbench_gemm_fwd[x_(1664, 1664, 1664)-triton_tutorial_matmul]_speedup 1 1 0.82526880502701 0.82861000299454 -0.0040322925808938 -0.0033411979675293
tritonbench_grouped_gemm_fwd[triton]-speedup-avg 1 1 0.17178927361965 0.17247937619686 -0.0040010730118934 -0.00069010257720947
tritonbench_grouped_gemm_fwd[x_average-triton]_speedup 1 1 0.17178927361965 0.17247937619686 -0.0040010730118934 -0.00069010257720947
tritonbench_addmm_fwd[x_(35249, 512, 1536)-triton_addmm]_speedup 1 1 0.98031491041183 0.98425197601318 -0.0040000586204524 -0.0039370656013489
tritonbench_jsd_bwd[x_(4, 2048, 131072)-liger_jsd]_speedup 1 1 5.831259727478 5.8544912338257 -0.003968151188515 -0.023231506347656
tritonbench_addmm_fwd[x_(36032, 512, 1536)-triton_addmm]_tflops 1 1 434.86044311523 436.57498168945 -0.0039272488029063 -1.7145385742188
tritonbench_softmax_fwd[x_2688.0-triton_softmax]_speedup 1 1 3.8132045269012 3.8280253410339 -0.0038716604025112 -0.01482081413269
tritonbench_softmax_fwd[x_4864.0-triton_softmax]_speedup 1 1 4.7352232933044 4.7533717155457 -0.003818010315048 -0.018148422241211
tritonbench_addmm_fwd[x_(34181, 512, 1536)-aten_addmm]_tflops 1 1 426.98458862305 428.61807250977 -0.0038110476237129 -1.6334838867188
tritonbench_addmm_fwd[x_(35380, 512, 1536)-aten_addmm]_tflops 1 1 441.4016418457 443.08807373047 -0.0038060872877194 -1.6864318847656
tritonbench_addmm_fwd[x_(19410, 512, 1536)-aten_addmm]_tflops 1 1 397.64617919922 399.14297485352 -0.0037500237974781 -1.4967956542969
tritonbench_rope_fwd[liger_rotary_pos_emb]-speedup-avg 1 1 2.8306114673615 2.8411982059479 -0.0037261527774666 -0.010586738586426
tritonbench_rope_fwd[x_average-liger_rotary_pos_emb]_speedup 1 1 2.8306114673615 2.8411982059479 -0.0037261527774666 -0.010586738586426
tritonbench_addmm_fwd[x_(34579, 512, 1536)-aten_addmm]_tflops 1 1 429.01342773438 430.53436279297 -0.0035326682142794 -1.5209350585938
tritonbench_layer_norm_bwd[x_9728-liger_layer_norm]_speedup 1 1 0.1488119661808 0.14933185279369 -0.0034814180843948 -0.00051988661289215
tritonbench_softmax_fwd[x_7872.0-triton_softmax]_speedup 1 1 4.7300968170166 4.7465920448303 -0.0034751728520015 -0.016495227813721
tritonbench_addmm_fwd[x_(35561, 512, 1536)-aten_addmm]_tflops 1 1 432.78793334961 434.29290771484 -0.0034653440995692 -1.5049743652344
tritonbench_rope_fwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 1 1 2.774352312088 2.7839529514313 -0.0034485637906797 -0.0096006393432617
tritonbench_addmm_fwd[x_(35884, 512, 1536)-triton_addmm]_tflops 1 1 430.32794189453 431.80239868164 -0.0034146563141176 -1.4744567871094
tritonbench_rope_fwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 1 1 2.9231371879578 2.9330217838287 -0.0033701065315882 -0.0098845958709717
tritonbench_jsd_fwd[x_(4, 2048, 32768)-liger_jsd]_speedup 1 1 0.59508979320526 0.59697759151459 -0.0031622599175568 -0.0018877983093262
tritonbench_addmm_fwd[x_average-triton_addmm]_tflops 1 1 413.62057495117 414.92672729492 -0.0031479108426332 -1.30615234375
tritonbench_addmm_fwd[triton_addmm]-tflops-avg 1 1 413.62057495117 414.92672729492 -0.0031479108426332 -1.30615234375
tritonbench_addmm_fwd[x_(35504, 512, 1536)-triton_addmm]_tflops 1 1 416.62548828125 417.92211914062 -0.0031025657652226 -1.296630859375
tritonbench_addmm_fwd[x_(34839, 512, 1536)-triton_addmm]_speedup 1 1 0.97116327285767 0.97418242692947 -0.0030991670434088 -0.0030191540718079
tritonbench_rms_norm_fwd[x_(2048, 16384)-liger_rms]_speedup 1 1 3.9816236495972 3.9939901828766 -0.0030962853470291 -0.012366533279419
tritonbench_softmax_fwd[x_5184.0-triton_softmax]_speedup 1 1 4.7012138366699 4.7157759666443 -0.0030879605132573 -0.014562129974365
tritonbench_addmm_fwd[x_(34839, 512, 1536)-aten_addmm]_tflops 1 1 431.04275512695 432.34829711914 -0.0030196533694864 -1.3055419921875
tritonbench_addmm_fwd[x_(35791, 512, 1536)-triton_addmm]_tflops 1 1 430.68328857422 431.9518737793 -0.0029368670032123 -1.2685852050781
tritonbench_cross_entropy_fwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 1 1 0.59793102741241 0.59961903095245 -0.0028151266936238 -0.0016880035400391
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 8192, 2048)-_triton]_speedup 1 1 1.3148661851883 1.3185391426086 -0.0027856263812407 -0.0036729574203491
tritonbench_addmm_fwd[x_(35405, 512, 1536)-triton_addmm]_tflops 1 1 426.76956176758 427.92358398438 -0.0026967950820841 -1.1540222167969
tritonbench_kl_div_bwd[x_(8, 512, 32768)-liger_kl_div]_speedup 1 1 1.0351914167404 1.037954211235 -0.0026617691461953 -0.0027627944946289
tritonbench_jsd_bwd[x_(4, 2048, 32768)-liger_jsd]_speedup 1 1 5.7887377738953 5.8039908409119 -0.0026280308557836 -0.015253067016602
tritonbench_addmm_fwd[x_(15168, 512, 1536)-aten_addmm]_tflops 1 1 383.63180541992 384.62106323242 -0.0025720323379747 -0.9892578125
tritonbench_addmm_fwd[x_(20116, 512, 1536)-aten_addmm]_tflops 1 1 417.50250244141 418.56262207031 -0.0025327622988948 -1.0601196289062
tritonbench_addmm_fwd[x_(35504, 512, 1536)-triton_addmm]_speedup 1 1 0.95179003477097 0.95403397083282 -0.0023520504829616 -0.0022439360618591
tritonbench_fused_linear_cross_entropy_fwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 1 1 0.34759637713432 0.34840965270996 -0.0023342509867683 -0.00081327557563782
tritonbench_rope_bwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 1 1 3.6788063049316 3.6873371601105 -0.0023135544183807 -0.008530855178833
tritonbench_fused_linear_jsd_fwd[liger_lm_head_jsd]-speedup-avg 1 1 0.32255062460899 0.32329457998276 -0.0023011687167899 -0.00074395537376404
tritonbench_fused_linear_jsd_fwd[x_average-liger_lm_head_jsd]_speedup 1 1 0.32255062460899 0.32329457998276 -0.0023011687167899 -0.00074395537376404
tritonbench_embedding_fwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 1 1 1.163771033287 1.1664091348648 -0.0022617291813859 -0.0026381015777588
tritonbench_addmm_fwd[x_(35503, 512, 1536)-triton_addmm]_speedup 1 1 0.98901098966599 0.99121099710464 -0.0022195147603144 -0.0022000074386597
tritonbench_addmm_fwd[x_(27456, 512, 1536)-aten_addmm]_tflops 1 1 417.42630004883 418.33178710938 -0.0021645189020985 -0.90548706054688
tritonbench_geglu_bwd[x_(8, 4096, 4096)-liger_geglu]_speedup 1 1 1.001692533493 1.0038558244705 -0.0021549817461278 -0.002163290977478
tritonbench_addmm_fwd[x_(20120, 512, 1536)-aten_addmm]_tflops 1 1 417.58554077148 418.46875 -0.0021105739162497 -0.88320922851562
tritonbench_addmm_fwd[x_(19735, 512, 1536)-aten_addmm]_tflops 1 1 404.81036376953 405.65655517578 -0.0020859798651185 -0.84619140625
tritonbench_layer_norm_fwd[x_7680-liger_layer_norm]_speedup 1 1 1.5593526363373 1.5626120567322 -0.0020858794611592 -0.0032594203948975
tritonbench_addmm_fwd[x_(34181, 512, 1536)-triton_addmm]_tflops 1 1 413.53625488281 414.35189819336 -0.0019684797248503 -0.81564331054688
tritonbench_addmm_fwd[x_(35380, 512, 1536)-triton_addmm]_tflops 1 1 425.73760986328 426.57281494141 -0.0019579425806582 -0.835205078125
tritonbench_addmm_fwd[x_(35884, 512, 1536)-triton_addmm]_speedup 1 1 0.96121948957443 0.96304452419281 -0.0018950677487184 -0.0018250346183777
tritonbench_rope_fwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 1 1 2.78497838974 2.7901020050049 -0.0018363541030765 -0.0051236152648926
tritonbench_addmm_fwd[triton_addmm]-speedup-avg 1 1 0.97238206863403 0.97404986619949 -0.0017122301674015 -0.0016677975654602
tritonbench_addmm_fwd[x_average-triton_addmm]_speedup 1 1 0.97238206863403 0.97404986619949 -0.0017122301674015 -0.0016677975654602
tritonbench_layer_norm_fwd[x_14336-liger_layer_norm]_speedup 1 1 1.5907131433487 1.5933464765549 -0.0016527059524872 -0.0026333332061768
tritonbench_layer_norm_fwd[x_1024-liger_layer_norm]_speedup 1 1 1.2959001064301 1.2980251312256 -0.0016371214581383 -0.0021250247955322
tritonbench_cross_entropy_bwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 1 1 1.8119332790375 1.8148946762085 -0.0016317184737173 -0.0029613971710205
tritonbench_softmax_fwd[x_7232.0-triton_softmax]_speedup 1 1 4.7790474891663 4.786684513092 -0.0015954725875276 -0.0076370239257812
tritonbench_addmm_fwd[x_(35884, 512, 1536)-aten_addmm]_tflops 1 1 447.68954467773 448.37219238281 -0.0015225023243531 -0.68264770507812
tritonbench_cross_entropy_bwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 1 1 2.0630431175232 2.0659890174866 -0.0014259030122836 -0.0029458999633789
tritonbench_addmm_fwd[aten_addmm]-tflops-avg 1 1 425.37796020508 425.97244262695 -0.001395588921689 -0.594482421875
tritonbench_addmm_fwd[x_average-aten_addmm]_tflops 1 1 425.37796020508 425.97244262695 -0.001395588921689 -0.594482421875
tritonbench_rms_norm_fwd[x_(2048, 32768)-liger_rms]_speedup 1 1 3.5887908935547 3.5937695503235 -0.0013853578252814 -0.0049786567687988
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 8192, 2048)-_triton]_tflops 1 1 991.45135498047 992.82647705078 -0.0013850578143296 -1.3751220703125
tritonbench_gemm_fwd[x_(2432, 2432, 2432)-triton_tutorial_matmul]_speedup 1 1 0.79625672101974 0.79734045267105 -0.0013591830787911 -0.0010837316513062
tritonbench_addmm_fwd[x_(33887, 512, 1536)-triton_addmm]_speedup 1 1 0.95710057020187 0.95839565992355 -0.0013513100860484 -0.0012950897216797
tritonbench_softmax_fwd[x_8384.0-triton_softmax]_speedup 1 1 4.7408828735352 4.7471313476562 -0.0013162631626316 -0.0062484741210938
tritonbench_softmax_fwd[x_6784.0-triton_softmax]_speedup 1 1 4.7854399681091 4.7915558815002 -0.0012763940445162 -0.0061159133911133
tritonbench_addmm_fwd[x_(20068, 512, 1536)-aten_addmm]_tflops 1 1 417.38723754883 417.91757202148 -0.0012689929980474 -0.53033447265625
tritonbench_gemm_fwd[x_(1792, 1792, 1792)-triton_tutorial_matmul]_speedup 1 1 0.88695651292801 0.88807791471481 -0.0012627290558896 -0.0011214017868042
tritonbench_int4_gemm_fwd[x_(16, 1, 1280, 8192)-tinygemm]_tflops 1 1 12.91349697113 12.929420471191 -0.0012315710589284 -0.015923500061035
tritonbench_softmax_fwd[x_4352.0-triton_softmax]_speedup 1 1 4.7289991378784 4.7348227500916 -0.001229953584434 -0.0058236122131348
tritonbench_addmm_fwd[x_(35917, 512, 1536)-triton_addmm]_tflops 1 1 430.40875244141 430.9338684082 -0.0012185534841728 -0.52511596679688
tritonbench_cross_entropy_bwd[liger_cross_entropy_loss]-speedup-avg 1 1 1.8443877696991 1.8465194702148 -0.0011544424795581 -0.0021317005157471
tritonbench_cross_entropy_bwd[x_average-liger_cross_entropy_loss]_speedup 1 1 1.8443877696991 1.8465194702148 -0.0011544424795581 -0.0021317005157471
tritonbench_softmax_fwd[x_6336.0-triton_softmax]_speedup 1 1 4.7975540161133 4.803083896637 -0.0011513187449325 -0.0055298805236816
tritonbench_flex_attention_bwd[x_average-eager]_tflops 1 1 13.841172218323 13.85685634613 -0.001131867677332 -0.015684127807617
tritonbench_flex_attention_bwd[eager]-tflops-avg 1 1 17.795793533325 17.815958023071 -0.0011318218037998 -0.020164489746094
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 2304, 13312)-_triton]_tflops 1 1 979.38598632812 980.48657226562 -0.0011224895563402 -1.1005859375
tritonbench_softmax_fwd[x_5696.0-triton_softmax]_speedup 1 1 4.7087745666504 4.7139625549316 -0.0011005578047756 -0.00518798828125
tritonbench_rope_bwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 1 1 2.1907494068146 2.1930866241455 -0.001065720480532 -0.0023372173309326
tritonbench_addmm_fwd[x_(33660, 512, 1536)-aten_addmm]_tflops 1 1 438.525390625 438.99066162109 -0.001059865361089 -0.46527099609375
tritonbench_fused_linear_cross_entropy_bwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 1 1 163.07585144043 163.24407958984 -0.001030531397137 -0.16822814941406
tritonbench_addmm_fwd[x_(35656, 512, 1536)-aten_addmm]_tflops 1 1 444.28137207031 444.7321472168 -0.001013587952446 -0.45077514648438
tritonbench_softmax_fwd[x_8000.0-triton_softmax]_speedup 1 1 4.7050309181213 4.7097458839417 -0.0010011083265423 -0.0047149658203125
tritonbench_swiglu_fwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1 1 1.0251451730728 1.0261573791504 -0.00098640432563448 -0.0010122060775757
tritonbench_addmm_fwd[x_(35678, 512, 1536)-triton_addmm]_speedup 1 1 0.98730146884918 0.9882755279541 -0.00098561491949103 -0.00097405910491943
tritonbench_jsd_bwd[x_(4, 2048, 65536)-liger_jsd]_speedup 1 1 5.8248076438904 5.8303127288818 -0.00094421778855606 -0.0055050849914551
tritonbench_softmax_fwd[x_6144.0-triton_softmax]_speedup 1 1 4.8020339012146 4.8065719604492 -0.0009441363349931 -0.0045380592346191
tritonbench_flex_attention_fwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 1 1 68.594589233398 68.653251647949 -0.00085447394176753 -0.058662414550781
tritonbench_grouped_gemm_fwd[x_512-triton]_speedup 1 1 0.19722293317318 0.19738560914993 -0.00082415317638312 -0.00016267597675323
tritonbench_softmax_fwd[x_8064.0-triton_softmax]_speedup 1 1 4.7106871604919 4.7144875526428 -0.00080610927665903 -0.0038003921508789
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 2304, 16384)-_triton]_tflops 1 1 28.586698532104 28.608362197876 -0.00075724942314568 -0.021663665771484
tritonbench_addmm_fwd[x_(35504, 512, 1536)-aten_addmm]_tflops 1 1 437.72833251953 438.0578918457 -0.00075231911650608 -0.32955932617188
tritonbench_layer_norm_fwd[x_14848-liger_layer_norm]_speedup 1 1 1.5790971517563 1.5802351236343 -0.00072012820183022 -0.0011379718780518
tritonbench_int4_gemm_fwd[x_(4, 4096, 1280, 8192)-tinygemm]_tflops 1 1 29.617115020752 29.638286590576 -0.0007143317735159 -0.021171569824219
tritonbench_softmax_fwd[x_6656.0-triton_softmax]_speedup 1 1 4.8014569282532 4.8046636581421 -0.0006674202643679 -0.003206729888916
tritonbench_rope_bwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 1 1 3.9488558769226 3.9514882564545 -0.00066617420096354 -0.0026323795318604
tritonbench_softmax_fwd[x_6080.0-triton_softmax]_speedup 1 1 4.6886825561523 4.6915149688721 -0.00060373093521378 -0.0028324127197266
tritonbench_flex_attention_fwd[x_average-eager]_tflops 1 1 5.9016289710999 5.9051160812378 -0.00059052355448506 -0.0034871101379395
tritonbench_flex_attention_fwd[eager]-tflops-avg 1 1 7.5878086090088 7.5922918319702 -0.00059049665906512 -0.0044832229614258
tritonbench_softmax_fwd[x_8128.0-triton_softmax]_speedup 1 1 4.7288055419922 4.7313995361328 -0.00054825091832029 -0.002593994140625
tritonbench_swiglu_fwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 1 1 1.027358174324 1.0279189348221 -0.00054552988475101 -0.00056076049804688
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 4096, 16384)-_triton]_tflops 1 1 1160.9588623047 1161.4886474609 -0.00045612598746284 -0.52978515625
tritonbench_softmax_fwd[x_7936.0-triton_softmax]_speedup 1 1 4.71284532547 4.7148680686951 -0.00042901374876805 -0.0020227432250977
tritonbench_softmax_fwd[x_5760.0-triton_softmax]_speedup 1 1 4.6898474693298 4.6916513442993 -0.00038448615148571 -0.0018038749694824
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 8192, 13312)-_cutlass]_tflops 1 1 959.22509765625 959.58782958984 -0.00037800805971955 -0.36273193359375
tritonbench_softmax_fwd[x_5440.0-triton_softmax]_speedup 1 1 4.6911587715149 4.6928339004517 -0.00035695466157589 -0.0016751289367676
tritonbench_addmm_fwd[x_(34516, 512, 1536)-triton_addmm]_speedup 1 1 0.9675914645195 0.96790915727615 -0.00032822579915131 -0.00031769275665283
tritonbench_rope_fwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 1 1 2.7579574584961 2.7588572502136 -0.000326146529495 -0.0008997917175293
tritonbench_layer_norm_fwd[x_15360-liger_layer_norm]_speedup 1 1 1.5953041315079 1.5957839488983 -0.00030067816559575 -0.00047981739044189
tritonbench_jsd_bwd[x_(4, 2048, 16384)-liger_jsd]_speedup 1 1 6.0247311592102 6.0264863967896 -0.00029125388556104 -0.0017552375793457
tritonbench_addmm_fwd[x_(33660, 512, 1536)-triton_addmm]_tflops 1 1 470.16900634766 470.30261230469 -0.00028408508380705 -0.13360595703125
tritonbench_rope_fwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 1 1 3.0554838180542 3.0563411712646 -0.00028051619973253 -0.00085735321044922
tritonbench_addmm_fwd[x_(35916, 512, 1536)-aten_addmm]_tflops 1 1 438.41061401367 438.51943969727 -0.00024816615580116 -0.10882568359375
tritonbench_addmm_fwd[x_(35678, 512, 1536)-triton_addmm]_tflops 1 1 428.37994384766 428.48458862305 -0.00024422062816053 -0.10464477539062
tritonbench_geglu_fwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1 1 1.004363656044 1.0046082735062 -0.00024349536890082 -0.0002446174621582
tritonbench_jsd_bwd[x_average-liger_jsd]_speedup 1 1 5.909158706665 5.9105486869812 -0.00023516942161795 -0.0013899803161621
tritonbench_jsd_bwd[liger_jsd]-speedup-avg 1 1 5.909158706665 5.9105486869812 -0.00023516942161795 -0.0013899803161621
tritonbench_kl_div_fwd[x_(8, 512, 4096)-liger_kl_div]_speedup 1 1 3.3659336566925 3.3666036128998 -0.00019900062030122 -0.00066995620727539
tritonbench_flex_attention_bwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-eager]_tflops 1 1 29.741235733032 29.746559143066 -0.00017895885062123 -0.0053234100341797
tritonbench_flex_attention_fwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-eager]_tflops 1 1 15.882374763489 15.884665489197 -0.00014420988024996 -0.0022907257080078
tritonbench_softmax_fwd[x_7552.0-triton_softmax]_speedup 1 1 4.7670950889587 4.7675132751465 -0.000087715788841992 -0.00041818618774414
tritonbench_flex_attention_bwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-eager]_tflops 1 1 28.888723373413 28.89012336731 -0.000048459256427701 -0.0013999938964844
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 1024)-tinygemm]_tflops 1 1 27.634229660034 27.634943008423 -0.000025813275187666 -0.00071334838867188
tritonbench_int4_gemm_fwd[x_(1, 4096, 1280, 8192)-tinygemm]_tflops 1 1 29.539953231812 29.540601730347 -0.000021952786915984 -0.00064849853515625
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 3584)-tinygemm]_tflops 1 1 29.343978881836 29.344501495361 -0.000017809589488962 -0.00052261352539062
tritonbench_geglu_bwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1 1 1.0050340890884 1.0050485134125 -0.000014351868435355 -0.000014424324035645
tritonbench_int4_gemm_fwd[x_(64, 4096, 7168, 8192)-tinygemm]_tflops 1 1 29.635786056519 29.635967254639 -0.0000061141287733346 -0.00018119812011719
tritonbench_layer_norm_bwd-pass 1 1 1 1 0 0
tritonbench_kl_div_fwd-pass 1 1 1 1 0 0
tritonbench_jsd_fwd-pass 1 1 1 1 0 0
tritonbench_kl_div_bwd-pass 1 1 1 1 0 0
tritonbench_layer_norm_fwd-pass 1 1 1 1 0 0
tritonbench_jsd_bwd-pass 1 1 1 1 0 0
tritonbench_int4_gemm_fwd[x_(16, 1, 7168, 8192)-tinygemm]_tflops 1 1 25.10485458374 25.10485458374 0 0
tritonbench_int4_gemm_fwd[x_(1, 1, 7168, 8192)-tinygemm]_tflops 1 1 3.7033462524414 3.7033462524414 0 0
tritonbench_int4_gemm_fwd[x_(64, 1, 7168, 8192)-tinygemm]_tflops 1 1 27.862516403198 27.862516403198 0 0
tritonbench_int4_gemm_fwd[x_(64, 1, 1280, 8192)-tinygemm]_tflops 1 1 21.575637817383 21.575637817383 0 0
tritonbench_welford_fwd-pass 1 1 1 1 0 0
tritonbench_swiglu_fwd-pass 1 1 1 1 0 0
tritonbench_swiglu_bwd-pass 1 1 1 1 0 0
tritonbench_rms_norm_bwd-pass 1 1 1 1 0 0
tritonbench_rms_norm_fwd-pass 1 1 1 1 0 0
tritonbench_low_mem_dropout_fwd-pass 1 1 1 1 0 0
tritonbench_rope_bwd-pass 1 1 1 1 0 0
tritonbench_rope_fwd-pass 1 1 1 1 0 0
tritonbench_softmax_fwd-pass 1 1 1 1 0 0
tritonbench_bf16_flex_attention_bwd-pass 1 1 1 1 0 0
tritonbench_bf16_flex_attention_fwd-pass 1 1 1 1 0 0
tritonbench_bf16_ragged_attention_bwd-pass 1 1 1 1 0 0
tritonbench_cross_entropy_bwd-pass 1 1 1 1 0 0
tritonbench_cross_entropy_fwd-pass 1 1 1 1 0 0
tritonbench_embedding_bwd-pass 1 1 1 1 0 0
tritonbench_embedding_fwd-pass 1 1 1 1 0 0
tritonbench_addmm_fwd[x_(35249, 512, 1536)-triton_addmm]_tflops 1 1 426.45736694336 426.45736694336 0 0
tritonbench_fused_linear_jsd_fwd-pass 1 1 1 1 0 0
tritonbench_fused_linear_cross_entropy_fwd-pass 1 1 1 1 0 0
tritonbench_geglu_bwd-pass 1 1 1 1 0 0
tritonbench_fused_linear_cross_entropy_bwd-pass 1 1 1 1 0 0
tritonbench_geglu_fwd-pass 1 1 1 1 0 0
tritonbench_int4_gemm_fwd-pass 1 1 1 1 0 0
tritonbench_fp16_addmm_fwd-pass 1 1 1 1 0 0
tritonbench_fp16_gemm_fwd-pass 1 1 1 1 0 0
tritonbench_fp16_grouped_gemm_fwd-pass 1 1 1 1 0 0
tritonbench_fp8_gemm_blockwise_fwd-pass 1 1 1 1 0 0
tritonbench_int4_gemm_fwd[x_(16, 4096, 7168, 8192)-tinygemm]_tflops 1 1 29.635152816772 29.634744644165 0.000013773447766227 0.00040817260742188
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 3584)-tinygemm]_tflops 1 1 29.342685699463 29.342220306396 0.00001586086743084 0.00046539306640625
tritonbench_int4_gemm_fwd[x_(64, 4096, 1280, 8192)-tinygemm]_tflops 1 1 29.669473648071 29.668956756592 0.000017421963425706 0.00051689147949219
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 3584)-tinygemm]_tflops 1 1 29.336462020874 29.335664749146 0.000027177557942976 0.00079727172851562
tritonbench_int4_gemm_fwd[x_(1, 4096, 7168, 8192)-tinygemm]_tflops 1 1 29.608737945557 29.60733795166 0.000047285368876126 0.0013999938964844
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 1024)-tinygemm]_tflops 1 1 27.712841033936 27.710382461548 0.000088723870596407 0.0024585723876953
tritonbench_int4_gemm_fwd[x_(4, 4096, 7168, 8192)-tinygemm]_tflops 1 1 29.631248474121 29.628580093384 0.00009006104001253 0.0026683807373047
tritonbench_softmax_fwd[x_5824.0-triton_softmax]_speedup 1 1 4.7046537399292 4.7039470672607 0.00015022972375166 0.00070667266845703
tritonbench_softmax_fwd[x_average-triton_softmax]_speedup 1 1 4.6013903617859 4.6006407737732 0.00016293121970498 0.00074958801269531
tritonbench_softmax_fwd[triton_softmax]-speedup-avg 1 1 4.6013903617859 4.6006407737732 0.00016293121970498 0.00074958801269531
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 3584)-tinygemm]_tflops 1 1 29.290222167969 29.285425186157 0.00016380099592308 0.0047969818115234
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 1024)-tinygemm]_tflops 1 1 27.700441360474 27.695796966553 0.00016769309532805 0.0046443939208984
tritonbench_flex_attention_fwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-eager]_tflops 1 1 15.898512840271 15.895292282104 0.00020261081767774 0.0032205581665039
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 1024)-tinygemm]_tflops 1 1 27.71559715271 27.709959030151 0.00020346917700091 0.0056381225585938
tritonbench_addmm_fwd[x_(35503, 512, 1536)-triton_addmm]_tflops 1 1 426.27874755859 426.17468261719 0.00024418377170407 0.10406494140625
tritonbench_cross_entropy_bwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 1 1 2.1176381111145 2.1171009540558 0.00025372293073071 0.00053715705871582
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 8192, 16384)-_triton]_tflops 1 1 2.4049909114838 2.404301404953 0.0002867804050446 0.00068950653076172
tritonbench_rope_fwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 1 1 2.7601881027222 2.759259223938 0.00033664063750197 0.00092887878417969
tritonbench_addmm_fwd[x_(20211, 512, 1536)-triton_addmm]_speedup 1 1 0.97277027368546 0.97239744663239 0.0003834101522595 0.00037282705307007
tritonbench_geglu_bwd[liger_geglu]-speedup-avg 1 1 1.0043283700943 1.0039280653 0.00039873852335417 0.00040030479431152
tritonbench_geglu_bwd[x_average-liger_geglu]_speedup 1 1 1.0043283700943 1.0039280653 0.00039873852335417 0.00040030479431152
tritonbench_addmm_fwd[x_(20211, 512, 1536)-aten_addmm]_tflops 1 1 403.13772583008 402.97424316406 0.00040569011243001 0.16348266601562
tritonbench_softmax_fwd[x_7616.0-triton_softmax]_speedup 1 1 4.748733997345 4.7467141151428 0.00042553272709318 0.0020198822021484
tritonbench_int4_gemm_fwd[x_(16, 4096, 1280, 8192)-tinygemm]_tflops 1 1 29.663259506226 29.64923286438 0.00047308616414675 0.014026641845703
tritonbench_addmm_fwd[x_(35844, 512, 1536)-triton_addmm]_speedup 1 1 0.96374332904816 0.96328711509705 0.00047360121812164 0.00045621395111084
tritonbench_addmm_fwd[x_(33894, 512, 1536)-triton_addmm]_tflops 1 1 411.78656005859 411.58316040039 0.00049418848430353 0.20339965820312
tritonbench_geglu_bwd[x_(8, 2048, 4096)-liger_geglu]_speedup 1 1 1.0048246383667 1.0042968988419 0.00052548158363318 0.00052773952484131
tritonbench_layer_norm_fwd[x_12800-liger_layer_norm]_speedup 1 1 1.5407487154007 1.5398589372635 0.00057783094001342 0.00088977813720703
tritonbench_flex_attention_bwd[x_ (8, 16, 512, 16, 512, 128) | noop-eager]_tflops 1 1 18.300754547119 18.289531707764 0.00061362092451524 0.011222839355469
tritonbench_softmax_fwd[x_2496.0-triton_softmax]_speedup 1 1 3.7682402133942 3.7656903266907 0.00067713658911833 0.0025498867034912
tritonbench_fused_linear_cross_entropy_fwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 1 1 0.31438690423965 0.31417319178581 0.00068023771419638 0.00021371245384216
tritonbench_flex_attention_bwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-eager]_tflops 1 1 26.981693267822 26.961772918701 0.00073883676645302 0.019920349121094
tritonbench_addmm_fwd[x_(35678, 512, 1536)-aten_addmm]_tflops 1 1 433.88967895508 433.56793212891 0.00074209092123588 0.32174682617188
tritonbench_addmm_fwd[x_(33660, 512, 1536)-triton_addmm]_speedup 1 1 1.0721590518951 1.0713270902634 0.0007765710765051 0.0008319616317749
tritonbench_addmm_fwd[x_(20211, 512, 1536)-triton_addmm]_tflops 1 1 392.16040039062 391.85113525391 0.00078924139525169 0.30926513671875
tritonbench_addmm_fwd[x_(19632, 512, 1536)-aten_addmm]_tflops 1 1 390.47985839844 390.16418457031 0.00080907946092656 0.315673828125
tritonbench_int4_gemm_fwd[x_average-tinygemm]_tflops 1 1 21.576118469238 21.558313369751 0.00082590410399579 0.017805099487305
tritonbench_int4_gemm_fwd[tinygemm]-tflops-avg 1 1 21.576118469238 21.558313369751 0.00082590410399579 0.017805099487305
tritonbench_softmax_fwd[x_6848.0-triton_softmax]_speedup 1 1 4.7963123321533 4.7923483848572 0.00082714088747551 0.0039639472961426
tritonbench_softmax_fwd[x_5376.0-triton_softmax]_speedup 1 1 4.7339677810669 4.7299752235413 0.00084409692164214 0.0039925575256348
tritonbench_cross_entropy_bwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 1 1 2.1033871173859 2.1014928817749 0.00090137617281009 0.0018942356109619
tritonbench_softmax_fwd[x_6528.0-triton_softmax]_speedup 1 1 4.8149876594543 4.810601234436 0.00091182469810861 0.0043864250183105
tritonbench_softmax_fwd[x_6912.0-triton_softmax]_speedup 1 1 4.7793526649475 4.774956703186 0.00092062861188698 0.0043959617614746
tritonbench_kl_div_bwd[x_(8, 512, 4096)-liger_kl_div]_speedup 1 1 0.91202181577682 0.91116416454315 0.0009412697152144 0.0008576512336731
tritonbench_layer_norm_fwd[x_3072-liger_layer_norm]_speedup 1 1 1.2606941461563 1.2594681978226 0.00097338570029772 0.0012259483337402
tritonbench_kl_div_bwd[x_(8, 512, 16384)-liger_kl_div]_speedup 1 1 1.0149542093277 1.0138709545135 0.0010684346063229 0.0010832548141479
tritonbench_addmm_fwd[x_(35561, 512, 1536)-triton_addmm]_tflops 1 1 426.97512817383 426.45443725586 0.001220976668268 0.52069091796875
tritonbench_layer_norm_fwd[x_15872-liger_layer_norm]_speedup 1 1 1.6051309108734 1.6031517982483 0.0012345135546644 0.0019791126251221
tritonbench_softmax_fwd[x_7680.0-triton_softmax]_speedup 1 1 4.7407875061035 4.7349338531494 0.0012362692142379 0.0058536529541016
tritonbench_embedding_fwd[x_average-liger_embedding]_speedup 1 1 1.0626790523529 1.0613275766373 0.0012733822670652 0.0013514757156372
tritonbench_embedding_fwd[liger_embedding]-speedup-avg 1 1 1.0626790523529 1.0613275766373 0.0012733822670652 0.0013514757156372
tritonbench_addmm_fwd[x_(33961, 512, 1536)-aten_addmm]_tflops 1 1 426.29425048828 425.75073242188 0.0012766109956276 0.54351806640625
tritonbench_addmm_fwd[x_(34308, 512, 1536)-aten_addmm]_tflops 1 1 443.44262695312 442.86053466797 0.0013143918673915 0.58209228515625
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 8192, 2048)-_cutlass]_tflops 1 1 754.03216552734 752.974609375 0.0014045044005157 1.0575561523438
tritonbench_fused_linear_cross_entropy_bwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 1 1 85.001419067383 84.879432678223 0.0014371725318029 0.12198638916016
tritonbench_fused_linear_cross_entropy_fwd[liger_lm_head_ce]-speedup-avg 1 1 0.31769463419914 0.31723618507385 0.00144513503459 0.00045844912528992
tritonbench_fused_linear_cross_entropy_fwd[x_average-liger_lm_head_ce]_speedup 1 1 0.31769463419914 0.31723618507385 0.00144513503459 0.00045844912528992
tritonbench_softmax_fwd[x_4416.0-triton_softmax]_speedup 1 1 4.7143659591675 4.7074527740479 0.0014685617575903 0.0069131851196289
tritonbench_addmm_fwd[x_(35791, 512, 1536)-triton_addmm]_speedup 1 1 0.98874205350876 0.98723614215851 0.0015253810977358 0.0015059113502502
tritonbench_addmm_fwd[x_(20116, 512, 1536)-triton_addmm]_tflops 1 1 391.24346923828 390.62539672852 0.0015822640180131 0.61807250976562
tritonbench_layer_norm_fwd[x_12288-liger_layer_norm]_speedup 1 1 1.5310435295105 1.5285838842392 0.0016091006170234 0.0024596452713013
tritonbench_softmax_fwd[x_3072.0-triton_softmax]_speedup 1 1 4.4426422119141 4.4354839324951 0.001613866610248 0.0071582794189453
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 2304, 2048)-_triton]_speedup 1 1 0.32097947597504 0.32044562697411 0.0016659581407673 0.00053384900093079
tritonbench_addmm_fwd[x_(36032, 512, 1536)-aten_addmm]_tflops 1 1 432.9475402832 432.20822143555 0.0017105617408217 0.73931884765625
tritonbench_kl_div_bwd[x_(8, 512, 131072)-liger_kl_div]_speedup 1 1 1.0521993637085 1.0503873825073 0.0017250599458331 0.0018119812011719
tritonbench_layer_norm_fwd[x_11776-liger_layer_norm]_speedup 1 1 1.5060023069382 1.5033905506134 0.0017372440738718 0.0026117563247681
tritonbench_addmm_fwd[x_(34181, 512, 1536)-triton_addmm]_speedup 1 1 0.96850395202637 0.96671587228775 0.0018496435094062 0.0017880797386169
tritonbench_addmm_fwd[x_(35380, 512, 1536)-triton_addmm]_speedup 1 1 0.96451300382614 0.9627268910408 0.0018552642519504 0.0017861127853394
tritonbench_addmm_fwd[x_(35410, 512, 1536)-triton_addmm]_speedup 1 1 0.98218202590942 0.98033910989761 0.0018798760482001 0.0018429160118103
tritonbench_addmm_fwd[x_(35916, 512, 1536)-triton_addmm]_tflops 1 1 431.34292602539 430.5016784668 0.0019541098227301 0.84124755859375
tritonbench_softmax_fwd[x_3008.0-triton_softmax]_speedup 1 1 4.4129900932312 4.403892993927 0.0020656949014756 0.0090970993041992
tritonbench_kl_div_bwd[x_average-liger_kl_div]_speedup 1 1 1.0077267885208 1.005628824234 0.002086221313716 0.0020979642868042
tritonbench_kl_div_bwd[liger_kl_div]-speedup-avg 1 1 1.0077267885208 1.005628824234 0.002086221313716 0.0020979642868042
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 8192, 6656)-_triton]_speedup 1 1 0.69043576717377 0.68897634744644 0.0021182435837377 0.0014594197273254
tritonbench_addmm_fwd[x_(35916, 512, 1536)-triton_addmm]_speedup 1 1 0.98387885093689 0.98171627521515 0.002202852062595 0.0021625757217407
tritonbench_softmax_fwd[x_3840.0-triton_softmax]_speedup 1 1 4.7897810935974 4.7790694236755 0.0022413714830777 0.010711669921875
tritonbench_softmax_fwd[x_4672.0-triton_softmax]_speedup 1 1 4.7278776168823 4.717125415802 0.0022793969064938 0.010752201080322
tritonbench_embedding_fwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1 1 1.1008973121643 1.0982532501221 0.0024075157910459 0.0026440620422363
tritonbench_addmm_fwd[x_(35503, 512, 1536)-aten_addmm]_tflops 1 1 431.01516723633 429.95355224609 0.0024691387818253 1.0616149902344
tritonbench_cross_entropy_bwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1 1 1.5365635156631 1.5327256917953 0.002503920883131 0.0038378238677979
tritonbench_softmax_fwd[x_5248.0-triton_softmax]_speedup 1 1 4.7398023605347 4.7278161048889 0.0025352626624706 0.011986255645752
tritonbench_geglu_fwd[liger_geglu]-speedup-avg 1 1 0.98137181997299 0.97881311178207 0.0026140926803274 0.002558708190918
tritonbench_geglu_fwd[x_average-liger_geglu]_speedup 1 1 0.98137181997299 0.97881311178207 0.0026140926803274 0.002558708190918
tritonbench_fused_linear_cross_entropy_bwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 1 1 558.11975097656 556.66003417969 0.0026222769863946 1.459716796875
tritonbench_jsd_bwd[x_(4, 2048, 4096)-liger_jsd]_speedup 1 1 5.7843570709229 5.7690997123718 0.0026446688931907 0.015257358551025
tritonbench_embedding_fwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1 1 1.0563923120499 1.053574681282 0.0026743531501665 0.0028176307678223
tritonbench_softmax_fwd[x_5312.0-triton_softmax]_speedup 1 1 4.72811460495 4.7154197692871 0.0026921963014888 0.012694835662842
tritonbench_fused_linear_jsd_fwd[x_(8192, 4096)-liger_lm_head_jsd]_speedup 1 1 0.44636809825897 0.4451567530632 0.0027211654938059 0.0012113451957703
tritonbench_kl_div_bwd[x_(8, 512, 65536)-liger_kl_div]_speedup 1 1 1.0487148761749 1.0458581447601 0.0027314712125229 0.0028567314147949
tritonbench_softmax_fwd[x_3776.0-triton_softmax]_speedup 1 1 4.7573251724243 4.7442035675049 0.0027658182733366 0.013121604919434
tritonbench_geglu_fwd[x_(8, 4096, 4096)-liger_geglu]_speedup 1 1 0.96102768182755 0.95834523439407 0.0027990408228697 0.0026824474334717
tritonbench_softmax_fwd[x_8256.0-triton_softmax]_speedup 1 1 4.7324299812317 4.7190856933594 0.0028277273903062 0.013344287872314
tritonbench_layer_norm_bwd[x_3584-liger_layer_norm]_speedup 1 1 1.2481770515442 1.2446568012238 0.0028282899486617 0.0035202503204346
tritonbench_addmm_fwd[x_(35917, 512, 1536)-aten_addmm]_tflops 1 1 428.84094238281 427.59494018555 0.0029139778799182 1.2460021972656
tritonbench_embedding_fwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1 1 1.0150814056396 1.0121160745621 0.0029298330024634 0.0029653310775757
tritonbench_kl_div_fwd[x_(8, 512, 8192)-liger_kl_div]_speedup 1 1 3.879988193512 3.8685746192932 0.0029503306364646 0.01141357421875
tritonbench_layer_norm_bwd[x_10240-liger_layer_norm]_speedup 1 1 0.15419176220894 0.15373694896698 0.0029583860289585 0.00045481324195862
tritonbench_geglu_fwd[x_(8, 2048, 4096)-liger_geglu]_speedup 1 1 0.94575214385986 0.94280004501343 0.0031312035484613 0.0029520988464355
tritonbench_layer_norm_bwd[x_9216-liger_layer_norm]_speedup 1 1 0.14446890354156 0.14401112496853 0.0031787722867677 0.00045777857303619
tritonbench_addmm_fwd[x_(35901, 512, 1536)-triton_addmm]_tflops 1 1 432.11267089844 430.7419128418 0.0031823187290903 1.3707580566406
tritonbench_geglu_bwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1 1 1.0057621002197 1.0025110244751 0.0032429326613452 0.0032510757446289
tritonbench_addmm_fwd[x_(15168, 512, 1536)-triton_addmm]_tflops 1 1 405.9772644043 404.65557861328 0.0032661993578463 1.3216857910156
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 1024)-tinygemm]_tflops 1 1 6.9212937355042 6.8985266685486 0.0033002796175834 0.022767066955566
tritonbench_softmax_fwd[x_4992.0-triton_softmax]_speedup 1 1 4.7355527877808 4.7197246551514 0.0033536135655962 0.015828132629395
tritonbench_layer_norm_fwd[x_5632-liger_layer_norm]_speedup 1 1 1.4522352218628 1.4473438262939 0.0033795670938622 0.0048913955688477
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 8192, 13312)-_triton]_tflops 1 1 57.791149139404 57.592765808105 0.00344458767547 0.19838333129883
tritonbench_jsd_bwd[x_(4, 2048, 8192)-liger_jsd]_speedup 1 1 6.2010579109192 6.1789107322693 0.0035843176264449 0.022147178649902
tritonbench_fused_linear_cross_entropy_fwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 1 1 0.27612632513046 0.27513551712036 0.003601163602836 0.00099080801010132
tritonbench_layer_norm_fwd[x_6144-liger_layer_norm]_speedup 1 1 1.5052447319031 1.499564409256 0.0037879817712619 0.0056803226470947
tritonbench_fused_linear_jsd_fwd[x_(4096, 4096)-liger_lm_head_jsd]_speedup 1 1 0.37309375405312 0.37165760993958 0.0038641590408283 0.0014361441135406
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 4096, 6656)-_cutlass]_tflops 1 1 26.468908309937 26.366516113281 0.0038834177490631 0.10239219665527
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 3584)-tinygemm]_tflops 1 1 27.091236114502 26.985410690308 0.0039215791602662 0.10582542419434
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_cutlass]_tflops 1 1 121.34855651855 120.87033081055 0.0039565185666397 0.47822570800781
tritonbench_softmax_fwd[x_6016.0-triton_softmax]_speedup 1 1 4.7411432266235 4.7222995758057 0.0039903548081564 0.018843650817871
tritonbench_addmm_fwd[x_(35249, 512, 1536)-aten_addmm]_tflops 1 1 435.0207824707 433.28070068359 0.0040160611454976 1.7400817871094
tritonbench_flex_attention_fwd[x_average-compiled]_speedup 1 1 35.26859664917 35.127269744873 0.0040232817786103 0.14132690429688
tritonbench_flex_attention_fwd[compiled]-speedup-avg 1 1 45.34534072876 45.163631439209 0.0040233542733464 0.18170928955078
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 8192, 6656)-_cutlass]_tflops 1 1 22.346702575684 22.255491256714 0.0040983736515908 0.091211318969727
tritonbench_addmm_fwd[x_(35605, 512, 1536)-triton_addmm]_speedup 1 1 0.99926996231079 0.99518299102783 0.0041067535516638 0.004086971282959
tritonbench_addmm_fwd[x_(20116, 512, 1536)-triton_addmm]_speedup 1 1 0.93710452318192 0.93325436115265 0.0041255226758449 0.0038501620292664
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 13312, 13312)-_cutlass]_tflops 1 1 61.659481048584 61.403099060059 0.0041753916732219 0.25638198852539
tritonbench_softmax_fwd[x_4928.0-triton_softmax]_speedup 1 1 4.7366433143616 4.7164945602417 0.0042719765415865 0.020148754119873
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 13312, 6656)-_cutlass]_tflops 1 1 956.03118896484 951.93872070312 0.0042990879273153 4.0924682617188
tritonbench_addmm_fwd[x_(19632, 512, 1536)-triton_addmm]_speedup 1 1 0.97862237691879 0.97439938783646 0.0043339406151651 0.0042229890823364
tritonbench_fused_linear_cross_entropy_fwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 1 1 0.33266890048981 0.3312264084816 0.0043550030168847 0.0014424920082092
tritonbench_softmax_fwd[x_7488.0-triton_softmax]_speedup 1 1 4.7796564102173 4.7587189674377 0.0043998065283553 0.020937442779541
tritonbench_low_mem_dropout_fwd[x_32768-triton_dropout]_speedup 1 1 1.3006535768509 1.2948718070984 0.0044651290736328 0.0057817697525024
tritonbench_softmax_fwd[x_7168.0-triton_softmax]_speedup 1 1 4.7505497932434 4.7290358543396 0.0045493287778874 0.021513938903809
tritonbench_fp8_gemm_blockwise_fwd[x_average-_cutlass]_tflops 1 1 310.30810546875 308.90057373047 0.0045565850567484 1.4075317382812
tritonbench_fp8_gemm_blockwise_fwd[_cutlass]-tflops-avg 1 1 310.30810546875 308.90057373047 0.0045565850567484 1.4075317382812
tritonbench_layer_norm_bwd[x_12800-liger_layer_norm]_speedup 1 1 0.17960648238659 0.17878346145153 0.0046034511714705 0.00082302093505859
tritonbench_layer_norm_fwd[x_11264-liger_layer_norm]_speedup 1 1 1.4942582845688 1.4874007701874 0.0046104012575876 0.0068575143814087
tritonbench_addmm_fwd[x_(35561, 512, 1536)-triton_addmm]_speedup 1 1 0.98656892776489 0.98195117712021 0.0047026275361535 0.0046177506446838
tritonbench_addmm_fwd[x_(33894, 512, 1536)-aten_addmm]_tflops 1 1 435.57244873047 433.53283691406 0.0047046305210107 2.0396118164062
tritonbench_fp8_gemm_blockwise_fwd[x_average-_triton]_tflops 1 1 338.03369140625 336.43594360352 0.0047490401460116 1.5977478027344
tritonbench_fp8_gemm_blockwise_fwd[_triton]-tflops-avg 1 1 338.03369140625 336.43594360352 0.0047490401460116 1.5977478027344
tritonbench_geglu_fwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1 1 1.0143437385559 1.00949883461 0.0047993160366497 0.0048449039459229
tritonbench_softmax_fwd[x_8320.0-triton_softmax]_speedup 1 1 4.7263035774231 4.7037205696106 0.0048010946820273 0.0225830078125
tritonbench_layer_norm_bwd[x_11776-liger_layer_norm]_speedup 1 1 0.16982352733612 0.16898182034492 0.0049810505619929 0.00084170699119568
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 13312, 2304)-_cutlass]_tflops 1 1 835.38006591797 831.22351074219 0.0050005264794182 4.1565551757812
tritonbench_welford_fwd[x_3072-test_welford]_speedup 1 1 0.63485509157181 0.6316431760788 0.0050850157409296 0.0032119154930115
tritonbench_softmax_fwd[x_5888.0-triton_softmax]_speedup 1 1 4.7194061279297 4.6952924728394 0.0051357088466419 0.024113655090332
tritonbench_addmm_fwd[x_(19632, 512, 1536)-triton_addmm]_tflops 1 1 382.13229370117 380.17572021484 0.0051464977437866 1.9565734863281
tritonbench_layer_norm_fwd[liger_layer_norm]-speedup-avg 1 1 1.4466648101807 1.4392035007477 0.0051843324652193 0.0074613094329834
tritonbench_layer_norm_fwd[x_average-liger_layer_norm]_speedup 1 1 1.4466648101807 1.4392035007477 0.0051843324652193 0.0074613094329834
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 4096, 16384)-_cutlass]_tflops 1 1 930.60339355469 925.77667236328 0.0052136993029699 4.8267211914062
tritonbench_softmax_fwd[x_7296.0-triton_softmax]_speedup 1 1 4.7907791137695 4.765398979187 0.0053259201786394 0.02538013458252
tritonbench_softmax_fwd[x_7744.0-triton_softmax]_speedup 1 1 4.7621541023254 4.7358260154724 0.0055593441919131 0.026328086853027
tritonbench_embedding_fwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1 1 1.0932704210281 1.0871613025665 0.0056193303120583 0.0061091184616089
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 13312, 2048)-_cutlass]_tflops 1 1 155.12362670898 154.24597167969 0.0056899705045098 0.87765502929688
tritonbench_softmax_fwd[x_6592.0-triton_softmax]_speedup 1 1 4.8254308700562 4.7980446815491 0.0057077810493083 0.02738618850708
tritonbench_layer_norm_fwd[x_9728-liger_layer_norm]_speedup 1 1 1.3799800872803 1.3721449375153 0.0057101473399763 0.0078351497650146
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 4096, 2048)-_cutlass]_tflops 1 1 63.913204193115 63.550060272217 0.005714297033597 0.36314392089844
tritonbench_softmax_fwd[x_5952.0-triton_softmax]_speedup 1 1 4.7139296531677 4.6870794296265 0.0057285616649769 0.02685022354126
tritonbench_addmm_fwd[x_(15168, 512, 1536)-triton_addmm]_speedup 1 1 1.058247089386 1.0520889759064 0.0058532249844259 0.0061581134796143
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 8192, 16384)-_triton]_speedup 1 1 0.66771787405014 0.66380047798157 0.0059014661762292 0.003917396068573
tritonbench_addmm_fwd[x_(34238, 512, 1536)-aten_addmm]_tflops 1 1 434.76602172852 432.19869995117 0.0059401422948144 2.5673217773438
tritonbench_addmm_fwd[x_(20067, 512, 1536)-triton_addmm]_tflops 1 1 390.7541809082 388.4465637207 0.0059406296850632 2.3076171875
tritonbench_softmax_fwd[x_6976.0-triton_softmax]_speedup 1 1 4.7836909294128 4.7551832199097 0.0059950811955707 0.028507709503174
tritonbench_softmax_fwd[x_6272.0-triton_softmax]_speedup 1 1 4.8474683761597 4.8185377120972 0.0060040339603171 0.0289306640625
tritonbench_embedding_fwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1 1 1.02791929245 1.0216972827911 0.0060898758992642 0.0062220096588135
tritonbench_layer_norm_bwd[x_11264-liger_layer_norm]_speedup 1 1 0.16335791349411 0.16234821081161 0.0062193643985812 0.0010097026824951
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 8192, 6656)-_triton]_tflops 1 1 15.42896270752 15.333506584167 0.006225329009257 0.095456123352051
tritonbench_layer_norm_fwd[x_13824-liger_layer_norm]_speedup 1 1 1.5709091424942 1.5611670017242 0.0062402938053384 0.0097421407699585
tritonbench_addmm_fwd[x_(35605, 512, 1536)-aten_addmm]_tflops 1 1 426.35815429688 423.67538452148 0.0063321351048531 2.6827697753906
tritonbench_softmax_fwd[x_4608.0-triton_softmax]_speedup 1 1 4.73885679245 4.7087230682373 0.0063995532920408 0.030133724212646
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 8192, 2304)-_cutlass]_tflops 1 1 247.53269958496 245.92008972168 0.0065574547614484 1.6126098632812
tritonbench_layer_norm_fwd[x_8192-liger_layer_norm]_speedup 1 1 1.5726380348206 1.5623528957367 0.0065831087918281 0.010285139083862
tritonbench_layer_norm_fwd[x_10752-liger_layer_norm]_speedup 1 1 1.4465304613113 1.4370265007019 0.0066136293274995 0.009503960609436
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 13312, 2048)-_cutlass]_tflops 1 1 9.0514535903931 8.9917469024658 0.0066401655401213 0.059706687927246
tritonbench_embedding_fwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1 1 1.1679855585098 1.1602795124054 0.0066415428541487 0.0077060461044312
tritonbench_addmm_fwd[x_(20203, 512, 1536)-triton_addmm]_speedup 1 1 0.97142863273621 0.96500200033188 0.006659708893989 0.0064266324043274
tritonbench_layer_norm_bwd[x_12288-liger_layer_norm]_speedup 1 1 0.17285060882568 0.17169557511806 0.0067272188396496 0.0011550337076187
tritonbench_jsd_fwd[x_(4, 2048, 65536)-liger_jsd]_speedup 1 1 0.58741188049316 0.58348619937897 0.006727975945918 0.0039256811141968
tritonbench_addmm_fwd[x_(35405, 512, 1536)-triton_addmm]_speedup 1 1 0.98945820331573 0.9827926158905 0.0067822929450811 0.0066655874252319
tritonbench_low_mem_dropout_fwd[x_128-triton_dropout]_speedup 1 1 1.1777778863907 1.1698113679886 0.006810088036499 0.0079665184020996
tritonbench_layer_norm_fwd[x_6656-liger_layer_norm]_speedup 1 1 1.5326796770096 1.5223033428192 0.0068162066642725 0.010376334190369
tritonbench_softmax_fwd[x_7808.0-triton_softmax]_speedup 1 1 4.7610359191895 4.7274060249329 0.007113815500345 0.033629894256592
tritonbench_addmm_fwd[x_(35844, 512, 1536)-aten_addmm]_tflops 1 1 447.98620605469 444.82025146484 0.0071173796143901 3.1659545898438
tritonbench_welford_fwd[test_welford]-speedup-avg 1 1 0.62961786985397 0.62512421607971 0.007188417371578 0.0044936537742615
tritonbench_welford_fwd[x_average-test_welford]_speedup 1 1 0.62961786985397 0.62512421607971 0.007188417371578 0.0044936537742615
tritonbench_jsd_fwd[x_(4, 2048, 131072)-liger_jsd]_speedup 1 1 0.58466857671738 0.58046960830688 0.0072337437660854 0.0041989684104919
tritonbench_gemm_fwd[x_(2816, 2816, 2816)-triton_tutorial_matmul]_speedup 1 1 0.91614627838135 0.90929508209229 0.0075346237145569 0.0068511962890625
tritonbench_addmm_fwd[x_(35844, 512, 1536)-triton_addmm]_tflops 1 1 431.74371337891 428.48962402344 0.0075943247468012 3.2540893554688
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 13312, 2304)-_triton]_speedup 1 1 1.1847976446152 1.1757529973984 0.0076926422784464 0.0090446472167969
tritonbench_embedding_fwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1 1 1.1233299970627 1.1147540807724 0.0076931014994276 0.0085759162902832
tritonbench_layer_norm_fwd[x_13312-liger_layer_norm]_speedup 1 1 1.5643110275269 1.5521242618561 0.0078516688194817 0.012186765670776
tritonbench_welford_fwd[x_2560-test_welford]_speedup 1 1 0.62982392311096 0.62474364042282 0.0081317877597002 0.0050802826881409
tritonbench_addmm_fwd[x_(35901, 512, 1536)-triton_addmm]_speedup 1 1 0.99436968564987 0.98633480072021 0.0081462044366578 0.008034884929657
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 2304, 2048)-_cutlass]_tflops 1 1 0.60806596279144 0.60309201478958 0.0082474114726871 0.0049739480018616
tritonbench_softmax_fwd[x_4288.0-triton_softmax]_speedup 1 1 4.7428908348083 4.7028603553772 0.00851194303173 0.040030479431152
tritonbench_layer_norm_bwd[x_15360-liger_layer_norm]_speedup 1 1 0.20112508535385 0.19937302172184 0.0087878671691893 0.0017520636320114
tritonbench_kl_div_bwd[x_(8, 512, 8192)-liger_kl_div]_speedup 1 1 0.98327893018723 0.97453808784485 0.0089692157252742 0.0087408423423767
tritonbench_fused_linear_cross_entropy_bwd[x_average-liger_lm_head_ce]_speedup 1 1 278.9919128418 276.50030517578 0.0090112293526462 2.4916076660156
tritonbench_fused_linear_cross_entropy_bwd[liger_lm_head_ce]-speedup-avg 1 1 278.9919128418 276.50030517578 0.0090112293526462 2.4916076660156
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 4096, 2304)-_cutlass]_tflops 1 1 4.2896289825439 4.2509841918945 0.0090907867225428 0.038644790649414
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 8192, 13312)-_cutlass]_tflops 1 1 110.93784332275 109.9313583374 0.0091555767214524 1.0064849853516
tritonbench_layer_norm_bwd[x_8704-liger_layer_norm]_speedup 1 1 0.14031882584095 0.13901317119598 0.0093923088994595 0.0013056546449661
tritonbench_layer_norm_bwd[x_1536-liger_layer_norm]_speedup 1 1 0.628637611866 0.62275296449661 0.0094494088424637 0.0058846473693848
tritonbench_cross_entropy_fwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 1 1 0.78207284212112 0.77467256784439 0.0095527769846368 0.0074002742767334
tritonbench_layer_norm_bwd[x_14848-liger_layer_norm]_speedup 1 1 0.1967901289463 0.19489553570747 0.0097210704799018 0.0018945932388306
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 2304, 13312)-_cutlass]_tflops 1 1 901.77294921875 892.953125 0.0098771413323068 8.81982421875
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 2304, 2048)-_triton]_tflops 1 1 0.19517670571804 0.19325819611549 0.0099271836388257 0.0019185096025467
tritonbench_addmm_fwd[x_(20224, 512, 1536)-aten_addmm]_tflops 1 1 411.408203125 407.36322021484 0.0099296713827599 4.0449829101562
tritonbench_embedding_fwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 1 1 1.0869565010071 1.0760413408279 0.010143811176195 0.010915160179138
tritonbench_gemm_fwd[x_(3456, 3456, 3456)-triton_tutorial_matmul]_speedup 1 1 0.92233914136887 0.9130026102066 0.010226182332764 0.009336531162262
tritonbench_softmax_fwd[x_4480.0-triton_softmax]_speedup 1 1 4.7839775085449 4.7355508804321 0.010226186844048 0.048426628112793
tritonbench_layer_norm_bwd[x_10752-liger_layer_norm]_speedup 1 1 0.1592473089695 0.15762677788734 0.010280810810658 0.0016205310821533
tritonbench_addmm_fwd[x_(35605, 512, 1536)-triton_addmm]_tflops 1 1 426.04690551758 421.63455200195 0.010464876501878 4.412353515625
tritonbench_swiglu_bwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1 1 1.0346910953522 1.0239633321762 0.010476706380848 0.010727763175964
tritonbench_gemm_fwd[x_(2048, 2048, 2048)-triton_tutorial_matmul]_speedup 1 1 0.96544277667999 0.95528894662857 0.010629066825548 0.010153830051422
tritonbench_layer_norm_fwd[x_10240-liger_layer_norm]_speedup 1 1 1.4120078086853 1.3971049785614 0.010666936524159 0.014902830123901
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 13312, 2048)-_triton]_speedup 1 1 0.41280093789101 0.40843445062637 0.010690790793815 0.0043664872646332
tritonbench_addmm_fwd[x_(34533, 512, 1536)-triton_addmm]_speedup 1 1 0.99434614181519 0.98345673084259 0.011072587772383 0.010889410972595
tritonbench_flex_attention_fwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-eager]_tflops 1 1 11.484484672546 11.357600212097 0.011171766753515 0.12688446044922
tritonbench_layer_norm_fwd[x_9216-liger_layer_norm]_speedup 1 1 1.3294117450714 1.3143794536591 0.011436797319454 0.015032291412354
tritonbench_gemm_fwd[x_(2560, 2560, 2560)-triton_tutorial_matmul]_speedup 1 1 0.80434787273407 0.79514420032501 0.011574846933796 0.0092036724090576
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 2304, 16384)-_cutlass]_tflops 1 1 37.081275939941 36.649257659912 0.011787913524422 0.4320182800293
tritonbench_gemm_fwd[x_(2944, 2944, 2944)-triton_tutorial_matmul]_speedup 1 1 0.71892529726028 0.71019679307938 0.012290261327514 0.0087285041809082
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 13312, 2304)-_triton]_tflops 1 1 989.75634765625 977.31353759766 0.012731646068445 12.442810058594
tritonbench_layer_norm_fwd[x_8704-liger_layer_norm]_speedup 1 1 1.2826479673386 1.2664103507996 0.012821765495481 0.016237616539001
tritonbench_addmm_fwd[x_(35541, 512, 1536)-aten_addmm]_tflops 1 1 435.45474243164 429.88433837891 0.012957913455839 5.5704040527344
tritonbench_addmm_fwd[x_(20067, 512, 1536)-aten_addmm]_tflops 1 1 410.93472290039 405.52990722656 0.013327785639269 5.4048156738281
tritonbench_layer_norm_bwd[x_15872-liger_layer_norm]_speedup 1 1 0.20774175226688 0.2049452662468 0.013645038362197 0.0027964860200882
tritonbench_swiglu_fwd[liger_swiglu]-speedup-avg 1 1 1.0763176679611 1.061812877655 0.013660401574828 0.014504790306091
tritonbench_swiglu_fwd[x_average-liger_swiglu]_speedup 1 1 1.0763176679611 1.061812877655 0.013660401574828 0.014504790306091
tritonbench_layer_norm_fwd[x_7168-liger_layer_norm]_speedup 1 1 1.5588009357452 1.53737616539 0.013935932426655 0.021424770355225
tritonbench_layer_norm_bwd[x_14336-liger_layer_norm]_speedup 1 1 0.19314520061016 0.19044855237007 0.014159457798605 0.0026966482400894
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 1024)-tinygemm]_tflops 1 1 23.763761520386 23.431865692139 0.014164293727512 0.33189582824707
tritonbench_swiglu_bwd[x_average-liger_swiglu]_speedup 1 1 1.0506772994995 1.0357780456543 0.014384600936201 0.014899253845215
tritonbench_swiglu_bwd[liger_swiglu]-speedup-avg 1 1 1.0506772994995 1.0357780456543 0.014384600936201 0.014899253845215
tritonbench_low_mem_dropout_fwd[x_32768-torch_dropout]_tflops 1 1 0.010291457176208 0.010138614103198 0.015075341802607 0.00015284307301044
tritonbench_layer_norm_bwd[x_13824-liger_layer_norm]_speedup 1 1 0.18860530853271 0.18571864068508 0.015543231616304 0.0028866678476334
tritonbench_softmax_fwd[x_3712.0-triton_softmax]_speedup 1 1 4.7741432189941 4.698703289032 0.016055478569642 0.075439929962158
tritonbench_softmax_fwd[x_3968.0-triton_softmax]_speedup 1 1 4.7921810150146 4.7158250808716 0.016191426279313 0.076355934143066
tritonbench_softmax_fwd[x_4096.0-triton_softmax]_speedup 1 1 4.8334412574768 4.7562065124512 0.016238728243495 0.077234745025635
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 13312, 2048)-_triton]_tflops 1 1 64.035171508789 62.999366760254 0.016441510475446 1.0358047485352
tritonbench_low_mem_dropout_fwd[x_131072-triton_dropout]_speedup 1 1 1.1437125205994 1.125 0.01663335164388 0.018712520599365
tritonbench_welford_fwd[x_8192-test_welford]_speedup 1 1 0.68730229139328 0.6760168671608 0.016693998006116 0.011285424232483
tritonbench_layer_norm_fwd[x_5120-liger_layer_norm]_speedup 1 1 1.3843469619751 1.3613030910492 0.016927803277184 0.023043870925903
tritonbench_softmax_fwd[x_3904.0-triton_softmax]_speedup 1 1 4.7625088691711 4.682758808136 0.017030571998839 0.079750061035156
tritonbench_layer_norm_bwd[x_13312-liger_layer_norm]_speedup 1 1 0.18425744771957 0.1811715811491 0.01703284008949 0.0030858665704727
tritonbench_layer_norm_fwd[x_3584-liger_layer_norm]_speedup 1 1 1.3205400705338 1.2983927726746 0.017057471610513 0.022147297859192
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 3584)-tinygemm]_tflops 1 1 23.027549743652 22.636953353882 0.017254812680149 0.39059638977051
tritonbench_grouped_gemm_fwd[x_256-triton]_speedup 1 1 0.17774686217308 0.17472016811371 0.017323094935452 0.0030266940593719
tritonbench_rms_norm_bwd[x_(2048, 4096)-liger_rms]_speedup 1 1 0.6856546998024 0.67370247840881 0.017741097556617 0.011952221393585
tritonbench_layer_norm_bwd[x_2560-liger_layer_norm]_speedup 1 1 0.97717136144638 0.95987576246262 0.018018580799865 0.017295598983765
tritonbench_softmax_fwd[x_4032.0-triton_softmax]_speedup 1 1 4.8149633407593 4.7282252311707 0.018344749953282 0.086738109588623
tritonbench_welford_fwd[x_6144-test_welford]_speedup 1 1 0.66931861639023 0.65720987319946 0.018424469388777 0.012108743190765
tritonbench_softmax_fwd[x_3520.0-triton_softmax]_speedup 1 1 4.7058825492859 4.6194090843201 0.018719594516827 0.08647346496582
tritonbench_layer_norm_fwd[x_2048-liger_layer_norm]_speedup 1 1 1.3134160041809 1.2892655134201 0.018731976082055 0.024150490760803
tritonbench_low_mem_dropout_fwd[x_512-torch_dropout]_tflops 1 1 0.00020000000949949 0.00019631901523098 0.018750064858342 0.0000036809942685068
tritonbench_softmax_fwd[x_3136.0-triton_softmax]_speedup 1 1 4.2041449546814 4.1261353492737 0.018906215818016 0.078009605407715
tritonbench_addmm_fwd[x_(19747, 512, 1536)-triton_addmm]_speedup 1 1 0.95395517349243 0.93603491783142 0.019144858081286 0.017920255661011
tritonbench_low_mem_dropout_fwd[x_32768-triton_dropout]_tflops 1 1 0.013385620899498 0.01312820520252 0.019607836182295 0.00025741569697857
tritonbench_rms_norm_fwd[x_(2048, 4096)-liger_rms]_speedup 1 1 4.2865428924561 4.2004504203796 0.020496009584761 0.086092472076416
tritonbench_softmax_fwd[x_3584.0-triton_softmax]_speedup 1 1 4.7383332252502 4.6401305198669 0.021163780838242 0.098202705383301
tritonbench_welford_fwd[x_4096-test_welford]_speedup 1 1 0.60431778430939 0.59127241373062 0.022063215323131 0.013045370578766
tritonbench_low_mem_dropout_fwd[x_524288-triton_dropout]_tflops 1 1 0.14499114453793 0.14185281097889 0.022123872888943 0.0031383335590363
tritonbench_softmax_fwd[x_3328.0-triton_softmax]_speedup 1 1 4.5794034004211 4.4784641265869 0.022538814866238 0.10093927383423
tritonbench_softmax_fwd[x_3648.0-triton_softmax]_speedup 1 1 4.7673664093018 4.659574508667 0.023133421395938 0.10779190063477
tritonbench_gemm_fwd[x_(1408, 1408, 1408)-triton_tutorial_matmul]_speedup 1 1 0.71864950656891 0.70172679424286 0.024115813255084 0.01692271232605
tritonbench_embedding_fwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1 1 1.0256623029709 1.0008090734482 0.024833137690365 0.024853229522705
tritonbench_embedding_fwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 1 1 1.0267640352249 1.0008952617645 0.025845634851726 0.025868773460388
tritonbench_welford_fwd[x_5120-test_welford]_speedup 1 1 0.68712955713272 0.66927117109299 0.026683333768238 0.017858386039734
tritonbench_rms_norm_bwd[x_(2048, 8192)-liger_rms]_speedup 1 1 1.1387900114059 1.1091717481613 0.026703045126895 0.029618263244629
tritonbench_swiglu_bwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1 1 1.0493154525757 1.0216253995895 0.027103919888122 0.027690052986145
tritonbench_embedding_fwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 1 1 1.08231985569 1.0529681444168 0.027875212967103 0.029351711273193
tritonbench_fused_linear_cross_entropy_bwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 1 1 309.77059936523 301.21765136719 0.028394577672411 8.5529479980469
tritonbench_layer_norm_fwd[x_4096-liger_layer_norm]_speedup 1 1 1.382669210434 1.3443751335144 0.028484666195401 0.038294076919556
tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 1 1 69.525161743164 67.595268249512 0.028550718765234 1.9298934936523
tritonbench_swiglu_bwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 1 1 1.2085978984833 1.1743993759155 0.029120010849026 0.034198522567749
tritonbench_gemm_fwd[x_(1152, 1152, 1152)-triton_tutorial_matmul]_speedup 1 1 0.6414048075676 0.62298023700714 0.0295748877187 0.018424570560455
tritonbench_low_mem_dropout_fwd[x_2048-triton_dropout]_tflops 1 1 0.00081012659939006 0.00078527606092393 0.031645608089577 0.000024850538466126
tritonbench_welford_fwd[x_7168-test_welford]_speedup 1 1 0.67386239767075 0.65312790870667 0.031746444590218 0.020734488964081
tritonbench_gemm_fwd[x_(4096, 4096, 4096)-triton_tutorial_matmul]_speedup 1 1 0.93079632520676 0.90207272768021 0.031841775773908 0.02872359752655
tritonbench_rms_norm_bwd[liger_rms]-speedup-avg 1 1 0.67494732141495 0.65355312824249 0.03273520123756 0.021394193172455
tritonbench_rms_norm_bwd[x_average-liger_rms]_speedup 1 1 0.67494732141495 0.65355312824249 0.03273520123756 0.021394193172455
tritonbench_layer_norm_bwd[x_3072-liger_layer_norm]_speedup 1 1 1.1580902338028 1.1213291883469 0.032783455418768 0.036761045455933
tritonbench_low_mem_dropout_fwd[x_8192-triton_dropout]_tflops 1 1 0.0035310345701873 0.0034133330918849 0.034482857410638 0.00011770147830248
tritonbench_layer_norm_bwd[x_4096-liger_layer_norm]_speedup 1 1 1.3873783349991 1.3410683870316 0.03453213006537 0.046309947967529
tritonbench_rope_bwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 1 1 2.2819972038269 2.2028198242188 0.03594364765454 0.079177379608154
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 8192, 2304)-_triton]_speedup 1 1 0.3853442966938 0.37189581990242 0.036161946630404 0.013448476791382
tritonbench_softmax_fwd[x_2752.0-triton_softmax]_speedup 1 1 4.084876537323 3.9392590522766 0.036965704238778 0.14561748504639
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 1024)-tinygemm]_tflops 1 1 16.980989456177 16.352062225342 0.03846164613172 0.62892723083496
tritonbench_low_mem_dropout_fwd[x_average-triton_dropout]_speedup 1 1 1.1832580566406 1.1369156837463 0.040761486147839 0.046342372894287
tritonbench_low_mem_dropout_fwd[triton_dropout]-speedup-avg 1 1 1.1832580566406 1.1369156837463 0.040761486147839 0.046342372894287
tritonbench_rms_norm_bwd[x_(2048, 32768)-liger_rms]_speedup 1 1 0.41984125971794 0.40266972780228 0.04264420871513 0.017171531915665
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 8192, 2304)-_triton]_tflops 1 1 95.385314941406 91.456657409668 0.042956495929437 3.9286575317383
tritonbench_low_mem_dropout_fwd[x_average-triton_dropout]_tflops 1 1 0.026509527117014 0.025263078510761 0.049338745700439 0.0012464486062527
tritonbench_low_mem_dropout_fwd[triton_dropout]-tflops-avg 1 1 0.026509527117014 0.025263078510761 0.049338745700439 0.0012464486062527
tritonbench_layer_norm_bwd[x_2048-liger_layer_norm]_speedup 1 1 0.82723355293274 0.78804343938828 0.049730905157824 0.039190113544464
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 2304, 2304)-_cutlass]_tflops 1 1 5.1538019180298 4.8970627784729 0.05242716934026 0.25673913955688
tritonbench_softmax_fwd[x_2368.0-triton_softmax]_speedup 1 1 3.7730867862701 3.5819070339203 0.053373733751156 0.19117975234985
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 13312, 6656)-_triton]_speedup 1 1 1.1307787895203 1.0638449192047 0.06291694316272 0.066933870315552
tritonbench_gemm_fwd[x_(768, 768, 768)-triton_tutorial_matmul]_speedup 1 1 0.74530833959579 0.69999998807907 0.064726217554743 0.045308351516724
tritonbench_gemm_fwd[x_(512, 512, 512)-triton_tutorial_matmul]_speedup 1 1 0.84999996423721 0.7978338599205 0.065384670841006 0.052166104316711
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 13312, 6656)-_triton]_tflops 1 1 1081.0598144531 1012.7152099609 0.067486499481748 68.344604492188
tritonbench_softmax_fwd[x_2176.0-triton_softmax]_speedup 1 1 3.6578948497772 3.4172413349152 0.070423330188365 0.24065351486206
tritonbench_rope_bwd[liger_rotary_pos_emb]-speedup-avg 1 1 3.2496359348297 3.0274174213409 0.073402006582344 0.22221851348877
tritonbench_rope_bwd[x_average-liger_rotary_pos_emb]_speedup 1 1 3.2496359348297 3.0274174213409 0.073402006582344 0.22221851348877
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 1024)-tinygemm]_tflops 1 1 1.8204443454742 1.6912516355515 0.076388815955625 0.12919270992279
tritonbench_swiglu_fwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1 1 1.023561835289 0.9506431221962 0.076704613319397 0.072918713092804
tritonbench_embedding_bwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 1 1 1.475198507309 1.3675272464752 0.078734271007222 0.10767126083374
tritonbench_embedding_bwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1 1 1.2366671562195 1.1448746919632 0.08017686555625 0.091792464256287
tritonbench_jsd_fwd[x_(4, 2048, 16384)-liger_jsd]_speedup 1 1 0.66177201271057 0.60781478881836 0.088772476229328 0.053957223892212
tritonbench_low_mem_dropout_fwd[x_524288-torch_dropout]_tflops 1 1 0.13826160132885 0.12554788589478 0.10126586635421 0.012713715434074
tritonbench_low_mem_dropout_fwd[x_average-torch_dropout]_tflops 1 1 0.024426048621535 0.022153681144118 0.10257290707735 0.002272367477417
tritonbench_low_mem_dropout_fwd[torch_dropout]-tflops-avg 1 1 0.024426048621535 0.022153681144118 0.10257290707735 0.002272367477417
tritonbench_rms_norm_bwd[x_(2048, 16384)-liger_rms]_speedup 1 1 1.1314612627029 1.0239211320877 0.10502774798286 0.10754013061523
tritonbench_low_mem_dropout_fwd[x_131072-torch_dropout]_tflops 1 1 0.042890053242445 0.03792592510581 0.13089010018306 0.0049641281366348
tritonbench_embedding_bwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1 1 1.8207459449768 1.5952908992767 0.14132535063184 0.22545504570007
tritonbench_embedding_bwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1 1 1.4066215753555 1.2307544946671 0.14289371393769 0.17586708068848
tritonbench_rope_bwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 1 1 3.6184692382812 3.1576962471008 0.14592061906001 0.46077299118042
tritonbench_low_mem_dropout_fwd[x_131072-triton_dropout]_tflops 1 1 0.049053888767958 0.042666666209698 0.14970053031254 0.00638722255826
tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_tflops 1 1 207.05648803711 179.83586120605 0.15136373050682 27.220626831055
tritonbench_low_mem_dropout_fwd[x_2048-triton_dropout]_speedup 1 1 1.2025316953659 1.0429447889328 0.1530156803376 0.15958690643311
tritonbench_low_mem_dropout_fwd[x_32-triton_dropout]_tflops 1 1 0.000014925372852304 0.000012903225979244 0.15671638056347 0.0000020221468730597
tritonbench_low_mem_dropout_fwd[x_512-triton_dropout]_speedup 1 1 1.1510791778564 0.99390250444412 0.15814093707333 0.15717667341232
tritonbench_low_mem_dropout_fwd[x_8192-torch_dropout]_tflops 1 1 0.0030295855831355 0.0026122450362891 0.15976316962947 0.00041734054684639
tritonbench_low_mem_dropout_fwd[x_128-torch_dropout]_tflops 1 1 0.000050314465624979 0.000043010750232497 0.16981139257048 0.0000073037153924815
tritonbench_low_mem_dropout_fwd[x_128-triton_dropout]_tflops 1 1 0.000059259262343403 0.000050314465624979 0.17777783401487 0.0000089447967184242
tritonbench_low_mem_dropout_fwd[x_512-triton_dropout]_tflops 1 1 0.00023021583911031 0.00019512196013238 0.17985612154635 0.000035093878977932
tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 1 1 68.307334899902 56.955291748047 0.19931498555171 11.352043151855
tritonbench_embedding_bwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1 1 2.2547099590302 1.8318628072739 0.23082905012169 0.42284715175629
tritonbench_low_mem_dropout_fwd[x_32-triton_dropout]_speedup 1 1 1.2761194705963 1.0322580337524 0.23624077398302 0.24386143684387
tritonbench_embedding_bwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1 1 1.6281609535217 1.316552400589 0.23668526432623 0.31160855293274
tritonbench_embedding_bwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 1 1 2.6422991752625 2.0475790500641 0.29045038587387 0.59472012519836
tritonbench_embedding_bwd[liger_embedding]-speedup-avg 1 1 1.7441607713699 1.3459738492966 0.2958355560039 0.39818692207336
tritonbench_embedding_bwd[x_average-liger_embedding]_speedup 1 1 1.7441607713699 1.3459738492966 0.2958355560039 0.39818692207336
tritonbench_embedding_bwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1 1 2.2862944602966 1.7267206907272 0.32406733328349 0.5595737695694
tritonbench_embedding_bwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1 1 2.5408017635345 1.8995307683945 0.33759442374397 0.64127099514008
tritonbench_embedding_bwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1 1 1.8265942335129 1.3648246526718 0.33833619574287 0.46176958084106
tritonbench_embedding_bwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1 1 1.5812239646912 1.1168842315674 0.41574562519532 0.46433973312378
tritonbench_embedding_bwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 1 1 0.87206310033798 0.61563575267792 0.41652445710738 0.25642734766006
tritonbench_embedding_bwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 1 1 0.98931086063385 0.69035649299622 0.43304346474695 0.29895436763763
tritonbench_embedding_bwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 1 1 1.2360243797302 0.85892456769943 0.43903717068057 0.37709981203079
tritonbench_embedding_bwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1 1 2.1009476184845 1.4227942228317 0.47663490951142 0.67815339565277
tritonbench_embedding_bwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 1 1 2.0089087486267 1.3054693937302 0.53884017371452 0.70343935489655
tritonbench_rope_bwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 1 1 2.9531304836273 1.7136546373367 0.72329384187756 1.2394758462906
tritonbench_layer_norm_bwd[liger_layer_norm]-speedup-avg 1 1 0.67075318098068 0.37378814816475 0.79447418082674 0.29696503281593
tritonbench_layer_norm_bwd[x_average-liger_layer_norm]_speedup 1 1 0.67075318098068 0.37378814816475 0.79447418082674 0.29696503281593
tritonbench_layer_norm_bwd[x_8192-liger_layer_norm]_speedup 1 1 1.4726729393005 0.29377841949463 4.0128697057936 1.1788945198059
tritonbench_layer_norm_bwd[x_4608-liger_layer_norm]_speedup 1 1 1.1561365127563 0.22226889431477 4.2015218608101 0.93386761844158
tritonbench_layer_norm_bwd[x_7680-liger_layer_norm]_speedup 1 1 1.4936227798462 0.2869755923748 4.2047031856823 1.2066471874714
tritonbench_layer_norm_bwd[x_5120-liger_layer_norm]_speedup 1 1 1.222773194313 0.23219281435013 4.2661973960538 0.99058037996292
tritonbench_layer_norm_bwd[x_6144-liger_layer_norm]_speedup 1 1 1.3486423492432 0.25371152162552 4.3156527563371 1.0949308276176
tritonbench_layer_norm_bwd[x_7168-liger_layer_norm]_speedup 1 1 1.4414019584656 0.27081799507141 4.3224009655839 1.1705839633942
tritonbench_layer_norm_bwd[x_5632-liger_layer_norm]_speedup 1 1 1.2834794521332 0.24102148413658 4.3251661640498 1.0424579679966
tritonbench_layer_norm_bwd[x_6656-liger_layer_norm]_speedup 1 1 1.3878934383392 0.2594730257988 4.3488929497259 1.1284204125404
tritonbench_fused_linear_jsd_bwd-pass 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_speedup 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-eager]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_speedup 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-eager]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_speedup 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-eager]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_speedup 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_tflops 1 1 0 0 3.4028235e+38 0
tritonbench_flex_attention_bwd[x_(8, 16, 16384, 16, 16384, 128) | noop-eager]_tflops 1 1 0 0 3.4028235e+38 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment