@davidberard98
Last active July 9, 2025 23:42
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];
// @_layer_norm_backward_kernel
.visible .entry _layer_norm_backward_kernel(
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_0,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_1,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_2,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_3,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_4,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_5,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_6,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_7,
.param .u32 _layer_norm_backward_kernel_param_8,
.param .u32 _layer_norm_backward_kernel_param_9,
.param .u32 _layer_norm_backward_kernel_param_10,
.param .u32 _layer_norm_backward_kernel_param_11,
.param .u32 _layer_norm_backward_kernel_param_12,
.param .u32 _layer_norm_backward_kernel_param_13,
.param .u32 _layer_norm_backward_kernel_param_14,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_15
)
.reqntid 128
{
.reg .pred %p<118>;
.reg .b32 %r<1184>;
.reg .b32 %f<1317>;
.reg .b64 %rd<168>;
.loc 1 73 0 // layer_norm.py:73:0
$L__func_begin0:
.loc 1 73 0 // layer_norm.py:73:0
// %bb.0:
ld.param.b32 %r285, [_layer_norm_backward_kernel_param_14];
ld.param.b32 %r283, [_layer_norm_backward_kernel_param_11];
ld.param.b32 %r282, [_layer_norm_backward_kernel_param_10];
ld.param.b64 %rd46, [_layer_norm_backward_kernel_param_6];
ld.param.b64 %rd45, [_layer_norm_backward_kernel_param_5];
$L__tmp0:
.loc 1 100 33 // layer_norm.py:100:33
mov.u32 %r1, %ctaid.x;
.loc 1 101 31 // layer_norm.py:101:31
shl.b32 %r2, %r1, 5;
.loc 1 102 39 // layer_norm.py:102:39
add.s32 %r414, %r2, 32;
ld.param.b32 %r415, [_layer_norm_backward_kernel_param_13];
.loc 1 102 57 // layer_norm.py:102:57
min.s32 %r3, %r414, %r415;
.loc 1 103 24 // layer_norm.py:103:24
mov.u32 %r4, %tid.x;
shl.b32 %r416, %r4, 2;
and.b32 %r417, %r416, 508;
or.b32 %r5, %r417, 512;
or.b32 %r6, %r417, 1024;
or.b32 %r7, %r417, 1536;
or.b32 %r8, %r417, 2048;
or.b32 %r9, %r417, 2560;
or.b32 %r10, %r417, 3072;
or.b32 %r418, %r416, 3584;
or.b32 %r11, %r417, 4096;
or.b32 %r12, %r417, 4608;
or.b32 %r13, %r417, 5120;
or.b32 %r14, %r417, 5632;
or.b32 %r15, %r417, 6144;
or.b32 %r16, %r417, 6656;
or.b32 %r17, %r417, 7168;
or.b32 %r419, %r416, 7680;
.loc 1 117 28 // layer_norm.py:117:28
cvt.u64.u32 %rd1, %r417;
cvt.u64.u32 %rd9, %r418;
cvt.u64.u32 %rd18, %r419;
.loc 1 115 30 // layer_norm.py:115:30
setp.le.s32 %p1, %r3, %r2;
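// The long run of mov.b32 below appears to zero-initialize the 32-bit
// accumulator registers (%r1056-%r1183), likely the partial dW/dB sums
// carried across the row loop, so they hold valid zeros even if the loop
// is skipped.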
mov.b32 %r1056, 0;
shl.b64 %rd161, %rd18, 2;
shl.b64 %rd162, %rd9, 2;
setp.lt.s32 %p87, %r17, %r285;
cvt.u32.u64 %r1052, %rd18;
cvt.u32.u64 %r1053, %rd9;
cvt.u32.u64 %r1054, %rd1;
setp.lt.s32 %p86, %r16, %r285;
setp.lt.s32 %p85, %r15, %r285;
setp.lt.s32 %p84, %r14, %r285;
setp.lt.s32 %p83, %r13, %r285;
setp.lt.s32 %p82, %r12, %r285;
setp.lt.s32 %p81, %r11, %r285;
setp.lt.s32 %p79, %r10, %r285;
setp.lt.s32 %p78, %r9, %r285;
setp.lt.s32 %p77, %r8, %r285;
setp.lt.s32 %p76, %r7, %r285;
setp.lt.s32 %p75, %r6, %r285;
setp.lt.s32 %p74, %r5, %r285;
mov.b32 %r1057, %r1056;
mov.b32 %r1058, %r1056;
mov.b32 %r1059, %r1056;
mov.b32 %r1060, %r1056;
mov.b32 %r1061, %r1056;
mov.b32 %r1062, %r1056;
mov.b32 %r1063, %r1056;
mov.b32 %r1064, %r1056;
mov.b32 %r1065, %r1056;
mov.b32 %r1066, %r1056;
mov.b32 %r1067, %r1056;
mov.b32 %r1068, %r1056;
mov.b32 %r1069, %r1056;
mov.b32 %r1070, %r1056;
mov.b32 %r1071, %r1056;
mov.b32 %r1072, %r1056;
mov.b32 %r1073, %r1056;
mov.b32 %r1074, %r1056;
mov.b32 %r1075, %r1056;
mov.b32 %r1076, %r1056;
mov.b32 %r1077, %r1056;
mov.b32 %r1078, %r1056;
mov.b32 %r1079, %r1056;
mov.b32 %r1080, %r1056;
mov.b32 %r1081, %r1056;
mov.b32 %r1082, %r1056;
mov.b32 %r1083, %r1056;
mov.b32 %r1084, %r1056;
mov.b32 %r1085, %r1056;
mov.b32 %r1086, %r1056;
mov.b32 %r1087, %r1056;
mov.b32 %r1088, %r1056;
mov.b32 %r1089, %r1056;
mov.b32 %r1090, %r1056;
mov.b32 %r1091, %r1056;
mov.b32 %r1092, %r1056;
mov.b32 %r1093, %r1056;
mov.b32 %r1094, %r1056;
mov.b32 %r1095, %r1056;
mov.b32 %r1096, %r1056;
mov.b32 %r1097, %r1056;
mov.b32 %r1098, %r1056;
mov.b32 %r1099, %r1056;
mov.b32 %r1100, %r1056;
mov.b32 %r1101, %r1056;
mov.b32 %r1102, %r1056;
mov.b32 %r1103, %r1056;
mov.b32 %r1104, %r1056;
mov.b32 %r1105, %r1056;
mov.b32 %r1106, %r1056;
mov.b32 %r1107, %r1056;
mov.b32 %r1108, %r1056;
mov.b32 %r1109, %r1056;
mov.b32 %r1110, %r1056;
mov.b32 %r1111, %r1056;
mov.b32 %r1112, %r1056;
mov.b32 %r1113, %r1056;
mov.b32 %r1114, %r1056;
mov.b32 %r1115, %r1056;
mov.b32 %r1116, %r1056;
mov.b32 %r1117, %r1056;
mov.b32 %r1118, %r1056;
mov.b32 %r1119, %r1056;
mov.b32 %r1120, %r1056;
mov.b32 %r1121, %r1056;
mov.b32 %r1122, %r1056;
mov.b32 %r1123, %r1056;
mov.b32 %r1124, %r1056;
mov.b32 %r1125, %r1056;
mov.b32 %r1126, %r1056;
mov.b32 %r1127, %r1056;
mov.b32 %r1128, %r1056;
mov.b32 %r1129, %r1056;
mov.b32 %r1130, %r1056;
mov.b32 %r1131, %r1056;
mov.b32 %r1132, %r1056;
mov.b32 %r1133, %r1056;
mov.b32 %r1134, %r1056;
mov.b32 %r1135, %r1056;
mov.b32 %r1136, %r1056;
mov.b32 %r1137, %r1056;
mov.b32 %r1138, %r1056;
mov.b32 %r1139, %r1056;
mov.b32 %r1140, %r1056;
mov.b32 %r1141, %r1056;
mov.b32 %r1142, %r1056;
mov.b32 %r1143, %r1056;
mov.b32 %r1144, %r1056;
mov.b32 %r1145, %r1056;
mov.b32 %r1146, %r1056;
mov.b32 %r1147, %r1056;
mov.b32 %r1148, %r1056;
mov.b32 %r1149, %r1056;
mov.b32 %r1150, %r1056;
mov.b32 %r1151, %r1056;
mov.b32 %r1152, %r1056;
mov.b32 %r1153, %r1056;
mov.b32 %r1154, %r1056;
mov.b32 %r1155, %r1056;
mov.b32 %r1156, %r1056;
mov.b32 %r1157, %r1056;
mov.b32 %r1158, %r1056;
mov.b32 %r1159, %r1056;
mov.b32 %r1160, %r1056;
mov.b32 %r1161, %r1056;
mov.b32 %r1162, %r1056;
mov.b32 %r1163, %r1056;
mov.b32 %r1164, %r1056;
mov.b32 %r1165, %r1056;
mov.b32 %r1166, %r1056;
mov.b32 %r1167, %r1056;
mov.b32 %r1168, %r1056;
mov.b32 %r1169, %r1056;
mov.b32 %r1170, %r1056;
mov.b32 %r1171, %r1056;
mov.b32 %r1172, %r1056;
mov.b32 %r1173, %r1056;
mov.b32 %r1174, %r1056;
mov.b32 %r1175, %r1056;
mov.b32 %r1176, %r1056;
mov.b32 %r1177, %r1056;
mov.b32 %r1178, %r1056;
mov.b32 %r1179, %r1056;
mov.b32 %r1180, %r1056;
mov.b32 %r1181, %r1056;
mov.b32 %r1182, %r1056;
mov.b32 %r1183, %r1056;
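// %p1 holds min(pid*32 + 32, param_13) <= pid*32, so this branch appears to
// skip the whole row loop when this program has no rows assigned.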
@%p1 bra $L__BB0_4;
// %bb.1: // %.lr.ph
.loc 1 0 30 // layer_norm.py:0:30
ld.param.b32 %r284, [_layer_norm_backward_kernel_param_12];
ld.param.b32 %r281, [_layer_norm_backward_kernel_param_9];
ld.param.b32 %r280, [_layer_norm_backward_kernel_param_8];
ld.param.b64 %rd47, [_layer_norm_backward_kernel_param_7];
ld.param.b64 %rd44, [_layer_norm_backward_kernel_param_4];
ld.param.b64 %rd43, [_layer_norm_backward_kernel_param_3];
ld.param.b64 %rd42, [_layer_norm_backward_kernel_param_2];
ld.param.b64 %rd41, [_layer_norm_backward_kernel_param_0];
ld.param.b64 %rd48, [_layer_norm_backward_kernel_param_1];
mul.wide.u32 %rd49, %r417, 4;
add.s64 %rd72, %rd48, %rd49;
add.s64 %rd73, %rd72, 2048;
add.s64 %rd74, %rd72, 4096;
add.s64 %rd75, %rd72, 6144;
add.s64 %rd76, %rd72, 8192;
add.s64 %rd77, %rd72, 10240;
add.s64 %rd78, %rd72, 12288;
mul.wide.u32 %rd50, %r418, 4;
add.s64 %rd79, %rd48, %rd50;
add.s64 %rd80, %rd72, 16384;
add.s64 %rd81, %rd72, 18432;
add.s64 %rd82, %rd72, 20480;
add.s64 %rd83, %rd72, 22528;
add.s64 %rd84, %rd72, 24576;
add.s64 %rd85, %rd72, 26624;
add.s64 %rd86, %rd72, 28672;
mul.wide.u32 %rd51, %r419, 4;
add.s64 %rd87, %rd48, %rd51;
cvt.rn.f32.s32 %f1, %r285;
.loc 1 113 26 // layer_norm.py:113:26
mul.lo.s32 %r420, %r284, %r2;
.loc 1 113 14 // layer_norm.py:113:14
mul.wide.s32 %rd52, %r420, 4;
add.s64 %rd165, %rd47, %rd52;
.loc 1 112 26 // layer_norm.py:112:26
mul.lo.s32 %r421, %r281, %r2;
.loc 1 112 14 // layer_norm.py:112:14
mul.wide.s32 %rd53, %r421, 4;
add.s64 %rd164, %rd44, %rd53;
.loc 1 111 16 // layer_norm.py:111:16
mul.wide.s32 %rd54, %r2, 4;
add.s64 %rd166, %rd43, %rd54;
.loc 1 110 16 // layer_norm.py:110:16
add.s64 %rd167, %rd42, %rd54;
.loc 1 109 25 // layer_norm.py:109:25
mul.lo.s32 %r422, %r280, %r2;
.loc 1 109 13 // layer_norm.py:109:13
mul.wide.s32 %rd55, %r422, 4;
add.s64 %rd163, %rd41, %rd55;
.loc 1 103 24 // layer_norm.py:103:24
and.b32 %r18, %r4, 31;
shr.u32 %r423, %r4, 3;
and.b32 %r424, %r423, 12;
mov.b32 %r425, global_smem;
add.s32 %r814, %r425, %r424;
add.s32 %r823, %r425, %r416;
.loc 1 115 30 // layer_norm.py:115:30
mul.wide.s32 %rd26, %r284, 4;
and.b32 %r427, %r4, 127;
mul.wide.u32 %rd28, %r427, 16;
mul.wide.s32 %rd29, %r281, 4;
mul.wide.s32 %rd30, %r280, 4;
sub.s32 %r1055, %r3, %r2;
mov.b32 %f1189, 0f00000000;
setp.eq.s32 %p52, %r4, 0;
setp.lt.u32 %p51, %r4, 4;
setp.eq.s32 %p50, %r18, 0;
mov.b32 %f1190, %f1189;
mov.b32 %f1191, %f1189;
mov.b32 %f1192, %f1189;
mov.b32 %f1193, %f1189;
mov.b32 %f1194, %f1189;
mov.b32 %f1195, %f1189;
mov.b32 %f1196, %f1189;
mov.b32 %f1197, %f1189;
mov.b32 %f1198, %f1189;
mov.b32 %f1199, %f1189;
mov.b32 %f1200, %f1189;
mov.b32 %f1201, %f1189;
mov.b32 %f1202, %f1189;
mov.b32 %f1203, %f1189;
mov.b32 %f1204, %f1189;
mov.b32 %f1205, %f1189;
mov.b32 %f1206, %f1189;
mov.b32 %f1207, %f1189;
mov.b32 %f1208, %f1189;
mov.b32 %f1209, %f1189;
mov.b32 %f1210, %f1189;
mov.b32 %f1211, %f1189;
mov.b32 %f1212, %f1189;
mov.b32 %f1213, %f1189;
mov.b32 %f1214, %f1189;
mov.b32 %f1215, %f1189;
mov.b32 %f1216, %f1189;
mov.b32 %f1217, %f1189;
mov.b32 %f1218, %f1189;
mov.b32 %f1219, %f1189;
mov.b32 %f1220, %f1189;
mov.b32 %f1221, %f1189;
mov.b32 %f1222, %f1189;
mov.b32 %f1223, %f1189;
mov.b32 %f1224, %f1189;
mov.b32 %f1225, %f1189;
mov.b32 %f1226, %f1189;
mov.b32 %f1227, %f1189;
mov.b32 %f1228, %f1189;
mov.b32 %f1229, %f1189;
mov.b32 %f1230, %f1189;
mov.b32 %f1231, %f1189;
mov.b32 %f1232, %f1189;
mov.b32 %f1233, %f1189;
mov.b32 %f1234, %f1189;
mov.b32 %f1235, %f1189;
mov.b32 %f1236, %f1189;
mov.b32 %f1237, %f1189;
mov.b32 %f1238, %f1189;
mov.b32 %f1239, %f1189;
mov.b32 %f1240, %f1189;
mov.b32 %f1241, %f1189;
mov.b32 %f1242, %f1189;
mov.b32 %f1243, %f1189;
mov.b32 %f1244, %f1189;
mov.b32 %f1245, %f1189;
mov.b32 %f1246, %f1189;
mov.b32 %f1247, %f1189;
mov.b32 %f1248, %f1189;
mov.b32 %f1249, %f1189;
mov.b32 %f1250, %f1189;
mov.b32 %f1251, %f1189;
mov.b32 %f1252, %f1189;
mov.b32 %f1253, %f1189;
mov.b32 %f1254, %f1189;
mov.b32 %f1255, %f1189;
mov.b32 %f1256, %f1189;
mov.b32 %f1257, %f1189;
mov.b32 %f1258, %f1189;
mov.b32 %f1259, %f1189;
mov.b32 %f1260, %f1189;
mov.b32 %f1261, %f1189;
mov.b32 %f1262, %f1189;
mov.b32 %f1263, %f1189;
mov.b32 %f1264, %f1189;
mov.b32 %f1265, %f1189;
mov.b32 %f1266, %f1189;
mov.b32 %f1267, %f1189;
mov.b32 %f1268, %f1189;
mov.b32 %f1269, %f1189;
mov.b32 %f1270, %f1189;
mov.b32 %f1271, %f1189;
mov.b32 %f1272, %f1189;
mov.b32 %f1273, %f1189;
mov.b32 %f1274, %f1189;
mov.b32 %f1275, %f1189;
mov.b32 %f1276, %f1189;
mov.b32 %f1277, %f1189;
mov.b32 %f1278, %f1189;
mov.b32 %f1279, %f1189;
mov.b32 %f1280, %f1189;
mov.b32 %f1281, %f1189;
mov.b32 %f1282, %f1189;
mov.b32 %f1283, %f1189;
mov.b32 %f1284, %f1189;
mov.b32 %f1285, %f1189;
mov.b32 %f1286, %f1189;
mov.b32 %f1287, %f1189;
mov.b32 %f1288, %f1189;
mov.b32 %f1289, %f1189;
mov.b32 %f1290, %f1189;
mov.b32 %f1291, %f1189;
mov.b32 %f1292, %f1189;
mov.b32 %f1293, %f1189;
mov.b32 %f1294, %f1189;
mov.b32 %f1295, %f1189;
mov.b32 %f1296, %f1189;
mov.b32 %f1297, %f1189;
mov.b32 %f1298, %f1189;
mov.b32 %f1299, %f1189;
mov.b32 %f1300, %f1189;
mov.b32 %f1301, %f1189;
mov.b32 %f1302, %f1189;
mov.b32 %f1303, %f1189;
mov.b32 %f1304, %f1189;
mov.b32 %f1305, %f1189;
mov.b32 %f1306, %f1189;
mov.b32 %f1307, %f1189;
mov.b32 %f1308, %f1189;
mov.b32 %f1309, %f1189;
mov.b32 %f1310, %f1189;
mov.b32 %f1311, %f1189;
mov.b32 %f1312, %f1189;
mov.b32 %f1313, %f1189;
mov.b32 %f1314, %f1189;
mov.b32 %f1315, %f1189;
mov.b32 %f1316, %f1189;
$L__BB0_2: // =>This Inner Loop Header: Depth=1
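// Row-loop body: predicated 128-bit vector loads of the row's inputs,
// elementwise math, two tl.sum reductions, a predicated dX store, and the
// running dW/dB accumulation. The %p2/%p9/%p17/%p74-%p87 predicates appear
// to mask off columns at or beyond param_14 (N).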
.loc 1 104 18 // layer_norm.py:104:18
setp.lt.s32 %p17, %r1052, %r285;
setp.lt.s32 %p9, %r1053, %r285;
setp.lt.s32 %p2, %r1054, %r285;
.loc 1 116 28 // layer_norm.py:116:28
add.s64 %rd56, %rd163, %rd28;
add.s64 %rd57, %rd56, 2048;
add.s64 %rd58, %rd56, 4096;
add.s64 %rd59, %rd56, 6144;
add.s64 %rd60, %rd56, 8192;
add.s64 %rd61, %rd56, 10240;
add.s64 %rd62, %rd56, 12288;
add.s64 %rd63, %rd163, %rd162;
add.s64 %rd64, %rd56, 16384;
add.s64 %rd65, %rd56, 18432;
add.s64 %rd66, %rd56, 20480;
add.s64 %rd67, %rd56, 22528;
add.s64 %rd68, %rd56, 24576;
add.s64 %rd69, %rd56, 26624;
add.s64 %rd70, %rd56, 28672;
.loc 1 116 20 // layer_norm.py:116:20
add.s64 %rd71, %rd163, %rd161;
mov.b32 %r432, 0;
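// Masked loads (Triton tl.load with a mask): each begin/end inline asm block
// first fills the destination registers with zero (%r432 holds the zero
// constant), then issues the vector load predicated on the column mask.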
// begin inline asm
mov.u32 %r428, %r432;
mov.u32 %r429, %r432;
mov.u32 %r430, %r432;
mov.u32 %r431, %r432;
@%p2 ld.global.v4.b32 { %r428, %r429, %r430, %r431 }, [ %rd56 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r436, %r432;
mov.u32 %r437, %r432;
mov.u32 %r438, %r432;
mov.u32 %r439, %r432;
@%p74 ld.global.v4.b32 { %r436, %r437, %r438, %r439 }, [ %rd57 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r444, %r432;
mov.u32 %r445, %r432;
mov.u32 %r446, %r432;
mov.u32 %r447, %r432;
@%p75 ld.global.v4.b32 { %r444, %r445, %r446, %r447 }, [ %rd58 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r452, %r432;
mov.u32 %r453, %r432;
mov.u32 %r454, %r432;
mov.u32 %r455, %r432;
@%p76 ld.global.v4.b32 { %r452, %r453, %r454, %r455 }, [ %rd59 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r460, %r432;
mov.u32 %r461, %r432;
mov.u32 %r462, %r432;
mov.u32 %r463, %r432;
@%p77 ld.global.v4.b32 { %r460, %r461, %r462, %r463 }, [ %rd60 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r468, %r432;
mov.u32 %r469, %r432;
mov.u32 %r470, %r432;
mov.u32 %r471, %r432;
@%p78 ld.global.v4.b32 { %r468, %r469, %r470, %r471 }, [ %rd61 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r476, %r432;
mov.u32 %r477, %r432;
mov.u32 %r478, %r432;
mov.u32 %r479, %r432;
@%p79 ld.global.v4.b32 { %r476, %r477, %r478, %r479 }, [ %rd62 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r484, %r432;
mov.u32 %r485, %r432;
mov.u32 %r486, %r432;
mov.u32 %r487, %r432;
@%p9 ld.global.v4.b32 { %r484, %r485, %r486, %r487 }, [ %rd63 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r492, %r432;
mov.u32 %r493, %r432;
mov.u32 %r494, %r432;
mov.u32 %r495, %r432;
@%p81 ld.global.v4.b32 { %r492, %r493, %r494, %r495 }, [ %rd64 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r500, %r432;
mov.u32 %r501, %r432;
mov.u32 %r502, %r432;
mov.u32 %r503, %r432;
@%p82 ld.global.v4.b32 { %r500, %r501, %r502, %r503 }, [ %rd65 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r508, %r432;
mov.u32 %r509, %r432;
mov.u32 %r510, %r432;
mov.u32 %r511, %r432;
@%p83 ld.global.v4.b32 { %r508, %r509, %r510, %r511 }, [ %rd66 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r516, %r432;
mov.u32 %r517, %r432;
mov.u32 %r518, %r432;
mov.u32 %r519, %r432;
@%p84 ld.global.v4.b32 { %r516, %r517, %r518, %r519 }, [ %rd67 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r524, %r432;
mov.u32 %r525, %r432;
mov.u32 %r526, %r432;
mov.u32 %r527, %r432;
@%p85 ld.global.v4.b32 { %r524, %r525, %r526, %r527 }, [ %rd68 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r532, %r432;
mov.u32 %r533, %r432;
mov.u32 %r534, %r432;
mov.u32 %r535, %r432;
@%p86 ld.global.v4.b32 { %r532, %r533, %r534, %r535 }, [ %rd69 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r540, %r432;
mov.u32 %r541, %r432;
mov.u32 %r542, %r432;
mov.u32 %r543, %r432;
@%p87 ld.global.v4.b32 { %r540, %r541, %r542, %r543 }, [ %rd70 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r548, %r432;
mov.u32 %r549, %r432;
mov.u32 %r550, %r432;
mov.u32 %r551, %r432;
@%p17 ld.global.v4.b32 { %r548, %r549, %r550, %r551 }, [ %rd71 + 0 ];
// end inline asm
.loc 1 117 20 // layer_norm.py:117:20
// begin inline asm
mov.u32 %r556, %r432;
mov.u32 %r557, %r432;
mov.u32 %r558, %r432;
mov.u32 %r559, %r432;
@%p2 ld.global.v4.b32 { %r556, %r557, %r558, %r559 }, [ %rd72 + 0 ];
// end inline asm
mov.b32 %f386, %r556;
mov.b32 %f387, %r557;
mov.b32 %f388, %r558;
mov.b32 %f389, %r559;
// begin inline asm
mov.u32 %r564, %r432;
mov.u32 %r565, %r432;
mov.u32 %r566, %r432;
mov.u32 %r567, %r432;
@%p74 ld.global.v4.b32 { %r564, %r565, %r566, %r567 }, [ %rd73 + 0 ];
// end inline asm
mov.b32 %f390, %r564;
mov.b32 %f391, %r565;
mov.b32 %f392, %r566;
mov.b32 %f393, %r567;
// begin inline asm
mov.u32 %r572, %r432;
mov.u32 %r573, %r432;
mov.u32 %r574, %r432;
mov.u32 %r575, %r432;
@%p75 ld.global.v4.b32 { %r572, %r573, %r574, %r575 }, [ %rd74 + 0 ];
// end inline asm
mov.b32 %f394, %r572;
mov.b32 %f395, %r573;
mov.b32 %f396, %r574;
mov.b32 %f397, %r575;
// begin inline asm
mov.u32 %r580, %r432;
mov.u32 %r581, %r432;
mov.u32 %r582, %r432;
mov.u32 %r583, %r432;
@%p76 ld.global.v4.b32 { %r580, %r581, %r582, %r583 }, [ %rd75 + 0 ];
// end inline asm
mov.b32 %f398, %r580;
mov.b32 %f399, %r581;
mov.b32 %f400, %r582;
mov.b32 %f401, %r583;
// begin inline asm
mov.u32 %r588, %r432;
mov.u32 %r589, %r432;
mov.u32 %r590, %r432;
mov.u32 %r591, %r432;
@%p77 ld.global.v4.b32 { %r588, %r589, %r590, %r591 }, [ %rd76 + 0 ];
// end inline asm
mov.b32 %f402, %r588;
mov.b32 %f403, %r589;
mov.b32 %f404, %r590;
mov.b32 %f405, %r591;
// begin inline asm
mov.u32 %r596, %r432;
mov.u32 %r597, %r432;
mov.u32 %r598, %r432;
mov.u32 %r599, %r432;
@%p78 ld.global.v4.b32 { %r596, %r597, %r598, %r599 }, [ %rd77 + 0 ];
// end inline asm
mov.b32 %f406, %r596;
mov.b32 %f407, %r597;
mov.b32 %f408, %r598;
mov.b32 %f409, %r599;
// begin inline asm
mov.u32 %r604, %r432;
mov.u32 %r605, %r432;
mov.u32 %r606, %r432;
mov.u32 %r607, %r432;
@%p79 ld.global.v4.b32 { %r604, %r605, %r606, %r607 }, [ %rd78 + 0 ];
// end inline asm
mov.b32 %f410, %r604;
mov.b32 %f411, %r605;
mov.b32 %f412, %r606;
mov.b32 %f413, %r607;
// begin inline asm
mov.u32 %r612, %r432;
mov.u32 %r613, %r432;
mov.u32 %r614, %r432;
mov.u32 %r615, %r432;
@%p9 ld.global.v4.b32 { %r612, %r613, %r614, %r615 }, [ %rd79 + 0 ];
// end inline asm
mov.b32 %f414, %r612;
mov.b32 %f415, %r613;
mov.b32 %f416, %r614;
mov.b32 %f417, %r615;
// begin inline asm
mov.u32 %r620, %r432;
mov.u32 %r621, %r432;
mov.u32 %r622, %r432;
mov.u32 %r623, %r432;
@%p81 ld.global.v4.b32 { %r620, %r621, %r622, %r623 }, [ %rd80 + 0 ];
// end inline asm
mov.b32 %f418, %r620;
mov.b32 %f419, %r621;
mov.b32 %f420, %r622;
mov.b32 %f421, %r623;
// begin inline asm
mov.u32 %r628, %r432;
mov.u32 %r629, %r432;
mov.u32 %r630, %r432;
mov.u32 %r631, %r432;
@%p82 ld.global.v4.b32 { %r628, %r629, %r630, %r631 }, [ %rd81 + 0 ];
// end inline asm
mov.b32 %f422, %r628;
mov.b32 %f423, %r629;
mov.b32 %f424, %r630;
mov.b32 %f425, %r631;
// begin inline asm
mov.u32 %r636, %r432;
mov.u32 %r637, %r432;
mov.u32 %r638, %r432;
mov.u32 %r639, %r432;
@%p83 ld.global.v4.b32 { %r636, %r637, %r638, %r639 }, [ %rd82 + 0 ];
// end inline asm
mov.b32 %f426, %r636;
mov.b32 %f427, %r637;
mov.b32 %f428, %r638;
mov.b32 %f429, %r639;
// begin inline asm
mov.u32 %r644, %r432;
mov.u32 %r645, %r432;
mov.u32 %r646, %r432;
mov.u32 %r647, %r432;
@%p84 ld.global.v4.b32 { %r644, %r645, %r646, %r647 }, [ %rd83 + 0 ];
// end inline asm
mov.b32 %f430, %r644;
mov.b32 %f431, %r645;
mov.b32 %f432, %r646;
mov.b32 %f433, %r647;
// begin inline asm
mov.u32 %r652, %r432;
mov.u32 %r653, %r432;
mov.u32 %r654, %r432;
mov.u32 %r655, %r432;
@%p85 ld.global.v4.b32 { %r652, %r653, %r654, %r655 }, [ %rd84 + 0 ];
// end inline asm
mov.b32 %f434, %r652;
mov.b32 %f435, %r653;
mov.b32 %f436, %r654;
mov.b32 %f437, %r655;
// begin inline asm
mov.u32 %r660, %r432;
mov.u32 %r661, %r432;
mov.u32 %r662, %r432;
mov.u32 %r663, %r432;
@%p86 ld.global.v4.b32 { %r660, %r661, %r662, %r663 }, [ %rd85 + 0 ];
// end inline asm
mov.b32 %f438, %r660;
mov.b32 %f439, %r661;
mov.b32 %f440, %r662;
mov.b32 %f441, %r663;
// begin inline asm
mov.u32 %r668, %r432;
mov.u32 %r669, %r432;
mov.u32 %r670, %r432;
mov.u32 %r671, %r432;
@%p87 ld.global.v4.b32 { %r668, %r669, %r670, %r671 }, [ %rd86 + 0 ];
// end inline asm
mov.b32 %f442, %r668;
mov.b32 %f443, %r669;
mov.b32 %f444, %r670;
mov.b32 %f445, %r671;
// begin inline asm
mov.u32 %r676, %r432;
mov.u32 %r677, %r432;
mov.u32 %r678, %r432;
mov.u32 %r679, %r432;
@%p17 ld.global.v4.b32 { %r676, %r677, %r678, %r679 }, [ %rd87 + 0 ];
// end inline asm
mov.b32 %f446, %r676;
mov.b32 %f447, %r677;
mov.b32 %f448, %r678;
mov.b32 %f449, %r679;
.loc 1 118 30 // layer_norm.py:118:30
add.s64 %rd88, %rd165, %rd28;
add.s64 %rd89, %rd88, 2048;
add.s64 %rd90, %rd88, 4096;
add.s64 %rd91, %rd88, 6144;
add.s64 %rd92, %rd88, 8192;
add.s64 %rd93, %rd88, 10240;
add.s64 %rd94, %rd88, 12288;
add.s64 %rd95, %rd165, %rd162;
add.s64 %rd96, %rd88, 16384;
add.s64 %rd97, %rd88, 18432;
add.s64 %rd98, %rd88, 20480;
add.s64 %rd99, %rd88, 22528;
add.s64 %rd100, %rd88, 24576;
add.s64 %rd101, %rd88, 26624;
add.s64 %rd102, %rd88, 28672;
.loc 1 118 21 // layer_norm.py:118:21
add.s64 %rd103, %rd165, %rd161;
// begin inline asm
mov.u32 %r684, %r432;
mov.u32 %r685, %r432;
mov.u32 %r686, %r432;
mov.u32 %r687, %r432;
@%p2 ld.global.v4.b32 { %r684, %r685, %r686, %r687 }, [ %rd88 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r692, %r432;
mov.u32 %r693, %r432;
mov.u32 %r694, %r432;
mov.u32 %r695, %r432;
@%p74 ld.global.v4.b32 { %r692, %r693, %r694, %r695 }, [ %rd89 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r700, %r432;
mov.u32 %r701, %r432;
mov.u32 %r702, %r432;
mov.u32 %r703, %r432;
@%p75 ld.global.v4.b32 { %r700, %r701, %r702, %r703 }, [ %rd90 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r708, %r432;
mov.u32 %r709, %r432;
mov.u32 %r710, %r432;
mov.u32 %r711, %r432;
@%p76 ld.global.v4.b32 { %r708, %r709, %r710, %r711 }, [ %rd91 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r716, %r432;
mov.u32 %r717, %r432;
mov.u32 %r718, %r432;
mov.u32 %r719, %r432;
@%p77 ld.global.v4.b32 { %r716, %r717, %r718, %r719 }, [ %rd92 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r724, %r432;
mov.u32 %r725, %r432;
mov.u32 %r726, %r432;
mov.u32 %r727, %r432;
@%p78 ld.global.v4.b32 { %r724, %r725, %r726, %r727 }, [ %rd93 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r732, %r432;
mov.u32 %r733, %r432;
mov.u32 %r734, %r432;
mov.u32 %r735, %r432;
@%p79 ld.global.v4.b32 { %r732, %r733, %r734, %r735 }, [ %rd94 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r740, %r432;
mov.u32 %r741, %r432;
mov.u32 %r742, %r432;
mov.u32 %r743, %r432;
@%p9 ld.global.v4.b32 { %r740, %r741, %r742, %r743 }, [ %rd95 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r748, %r432;
mov.u32 %r749, %r432;
mov.u32 %r750, %r432;
mov.u32 %r751, %r432;
@%p81 ld.global.v4.b32 { %r748, %r749, %r750, %r751 }, [ %rd96 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r756, %r432;
mov.u32 %r757, %r432;
mov.u32 %r758, %r432;
mov.u32 %r759, %r432;
@%p82 ld.global.v4.b32 { %r756, %r757, %r758, %r759 }, [ %rd97 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r764, %r432;
mov.u32 %r765, %r432;
mov.u32 %r766, %r432;
mov.u32 %r767, %r432;
@%p83 ld.global.v4.b32 { %r764, %r765, %r766, %r767 }, [ %rd98 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r772, %r432;
mov.u32 %r773, %r432;
mov.u32 %r774, %r432;
mov.u32 %r775, %r432;
@%p84 ld.global.v4.b32 { %r772, %r773, %r774, %r775 }, [ %rd99 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r780, %r432;
mov.u32 %r781, %r432;
mov.u32 %r782, %r432;
mov.u32 %r783, %r432;
@%p85 ld.global.v4.b32 { %r780, %r781, %r782, %r783 }, [ %rd100 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r788, %r432;
mov.u32 %r789, %r432;
mov.u32 %r790, %r432;
mov.u32 %r791, %r432;
@%p86 ld.global.v4.b32 { %r788, %r789, %r790, %r791 }, [ %rd101 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r796, %r432;
mov.u32 %r797, %r432;
mov.u32 %r798, %r432;
mov.u32 %r799, %r432;
@%p87 ld.global.v4.b32 { %r796, %r797, %r798, %r799 }, [ %rd102 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r804, %r432;
mov.u32 %r805, %r432;
mov.u32 %r806, %r432;
mov.u32 %r807, %r432;
@%p17 ld.global.v4.b32 { %r804, %r805, %r806, %r807 }, [ %rd103 + 0 ];
// end inline asm
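// layer_norm.py:119-120: the two scalar loads below appear to fetch this
// row's statistics, likely the per-row mean and reciprocal standard
// deviation (rstd).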
.loc 1 119 23 // layer_norm.py:119:23
// begin inline asm
mov.u32 %r812, 0x0;
ld.global.b32 { %r812 }, [ %rd167 + 0 ];
// end inline asm
.loc 1 120 23 // layer_norm.py:120:23
// begin inline asm
mov.u32 %r813, 0x0;
ld.global.b32 { %r813 }, [ %rd166 + 0 ];
// end inline asm
mov.b32 %f450, %r813;
$L__tmp1:
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
bar.sync 0;
$L__tmp2:
.loc 1 127 26 // layer_norm.py:127:26
add.s64 %rd106, %rd164, %rd28;
add.s64 %rd107, %rd106, 2048;
add.s64 %rd108, %rd106, 4096;
add.s64 %rd109, %rd106, 6144;
add.s64 %rd110, %rd106, 8192;
add.s64 %rd111, %rd106, 10240;
add.s64 %rd112, %rd106, 12288;
add.s64 %rd113, %rd164, %rd162;
add.s64 %rd114, %rd106, 16384;
add.s64 %rd115, %rd106, 18432;
add.s64 %rd116, %rd106, 20480;
add.s64 %rd117, %rd106, 22528;
add.s64 %rd118, %rd106, 24576;
add.s64 %rd119, %rd106, 26624;
add.s64 %rd120, %rd106, 28672;
.loc 1 116 20 // layer_norm.py:116:20
add.s64 %rd121, %rd164, %rd161;
mov.b32 %f451, %r551;
mov.b32 %f452, %r550;
mov.b32 %f453, %r549;
mov.b32 %f454, %r548;
mov.b32 %f455, %r543;
mov.b32 %f456, %r542;
mov.b32 %f457, %r541;
mov.b32 %f458, %r540;
mov.b32 %f459, %r535;
mov.b32 %f460, %r534;
mov.b32 %f461, %r533;
mov.b32 %f462, %r532;
mov.b32 %f463, %r527;
mov.b32 %f464, %r526;
mov.b32 %f465, %r525;
mov.b32 %f466, %r524;
mov.b32 %f467, %r519;
mov.b32 %f468, %r518;
mov.b32 %f469, %r517;
mov.b32 %f470, %r516;
mov.b32 %f471, %r511;
mov.b32 %f472, %r510;
mov.b32 %f473, %r509;
mov.b32 %f474, %r508;
mov.b32 %f475, %r503;
mov.b32 %f476, %r502;
mov.b32 %f477, %r501;
mov.b32 %f478, %r500;
mov.b32 %f479, %r495;
mov.b32 %f480, %r494;
mov.b32 %f481, %r493;
mov.b32 %f482, %r492;
mov.b32 %f483, %r487;
mov.b32 %f484, %r486;
mov.b32 %f485, %r485;
mov.b32 %f486, %r484;
mov.b32 %f487, %r479;
mov.b32 %f488, %r478;
mov.b32 %f489, %r477;
mov.b32 %f490, %r476;
mov.b32 %f491, %r471;
mov.b32 %f492, %r470;
mov.b32 %f493, %r469;
mov.b32 %f494, %r468;
mov.b32 %f495, %r463;
mov.b32 %f496, %r462;
mov.b32 %f497, %r461;
mov.b32 %f498, %r460;
mov.b32 %f499, %r455;
mov.b32 %f500, %r454;
mov.b32 %f501, %r453;
mov.b32 %f502, %r452;
mov.b32 %f503, %r447;
mov.b32 %f504, %r446;
mov.b32 %f505, %r445;
mov.b32 %f506, %r444;
mov.b32 %f507, %r439;
mov.b32 %f508, %r438;
mov.b32 %f509, %r437;
mov.b32 %f510, %r436;
mov.b32 %f511, %r431;
mov.b32 %f512, %r430;
mov.b32 %f513, %r428;
mov.b32 %f514, %r429;
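// layer_norm.py:122: xhat = (x - mean) * rstd, computed elementwise for the
// values this thread holds (%f515 appears to be the row mean, %f450 the row
// rstd).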
.loc 1 122 21 // layer_norm.py:122:21
mov.b32 %f515, %r812;
sub.f32 %f516, %f514, %f515;
sub.f32 %f517, %f513, %f515;
sub.f32 %f518, %f512, %f515;
sub.f32 %f519, %f511, %f515;
sub.f32 %f520, %f510, %f515;
sub.f32 %f521, %f509, %f515;
sub.f32 %f522, %f508, %f515;
sub.f32 %f523, %f507, %f515;
sub.f32 %f524, %f506, %f515;
sub.f32 %f525, %f505, %f515;
sub.f32 %f526, %f504, %f515;
sub.f32 %f527, %f503, %f515;
sub.f32 %f528, %f502, %f515;
sub.f32 %f529, %f501, %f515;
sub.f32 %f530, %f500, %f515;
sub.f32 %f531, %f499, %f515;
sub.f32 %f532, %f498, %f515;
sub.f32 %f533, %f497, %f515;
sub.f32 %f534, %f496, %f515;
sub.f32 %f535, %f495, %f515;
sub.f32 %f536, %f494, %f515;
sub.f32 %f537, %f493, %f515;
sub.f32 %f538, %f492, %f515;
sub.f32 %f539, %f491, %f515;
sub.f32 %f540, %f490, %f515;
sub.f32 %f541, %f489, %f515;
sub.f32 %f542, %f488, %f515;
sub.f32 %f543, %f487, %f515;
sub.f32 %f544, %f486, %f515;
sub.f32 %f545, %f485, %f515;
sub.f32 %f546, %f484, %f515;
sub.f32 %f547, %f483, %f515;
sub.f32 %f548, %f482, %f515;
sub.f32 %f549, %f481, %f515;
sub.f32 %f550, %f480, %f515;
sub.f32 %f551, %f479, %f515;
sub.f32 %f552, %f478, %f515;
sub.f32 %f553, %f477, %f515;
sub.f32 %f554, %f476, %f515;
sub.f32 %f555, %f475, %f515;
sub.f32 %f556, %f474, %f515;
sub.f32 %f557, %f473, %f515;
sub.f32 %f558, %f472, %f515;
sub.f32 %f559, %f471, %f515;
sub.f32 %f560, %f470, %f515;
sub.f32 %f561, %f469, %f515;
sub.f32 %f562, %f468, %f515;
sub.f32 %f563, %f467, %f515;
sub.f32 %f564, %f466, %f515;
sub.f32 %f565, %f465, %f515;
sub.f32 %f566, %f464, %f515;
sub.f32 %f567, %f463, %f515;
sub.f32 %f568, %f462, %f515;
sub.f32 %f569, %f461, %f515;
sub.f32 %f570, %f460, %f515;
sub.f32 %f571, %f459, %f515;
sub.f32 %f572, %f458, %f515;
sub.f32 %f573, %f457, %f515;
sub.f32 %f574, %f456, %f515;
sub.f32 %f575, %f455, %f515;
sub.f32 %f576, %f454, %f515;
sub.f32 %f577, %f453, %f515;
sub.f32 %f578, %f452, %f515;
sub.f32 %f579, %f451, %f515;
.loc 1 122 29 // layer_norm.py:122:29
mul.f32 %f580, %f579, %f450;
mul.f32 %f581, %f578, %f450;
mul.f32 %f582, %f577, %f450;
mul.f32 %f583, %f576, %f450;
mul.f32 %f584, %f575, %f450;
mul.f32 %f585, %f574, %f450;
mul.f32 %f586, %f573, %f450;
mul.f32 %f587, %f572, %f450;
mul.f32 %f588, %f571, %f450;
mul.f32 %f589, %f570, %f450;
mul.f32 %f590, %f569, %f450;
mul.f32 %f591, %f568, %f450;
mul.f32 %f592, %f567, %f450;
mul.f32 %f593, %f566, %f450;
mul.f32 %f594, %f565, %f450;
mul.f32 %f595, %f564, %f450;
mul.f32 %f596, %f563, %f450;
mul.f32 %f597, %f562, %f450;
mul.f32 %f598, %f561, %f450;
mul.f32 %f599, %f560, %f450;
mul.f32 %f600, %f559, %f450;
mul.f32 %f601, %f558, %f450;
mul.f32 %f602, %f557, %f450;
mul.f32 %f603, %f556, %f450;
mul.f32 %f604, %f555, %f450;
mul.f32 %f605, %f554, %f450;
mul.f32 %f606, %f553, %f450;
mul.f32 %f607, %f552, %f450;
mul.f32 %f608, %f551, %f450;
mul.f32 %f609, %f550, %f450;
mul.f32 %f610, %f549, %f450;
mul.f32 %f611, %f548, %f450;
mul.f32 %f612, %f547, %f450;
mul.f32 %f613, %f546, %f450;
mul.f32 %f614, %f545, %f450;
mul.f32 %f615, %f544, %f450;
mul.f32 %f616, %f543, %f450;
mul.f32 %f617, %f542, %f450;
mul.f32 %f618, %f541, %f450;
mul.f32 %f619, %f540, %f450;
mul.f32 %f620, %f539, %f450;
mul.f32 %f621, %f538, %f450;
mul.f32 %f622, %f537, %f450;
mul.f32 %f623, %f536, %f450;
mul.f32 %f624, %f535, %f450;
mul.f32 %f625, %f534, %f450;
mul.f32 %f626, %f533, %f450;
mul.f32 %f627, %f532, %f450;
mul.f32 %f628, %f531, %f450;
mul.f32 %f629, %f530, %f450;
mul.f32 %f630, %f529, %f450;
mul.f32 %f631, %f528, %f450;
mul.f32 %f632, %f527, %f450;
mul.f32 %f633, %f526, %f450;
mul.f32 %f634, %f525, %f450;
mul.f32 %f635, %f524, %f450;
mul.f32 %f636, %f523, %f450;
mul.f32 %f637, %f522, %f450;
mul.f32 %f638, %f521, %f450;
mul.f32 %f639, %f520, %f450;
mul.f32 %f640, %f519, %f450;
mul.f32 %f641, %f518, %f450;
mul.f32 %f642, %f517, %f450;
mul.f32 %f643, %f516, %f450;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f644, %r684;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f645, %f386, %f644;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f646, %r685;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f647, %f387, %f646;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f648, %r686;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f649, %f388, %f648;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f650, %r687;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f651, %f389, %f650;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f652, %r692;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f653, %f390, %f652;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f654, %r693;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f655, %f391, %f654;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f656, %r694;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f657, %f392, %f656;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f658, %r695;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f659, %f393, %f658;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f660, %r700;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f661, %f394, %f660;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f662, %r701;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f663, %f395, %f662;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f664, %r702;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f665, %f396, %f664;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f666, %r703;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f667, %f397, %f666;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f668, %r708;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f669, %f398, %f668;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f670, %r709;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f671, %f399, %f670;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f672, %r710;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f673, %f400, %f672;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f674, %r711;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f675, %f401, %f674;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f676, %r716;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f677, %f402, %f676;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f678, %r717;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f679, %f403, %f678;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f680, %r718;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f681, %f404, %f680;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f682, %r719;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f683, %f405, %f682;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f684, %r724;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f685, %f406, %f684;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f686, %r725;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f687, %f407, %f686;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f688, %r726;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f689, %f408, %f688;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f690, %r727;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f691, %f409, %f690;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f692, %r732;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f693, %f410, %f692;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f694, %r733;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f695, %f411, %f694;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f696, %r734;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f697, %f412, %f696;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f698, %r735;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f699, %f413, %f698;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f700, %r740;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f701, %f414, %f700;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f702, %r741;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f703, %f415, %f702;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f704, %r742;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f705, %f416, %f704;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f706, %r743;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f707, %f417, %f706;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f708, %r748;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f709, %f418, %f708;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f710, %r749;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f711, %f419, %f710;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f712, %r750;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f713, %f420, %f712;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f714, %r751;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f715, %f421, %f714;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f716, %r756;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f717, %f422, %f716;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f718, %r757;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f719, %f423, %f718;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f720, %r758;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f721, %f424, %f720;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f722, %r759;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f723, %f425, %f722;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f724, %r764;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f725, %f426, %f724;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f726, %r765;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f727, %f427, %f726;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f728, %r766;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f729, %f428, %f728;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f730, %r767;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f731, %f429, %f730;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f732, %r772;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f733, %f430, %f732;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f734, %r773;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f735, %f431, %f734;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f736, %r774;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f737, %f432, %f736;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f738, %r775;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f739, %f433, %f738;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f740, %r780;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f741, %f434, %f740;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f742, %r781;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f743, %f435, %f742;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f744, %r782;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f745, %f436, %f744;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f746, %r783;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f747, %f437, %f746;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f748, %r788;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f749, %f438, %f748;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f750, %r789;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f751, %f439, %f750;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f752, %r790;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f753, %f440, %f752;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f754, %r791;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f755, %f441, %f754;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f756, %r796;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f757, %f442, %f756;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f758, %r797;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f759, %f443, %f758;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f760, %r798;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f761, %f444, %f760;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f762, %r799;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f763, %f445, %f762;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f764, %r804;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f765, %f446, %f764;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f766, %r805;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f767, %f447, %f766;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f768, %r806;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f769, %f448, %f768;
.loc 1 118 21 // layer_norm.py:118:21
mov.b32 %f770, %r807;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %f771, %f449, %f770;
.loc 1 124 28 // layer_norm.py:124:28
mul.f32 %f772, %f647, %f643;
$L__tmp3:
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
fma.rn.f32 %f773, %f645, %f642, %f772;
fma.rn.f32 %f774, %f649, %f641, %f773;
fma.rn.f32 %f775, %f651, %f640, %f774;
fma.rn.f32 %f776, %f653, %f639, %f775;
fma.rn.f32 %f777, %f655, %f638, %f776;
fma.rn.f32 %f778, %f657, %f637, %f777;
fma.rn.f32 %f779, %f659, %f636, %f778;
fma.rn.f32 %f780, %f661, %f635, %f779;
fma.rn.f32 %f781, %f663, %f634, %f780;
fma.rn.f32 %f782, %f665, %f633, %f781;
fma.rn.f32 %f783, %f667, %f632, %f782;
fma.rn.f32 %f784, %f669, %f631, %f783;
fma.rn.f32 %f785, %f671, %f630, %f784;
fma.rn.f32 %f786, %f673, %f629, %f785;
fma.rn.f32 %f787, %f675, %f628, %f786;
fma.rn.f32 %f788, %f677, %f627, %f787;
fma.rn.f32 %f789, %f679, %f626, %f788;
fma.rn.f32 %f790, %f681, %f625, %f789;
fma.rn.f32 %f791, %f683, %f624, %f790;
fma.rn.f32 %f792, %f685, %f623, %f791;
fma.rn.f32 %f793, %f687, %f622, %f792;
fma.rn.f32 %f794, %f689, %f621, %f793;
fma.rn.f32 %f795, %f691, %f620, %f794;
fma.rn.f32 %f796, %f693, %f619, %f795;
fma.rn.f32 %f797, %f695, %f618, %f796;
fma.rn.f32 %f798, %f697, %f617, %f797;
fma.rn.f32 %f799, %f699, %f616, %f798;
fma.rn.f32 %f800, %f701, %f615, %f799;
fma.rn.f32 %f801, %f703, %f614, %f800;
fma.rn.f32 %f802, %f705, %f613, %f801;
fma.rn.f32 %f803, %f707, %f612, %f802;
fma.rn.f32 %f804, %f709, %f611, %f803;
fma.rn.f32 %f805, %f711, %f610, %f804;
fma.rn.f32 %f806, %f713, %f609, %f805;
fma.rn.f32 %f807, %f715, %f608, %f806;
fma.rn.f32 %f808, %f717, %f607, %f807;
fma.rn.f32 %f809, %f719, %f606, %f808;
fma.rn.f32 %f810, %f721, %f605, %f809;
fma.rn.f32 %f811, %f723, %f604, %f810;
fma.rn.f32 %f812, %f725, %f603, %f811;
fma.rn.f32 %f813, %f727, %f602, %f812;
fma.rn.f32 %f814, %f729, %f601, %f813;
fma.rn.f32 %f815, %f731, %f600, %f814;
fma.rn.f32 %f816, %f733, %f599, %f815;
fma.rn.f32 %f817, %f735, %f598, %f816;
fma.rn.f32 %f818, %f737, %f597, %f817;
fma.rn.f32 %f819, %f739, %f596, %f818;
fma.rn.f32 %f820, %f741, %f595, %f819;
fma.rn.f32 %f821, %f743, %f594, %f820;
fma.rn.f32 %f822, %f745, %f593, %f821;
fma.rn.f32 %f823, %f747, %f592, %f822;
fma.rn.f32 %f824, %f749, %f591, %f823;
fma.rn.f32 %f825, %f751, %f590, %f824;
fma.rn.f32 %f826, %f753, %f589, %f825;
fma.rn.f32 %f827, %f755, %f588, %f826;
fma.rn.f32 %f828, %f757, %f587, %f827;
fma.rn.f32 %f829, %f759, %f586, %f828;
fma.rn.f32 %f830, %f761, %f585, %f829;
fma.rn.f32 %f831, %f763, %f584, %f830;
fma.rn.f32 %f832, %f765, %f583, %f831;
fma.rn.f32 %f833, %f767, %f582, %f832;
fma.rn.f32 %f834, %f769, %f581, %f833;
fma.rn.f32 %f835, %f771, %f580, %f834;
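// standard.py:290 (tl.sum): intra-warp butterfly reduction via
// shfl.sync.bfly, then a cross-warp reduction through shared memory
// (global_smem) guarded by bar.sync barriers.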
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r893, %f835;
shfl.sync.bfly.b32 %r894, %r893, 16, 31, -1;
mov.b32 %f836, %r894;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f837, %f835, %f836;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r895, %f837;
shfl.sync.bfly.b32 %r896, %r895, 8, 31, -1;
mov.b32 %f838, %r896;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f839, %f837, %f838;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r897, %f839;
shfl.sync.bfly.b32 %r898, %r897, 4, 31, -1;
mov.b32 %f840, %r898;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f841, %f839, %f840;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r899, %f841;
shfl.sync.bfly.b32 %r900, %r899, 2, 31, -1;
mov.b32 %f842, %r900;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f843, %f841, %f842;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r901, %f843;
shfl.sync.bfly.b32 %r902, %r901, 1, 31, -1;
mov.b32 %f844, %r902;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f845, %f843, %f844;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r815, %f845;
// begin inline asm
@%p50 st.shared.b32 [ %r814 + 0 ], %r815;
// end inline asm
bar.sync 0;
// begin inline asm
@%p51 ld.shared.b32 %r816, [ %r823 + 0 ];
// end inline asm
mov.b32 %f846, %r816;
shfl.sync.bfly.b32 %r903, %r816, 2, 31, -1;
mov.b32 %f847, %r903;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f848, %f846, %f847;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r904, %f848;
shfl.sync.bfly.b32 %r905, %r904, 1, 31, -1;
mov.b32 %f849, %r905;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %f850, %f848, %f849;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
mov.b32 %r819, %f850;
// begin inline asm
@%p52 st.shared.b32 [ %r823 + 0 ], %r819;
// end inline asm
bar.sync 0;
ld.shared.b32 %f851, [global_smem];
$L__tmp4:
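// layer_norm.py:124: divide the reduced row sum by %f1 = float(param_14),
// i.e. by N; %f852 appears to be the first correction term,
// sum(xhat * wdy) / N.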
.loc 1 124 43 // layer_norm.py:124:43
div.full.f32 %f852, %f851, %f1;
$L__tmp5:
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
bar.sync 0;
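// layer_norm.py:125: second row reduction, which appears to sum wdy = w * dy
// using the same butterfly + shared-memory pattern.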
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
fma.rn.f32 %f853, %f386, %f644, %f647;
fma.rn.f32 %f854, %f388, %f648, %f853;
fma.rn.f32 %f855, %f389, %f650, %f854;
fma.rn.f32 %f856, %f390, %f652, %f855;
fma.rn.f32 %f857, %f391, %f654, %f856;
fma.rn.f32 %f858, %f392, %f656, %f857;
fma.rn.f32 %f859, %f393, %f658, %f858;
fma.rn.f32 %f860, %f394, %f660, %f859;
fma.rn.f32 %f861, %f395, %f662, %f860;
fma.rn.f32 %f862, %f396, %f664, %f861;
fma.rn.f32 %f863, %f397, %f666, %f862;
fma.rn.f32 %f864, %f398, %f668, %f863;
fma.rn.f32 %f865, %f399, %f670, %f864;
fma.rn.f32 %f866, %f400, %f672, %f865;
fma.rn.f32 %f867, %f401, %f674, %f866;
fma.rn.f32 %f868, %f402, %f676, %f867;
fma.rn.f32 %f869, %f403, %f678, %f868;
fma.rn.f32 %f870, %f404, %f680, %f869;
fma.rn.f32 %f871, %f405, %f682, %f870;
fma.rn.f32 %f872, %f406, %f684, %f871;
fma.rn.f32 %f873, %f407, %f686, %f872;
fma.rn.f32 %f874, %f408, %f688, %f873;
fma.rn.f32 %f875, %f409, %f690, %f874;
fma.rn.f32 %f876, %f410, %f692, %f875;
fma.rn.f32 %f877, %f411, %f694, %f876;
fma.rn.f32 %f878, %f412, %f696, %f877;
fma.rn.f32 %f879, %f413, %f698, %f878;
fma.rn.f32 %f880, %f414, %f700, %f879;
fma.rn.f32 %f881, %f415, %f702, %f880;
fma.rn.f32 %f882, %f416, %f704, %f881;
fma.rn.f32 %f883, %f417, %f706, %f882;
fma.rn.f32 %f884, %f418, %f708, %f883;
fma.rn.f32 %f885, %f419, %f710, %f884;
fma.rn.f32 %f886, %f420, %f712, %f885;
fma.rn.f32 %f887, %f421, %f714, %f886;
fma.rn.f32 %f888, %f422, %f716, %f887;
fma.rn.f32 %f889, %f423, %f718, %f888;
fma.rn.f32 %f890, %f424, %f720, %f889;
fma.rn.f32 %f891, %f425, %f722, %f890;
fma.rn.f32 %f892, %f426, %f724, %f891;
fma.rn.f32 %f893, %f427, %f726, %f892;
fma.rn.f32 %f894, %f428, %f728, %f893;
fma.rn.f32 %f895, %f429, %f730, %f894;
fma.rn.f32 %f896, %f430, %f732, %f895;
fma.rn.f32 %f897, %f431, %f734, %f896;
fma.rn.f32 %f898, %f432, %f736, %f897;
fma.rn.f32 %f899, %f433, %f738, %f898;
fma.rn.f32 %f900, %f434, %f740, %f899;
fma.rn.f32 %f901, %f435, %f742, %f900;
fma.rn.f32 %f902, %f436, %f744, %f901;
fma.rn.f32 %f903, %f437, %f746, %f902;
fma.rn.f32 %f904, %f438, %f748, %f903;
fma.rn.f32 %f905, %f439, %f750, %f904;
fma.rn.f32 %f906, %f440, %f752, %f905;
fma.rn.f32 %f907, %f441, %f754, %f906;
fma.rn.f32 %f908, %f442, %f756, %f907;
fma.rn.f32 %f909, %f443, %f758, %f908;
fma.rn.f32 %f910, %f444, %f760, %f909;
fma.rn.f32 %f911, %f445, %f762, %f910;
fma.rn.f32 %f912, %f446, %f764, %f911;
fma.rn.f32 %f913, %f447, %f766, %f912;
fma.rn.f32 %f914, %f448, %f768, %f913;
fma.rn.f32 %f915, %f449, %f770, %f914;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r906, %f915;
shfl.sync.bfly.b32 %r907, %r906, 16, 31, -1;
mov.b32 %f916, %r907;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f917, %f915, %f916;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r908, %f917;
shfl.sync.bfly.b32 %r909, %r908, 8, 31, -1;
mov.b32 %f918, %r909;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f919, %f917, %f918;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r910, %f919;
shfl.sync.bfly.b32 %r911, %r910, 4, 31, -1;
mov.b32 %f920, %r911;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f921, %f919, %f920;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r912, %f921;
shfl.sync.bfly.b32 %r913, %r912, 2, 31, -1;
mov.b32 %f922, %r913;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f923, %f921, %f922;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r914, %f923;
shfl.sync.bfly.b32 %r915, %r914, 1, 31, -1;
mov.b32 %f924, %r915;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f925, %f923, %f924;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r821, %f925;
// begin inline asm
@%p50 st.shared.b32 [ %r814 + 0 ], %r821;
// end inline asm
bar.sync 0;
// begin inline asm
@%p51 ld.shared.b32 %r822, [ %r823 + 0 ];
// end inline asm
mov.b32 %f926, %r822;
shfl.sync.bfly.b32 %r916, %r822, 2, 31, -1;
mov.b32 %f927, %r916;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f928, %f926, %f927;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r917, %f928;
shfl.sync.bfly.b32 %r918, %r917, 1, 31, -1;
mov.b32 %f929, %r918;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %f930, %f928, %f929;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
mov.b32 %r825, %f930;
// begin inline asm
@%p52 st.shared.b32 [ %r823 + 0 ], %r825;
// end inline asm
bar.sync 0;
ld.shared.b32 %f931, [global_smem];
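// layer_norm.py:125-126: the second reduced sum is also divided by N
// (%f932); the fma/neg/mul sequences below then appear to compute
// dx = (wdy - (xhat * %f852 + %f932)) * rstd.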
$L__tmp6:
.loc 1 125 35 // layer_norm.py:125:35
div.full.f32 %f932, %f931, %f1;
.loc 1 126 34 // layer_norm.py:126:34
fma.rn.f32 %f933, %f642, %f852, %f932;
fma.rn.f32 %f934, %f643, %f852, %f932;
fma.rn.f32 %f935, %f641, %f852, %f932;
fma.rn.f32 %f936, %f640, %f852, %f932;
fma.rn.f32 %f937, %f639, %f852, %f932;
fma.rn.f32 %f938, %f638, %f852, %f932;
fma.rn.f32 %f939, %f637, %f852, %f932;
fma.rn.f32 %f940, %f636, %f852, %f932;
fma.rn.f32 %f941, %f635, %f852, %f932;
fma.rn.f32 %f942, %f634, %f852, %f932;
fma.rn.f32 %f943, %f633, %f852, %f932;
fma.rn.f32 %f944, %f632, %f852, %f932;
fma.rn.f32 %f945, %f631, %f852, %f932;
fma.rn.f32 %f946, %f630, %f852, %f932;
fma.rn.f32 %f947, %f629, %f852, %f932;
fma.rn.f32 %f948, %f628, %f852, %f932;
fma.rn.f32 %f949, %f627, %f852, %f932;
fma.rn.f32 %f950, %f626, %f852, %f932;
fma.rn.f32 %f951, %f625, %f852, %f932;
fma.rn.f32 %f952, %f624, %f852, %f932;
fma.rn.f32 %f953, %f623, %f852, %f932;
fma.rn.f32 %f954, %f622, %f852, %f932;
fma.rn.f32 %f955, %f621, %f852, %f932;
fma.rn.f32 %f956, %f620, %f852, %f932;
fma.rn.f32 %f957, %f619, %f852, %f932;
fma.rn.f32 %f958, %f618, %f852, %f932;
fma.rn.f32 %f959, %f617, %f852, %f932;
fma.rn.f32 %f960, %f616, %f852, %f932;
fma.rn.f32 %f961, %f615, %f852, %f932;
fma.rn.f32 %f962, %f614, %f852, %f932;
fma.rn.f32 %f963, %f613, %f852, %f932;
fma.rn.f32 %f964, %f612, %f852, %f932;
fma.rn.f32 %f965, %f611, %f852, %f932;
fma.rn.f32 %f966, %f610, %f852, %f932;
fma.rn.f32 %f967, %f609, %f852, %f932;
fma.rn.f32 %f968, %f608, %f852, %f932;
fma.rn.f32 %f969, %f607, %f852, %f932;
fma.rn.f32 %f970, %f606, %f852, %f932;
fma.rn.f32 %f971, %f605, %f852, %f932;
fma.rn.f32 %f972, %f604, %f852, %f932;
fma.rn.f32 %f973, %f603, %f852, %f932;
fma.rn.f32 %f974, %f602, %f852, %f932;
fma.rn.f32 %f975, %f601, %f852, %f932;
fma.rn.f32 %f976, %f600, %f852, %f932;
fma.rn.f32 %f977, %f599, %f852, %f932;
fma.rn.f32 %f978, %f598, %f852, %f932;
fma.rn.f32 %f979, %f597, %f852, %f932;
fma.rn.f32 %f980, %f596, %f852, %f932;
fma.rn.f32 %f981, %f595, %f852, %f932;
fma.rn.f32 %f982, %f594, %f852, %f932;
fma.rn.f32 %f983, %f593, %f852, %f932;
fma.rn.f32 %f984, %f592, %f852, %f932;
fma.rn.f32 %f985, %f591, %f852, %f932;
fma.rn.f32 %f986, %f590, %f852, %f932;
fma.rn.f32 %f987, %f589, %f852, %f932;
fma.rn.f32 %f988, %f588, %f852, %f932;
fma.rn.f32 %f989, %f587, %f852, %f932;
fma.rn.f32 %f990, %f586, %f852, %f932;
fma.rn.f32 %f991, %f585, %f852, %f932;
fma.rn.f32 %f992, %f584, %f852, %f932;
fma.rn.f32 %f993, %f583, %f852, %f932;
fma.rn.f32 %f994, %f582, %f852, %f932;
fma.rn.f32 %f995, %f581, %f852, %f932;
fma.rn.f32 %f996, %f580, %f852, %f932;
.loc 1 126 21 // layer_norm.py:126:21
neg.f32 %f997, %f933;
fma.rn.f32 %f998, %f386, %f644, %f997;
neg.f32 %f999, %f934;
fma.rn.f32 %f1000, %f387, %f646, %f999;
neg.f32 %f1001, %f935;
fma.rn.f32 %f1002, %f388, %f648, %f1001;
neg.f32 %f1003, %f936;
fma.rn.f32 %f1004, %f389, %f650, %f1003;
neg.f32 %f1005, %f937;
fma.rn.f32 %f1006, %f390, %f652, %f1005;
neg.f32 %f1007, %f938;
fma.rn.f32 %f1008, %f391, %f654, %f1007;
neg.f32 %f1009, %f939;
fma.rn.f32 %f1010, %f392, %f656, %f1009;
neg.f32 %f1011, %f940;
fma.rn.f32 %f1012, %f393, %f658, %f1011;
neg.f32 %f1013, %f941;
fma.rn.f32 %f1014, %f394, %f660, %f1013;
neg.f32 %f1015, %f942;
fma.rn.f32 %f1016, %f395, %f662, %f1015;
neg.f32 %f1017, %f943;
fma.rn.f32 %f1018, %f396, %f664, %f1017;
neg.f32 %f1019, %f944;
fma.rn.f32 %f1020, %f397, %f666, %f1019;
neg.f32 %f1021, %f945;
fma.rn.f32 %f1022, %f398, %f668, %f1021;
neg.f32 %f1023, %f946;
fma.rn.f32 %f1024, %f399, %f670, %f1023;
neg.f32 %f1025, %f947;
fma.rn.f32 %f1026, %f400, %f672, %f1025;
neg.f32 %f1027, %f948;
fma.rn.f32 %f1028, %f401, %f674, %f1027;
neg.f32 %f1029, %f949;
fma.rn.f32 %f1030, %f402, %f676, %f1029;
neg.f32 %f1031, %f950;
fma.rn.f32 %f1032, %f403, %f678, %f1031;
neg.f32 %f1033, %f951;
fma.rn.f32 %f1034, %f404, %f680, %f1033;
neg.f32 %f1035, %f952;
fma.rn.f32 %f1036, %f405, %f682, %f1035;
neg.f32 %f1037, %f953;
fma.rn.f32 %f1038, %f406, %f684, %f1037;
neg.f32 %f1039, %f954;
fma.rn.f32 %f1040, %f407, %f686, %f1039;
neg.f32 %f1041, %f955;
fma.rn.f32 %f1042, %f408, %f688, %f1041;
neg.f32 %f1043, %f956;
fma.rn.f32 %f1044, %f409, %f690, %f1043;
neg.f32 %f1045, %f957;
fma.rn.f32 %f1046, %f410, %f692, %f1045;
neg.f32 %f1047, %f958;
fma.rn.f32 %f1048, %f411, %f694, %f1047;
neg.f32 %f1049, %f959;
fma.rn.f32 %f1050, %f412, %f696, %f1049;
neg.f32 %f1051, %f960;
fma.rn.f32 %f1052, %f413, %f698, %f1051;
neg.f32 %f1053, %f961;
fma.rn.f32 %f1054, %f414, %f700, %f1053;
neg.f32 %f1055, %f962;
fma.rn.f32 %f1056, %f415, %f702, %f1055;
neg.f32 %f1057, %f963;
fma.rn.f32 %f1058, %f416, %f704, %f1057;
neg.f32 %f1059, %f964;
fma.rn.f32 %f1060, %f417, %f706, %f1059;
neg.f32 %f1061, %f965;
fma.rn.f32 %f1062, %f418, %f708, %f1061;
neg.f32 %f1063, %f966;
fma.rn.f32 %f1064, %f419, %f710, %f1063;
neg.f32 %f1065, %f967;
fma.rn.f32 %f1066, %f420, %f712, %f1065;
neg.f32 %f1067, %f968;
fma.rn.f32 %f1068, %f421, %f714, %f1067;
neg.f32 %f1069, %f969;
fma.rn.f32 %f1070, %f422, %f716, %f1069;
neg.f32 %f1071, %f970;
fma.rn.f32 %f1072, %f423, %f718, %f1071;
neg.f32 %f1073, %f971;
fma.rn.f32 %f1074, %f424, %f720, %f1073;
neg.f32 %f1075, %f972;
fma.rn.f32 %f1076, %f425, %f722, %f1075;
neg.f32 %f1077, %f973;
fma.rn.f32 %f1078, %f426, %f724, %f1077;
neg.f32 %f1079, %f974;
fma.rn.f32 %f1080, %f427, %f726, %f1079;
neg.f32 %f1081, %f975;
fma.rn.f32 %f1082, %f428, %f728, %f1081;
neg.f32 %f1083, %f976;
fma.rn.f32 %f1084, %f429, %f730, %f1083;
neg.f32 %f1085, %f977;
fma.rn.f32 %f1086, %f430, %f732, %f1085;
neg.f32 %f1087, %f978;
fma.rn.f32 %f1088, %f431, %f734, %f1087;
neg.f32 %f1089, %f979;
fma.rn.f32 %f1090, %f432, %f736, %f1089;
neg.f32 %f1091, %f980;
fma.rn.f32 %f1092, %f433, %f738, %f1091;
neg.f32 %f1093, %f981;
fma.rn.f32 %f1094, %f434, %f740, %f1093;
neg.f32 %f1095, %f982;
fma.rn.f32 %f1096, %f435, %f742, %f1095;
neg.f32 %f1097, %f983;
fma.rn.f32 %f1098, %f436, %f744, %f1097;
neg.f32 %f1099, %f984;
fma.rn.f32 %f1100, %f437, %f746, %f1099;
neg.f32 %f1101, %f985;
fma.rn.f32 %f1102, %f438, %f748, %f1101;
neg.f32 %f1103, %f986;
fma.rn.f32 %f1104, %f439, %f750, %f1103;
neg.f32 %f1105, %f987;
fma.rn.f32 %f1106, %f440, %f752, %f1105;
neg.f32 %f1107, %f988;
fma.rn.f32 %f1108, %f441, %f754, %f1107;
neg.f32 %f1109, %f989;
fma.rn.f32 %f1110, %f442, %f756, %f1109;
neg.f32 %f1111, %f990;
fma.rn.f32 %f1112, %f443, %f758, %f1111;
neg.f32 %f1113, %f991;
fma.rn.f32 %f1114, %f444, %f760, %f1113;
neg.f32 %f1115, %f992;
fma.rn.f32 %f1116, %f445, %f762, %f1115;
neg.f32 %f1117, %f993;
fma.rn.f32 %f1118, %f446, %f764, %f1117;
neg.f32 %f1119, %f994;
fma.rn.f32 %f1120, %f447, %f766, %f1119;
neg.f32 %f1121, %f995;
fma.rn.f32 %f1122, %f448, %f768, %f1121;
neg.f32 %f1123, %f996;
fma.rn.f32 %f1124, %f449, %f770, %f1123;
.loc 1 126 41 // layer_norm.py:126:41
mul.f32 %f1125, %f998, %f450;
mul.f32 %f1126, %f1000, %f450;
mul.f32 %f1127, %f1002, %f450;
mul.f32 %f1128, %f1004, %f450;
mul.f32 %f1129, %f1006, %f450;
mul.f32 %f1130, %f1008, %f450;
mul.f32 %f1131, %f1010, %f450;
mul.f32 %f1132, %f1012, %f450;
mul.f32 %f1133, %f1014, %f450;
mul.f32 %f1134, %f1016, %f450;
mul.f32 %f1135, %f1018, %f450;
mul.f32 %f1136, %f1020, %f450;
mul.f32 %f1137, %f1022, %f450;
mul.f32 %f1138, %f1024, %f450;
mul.f32 %f1139, %f1026, %f450;
mul.f32 %f1140, %f1028, %f450;
mul.f32 %f1141, %f1030, %f450;
mul.f32 %f1142, %f1032, %f450;
mul.f32 %f1143, %f1034, %f450;
mul.f32 %f1144, %f1036, %f450;
mul.f32 %f1145, %f1038, %f450;
mul.f32 %f1146, %f1040, %f450;
mul.f32 %f1147, %f1042, %f450;
mul.f32 %f1148, %f1044, %f450;
mul.f32 %f1149, %f1046, %f450;
mul.f32 %f1150, %f1048, %f450;
mul.f32 %f1151, %f1050, %f450;
mul.f32 %f1152, %f1052, %f450;
mul.f32 %f1153, %f1054, %f450;
mul.f32 %f1154, %f1056, %f450;
mul.f32 %f1155, %f1058, %f450;
mul.f32 %f1156, %f1060, %f450;
mul.f32 %f1157, %f1062, %f450;
mul.f32 %f1158, %f1064, %f450;
mul.f32 %f1159, %f1066, %f450;
mul.f32 %f1160, %f1068, %f450;
mul.f32 %f1161, %f1070, %f450;
mul.f32 %f1162, %f1072, %f450;
mul.f32 %f1163, %f1074, %f450;
mul.f32 %f1164, %f1076, %f450;
mul.f32 %f1165, %f1078, %f450;
mul.f32 %f1166, %f1080, %f450;
mul.f32 %f1167, %f1082, %f450;
mul.f32 %f1168, %f1084, %f450;
mul.f32 %f1169, %f1086, %f450;
mul.f32 %f1170, %f1088, %f450;
mul.f32 %f1171, %f1090, %f450;
mul.f32 %f1172, %f1092, %f450;
mul.f32 %f1173, %f1094, %f450;
mul.f32 %f1174, %f1096, %f450;
mul.f32 %f1175, %f1098, %f450;
mul.f32 %f1176, %f1100, %f450;
mul.f32 %f1177, %f1102, %f450;
mul.f32 %f1178, %f1104, %f450;
mul.f32 %f1179, %f1106, %f450;
mul.f32 %f1180, %f1108, %f450;
mul.f32 %f1181, %f1110, %f450;
mul.f32 %f1182, %f1112, %f450;
mul.f32 %f1183, %f1114, %f450;
mul.f32 %f1184, %f1116, %f450;
mul.f32 %f1185, %f1118, %f450;
mul.f32 %f1186, %f1120, %f450;
mul.f32 %f1187, %f1122, %f450;
mul.f32 %f1188, %f1124, %f450;
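// layer_norm.py:127: predicated 128-bit vector stores of the computed dX
// values back to global memory, using the same column masks as the loads.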
.loc 1 127 32 // layer_norm.py:127:32
mov.b32 %r826, %f1125;
mov.b32 %r827, %f1126;
mov.b32 %r828, %f1127;
mov.b32 %r829, %f1128;
// begin inline asm
@%p2 st.global.v4.b32 [ %rd106 + 0 ], { %r826, %r827, %r828, %r829 };
// end inline asm
mov.b32 %r830, %f1129;
mov.b32 %r831, %f1130;
mov.b32 %r832, %f1131;
mov.b32 %r833, %f1132;
// begin inline asm
@%p74 st.global.v4.b32 [ %rd107 + 0 ], { %r830, %r831, %r832, %r833 };
// end inline asm
mov.b32 %r834, %f1133;
mov.b32 %r835, %f1134;
mov.b32 %r836, %f1135;
mov.b32 %r837, %f1136;
// begin inline asm
@%p75 st.global.v4.b32 [ %rd108 + 0 ], { %r834, %r835, %r836, %r837 };
// end inline asm
mov.b32 %r838, %f1137;
mov.b32 %r839, %f1138;
mov.b32 %r840, %f1139;
mov.b32 %r841, %f1140;
// begin inline asm
@%p76 st.global.v4.b32 [ %rd109 + 0 ], { %r838, %r839, %r840, %r841 };
// end inline asm
mov.b32 %r842, %f1141;
mov.b32 %r843, %f1142;
mov.b32 %r844, %f1143;
mov.b32 %r845, %f1144;
// begin inline asm
@%p77 st.global.v4.b32 [ %rd110 + 0 ], { %r842, %r843, %r844, %r845 };
// end inline asm
mov.b32 %r846, %f1145;
mov.b32 %r847, %f1146;
mov.b32 %r848, %f1147;
mov.b32 %r849, %f1148;
// begin inline asm
@%p78 st.global.v4.b32 [ %rd111 + 0 ], { %r846, %r847, %r848, %r849 };
// end inline asm
mov.b32 %r850, %f1149;
mov.b32 %r851, %f1150;
mov.b32 %r852, %f1151;
mov.b32 %r853, %f1152;
// begin inline asm
@%p79 st.global.v4.b32 [ %rd112 + 0 ], { %r850, %r851, %r852, %r853 };
// end inline asm
mov.b32 %r854, %f1153;
mov.b32 %r855, %f1154;
mov.b32 %r856, %f1155;
mov.b32 %r857, %f1156;
// begin inline asm
@%p9 st.global.v4.b32 [ %rd113 + 0 ], { %r854, %r855, %r856, %r857 };
// end inline asm
mov.b32 %r858, %f1157;
mov.b32 %r859, %f1158;
mov.b32 %r860, %f1159;
mov.b32 %r861, %f1160;
// begin inline asm
@%p81 st.global.v4.b32 [ %rd114 + 0 ], { %r858, %r859, %r860, %r861 };
// end inline asm
mov.b32 %r862, %f1161;
mov.b32 %r863, %f1162;
mov.b32 %r864, %f1163;
mov.b32 %r865, %f1164;
// begin inline asm
@%p82 st.global.v4.b32 [ %rd115 + 0 ], { %r862, %r863, %r864, %r865 };
// end inline asm
mov.b32 %r866, %f1165;
mov.b32 %r867, %f1166;
mov.b32 %r868, %f1167;
mov.b32 %r869, %f1168;
// begin inline asm
@%p83 st.global.v4.b32 [ %rd116 + 0 ], { %r866, %r867, %r868, %r869 };
// end inline asm
mov.b32 %r870, %f1169;
mov.b32 %r871, %f1170;
mov.b32 %r872, %f1171;
mov.b32 %r873, %f1172;
// begin inline asm
@%p84 st.global.v4.b32 [ %rd117 + 0 ], { %r870, %r871, %r872, %r873 };
// end inline asm
mov.b32 %r874, %f1173;
mov.b32 %r875, %f1174;
mov.b32 %r876, %f1175;
mov.b32 %r877, %f1176;
// begin inline asm
@%p85 st.global.v4.b32 [ %rd118 + 0 ], { %r874, %r875, %r876, %r877 };
// end inline asm
mov.b32 %r878, %f1177;
mov.b32 %r879, %f1178;
mov.b32 %r880, %f1179;
mov.b32 %r881, %f1180;
// begin inline asm
@%p86 st.global.v4.b32 [ %rd119 + 0 ], { %r878, %r879, %r880, %r881 };
// end inline asm
mov.b32 %r882, %f1181;
mov.b32 %r883, %f1182;
mov.b32 %r884, %f1183;
mov.b32 %r885, %f1184;
// begin inline asm
@%p87 st.global.v4.b32 [ %rd120 + 0 ], { %r882, %r883, %r884, %r885 };
// end inline asm
mov.b32 %r886, %f1185;
mov.b32 %r887, %f1186;
mov.b32 %r888, %f1187;
mov.b32 %r889, %f1188;
// begin inline asm
@%p17 st.global.v4.b32 [ %rd121 + 0 ], { %r886, %r887, %r888, %r889 };
// end inline asm
.loc 1 129 18 // layer_norm.py:129:18
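	// update the loop-carried accumulators: fma chains into %f1189-%f1252 and plain adds into %f1253-%f1316 (layer_norm.py:129)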
fma.rn.f32 %f1243, %f589, %f752, %f1243;
fma.rn.f32 %f1242, %f590, %f750, %f1242;
fma.rn.f32 %f1241, %f591, %f748, %f1241;
fma.rn.f32 %f1240, %f592, %f746, %f1240;
fma.rn.f32 %f1239, %f593, %f744, %f1239;
fma.rn.f32 %f1238, %f594, %f742, %f1238;
fma.rn.f32 %f1237, %f595, %f740, %f1237;
fma.rn.f32 %f1236, %f596, %f738, %f1236;
fma.rn.f32 %f1235, %f597, %f736, %f1235;
fma.rn.f32 %f1234, %f598, %f734, %f1234;
fma.rn.f32 %f1233, %f599, %f732, %f1233;
fma.rn.f32 %f1232, %f600, %f730, %f1232;
fma.rn.f32 %f1231, %f601, %f728, %f1231;
fma.rn.f32 %f1230, %f602, %f726, %f1230;
fma.rn.f32 %f1229, %f603, %f724, %f1229;
fma.rn.f32 %f1228, %f604, %f722, %f1228;
fma.rn.f32 %f1227, %f605, %f720, %f1227;
fma.rn.f32 %f1226, %f606, %f718, %f1226;
fma.rn.f32 %f1225, %f607, %f716, %f1225;
fma.rn.f32 %f1224, %f608, %f714, %f1224;
fma.rn.f32 %f1223, %f609, %f712, %f1223;
fma.rn.f32 %f1222, %f610, %f710, %f1222;
fma.rn.f32 %f1221, %f611, %f708, %f1221;
fma.rn.f32 %f1220, %f612, %f706, %f1220;
fma.rn.f32 %f1219, %f613, %f704, %f1219;
fma.rn.f32 %f1218, %f614, %f702, %f1218;
fma.rn.f32 %f1217, %f615, %f700, %f1217;
fma.rn.f32 %f1216, %f616, %f698, %f1216;
fma.rn.f32 %f1215, %f617, %f696, %f1215;
fma.rn.f32 %f1214, %f618, %f694, %f1214;
fma.rn.f32 %f1213, %f619, %f692, %f1213;
fma.rn.f32 %f1212, %f620, %f690, %f1212;
fma.rn.f32 %f1211, %f621, %f688, %f1211;
fma.rn.f32 %f1210, %f622, %f686, %f1210;
fma.rn.f32 %f1209, %f623, %f684, %f1209;
fma.rn.f32 %f1208, %f624, %f682, %f1208;
fma.rn.f32 %f1207, %f625, %f680, %f1207;
fma.rn.f32 %f1206, %f626, %f678, %f1206;
fma.rn.f32 %f1205, %f627, %f676, %f1205;
fma.rn.f32 %f1204, %f628, %f674, %f1204;
fma.rn.f32 %f1203, %f629, %f672, %f1203;
fma.rn.f32 %f1202, %f630, %f670, %f1202;
fma.rn.f32 %f1201, %f631, %f668, %f1201;
fma.rn.f32 %f1200, %f632, %f666, %f1200;
fma.rn.f32 %f1199, %f633, %f664, %f1199;
fma.rn.f32 %f1198, %f634, %f662, %f1198;
fma.rn.f32 %f1197, %f635, %f660, %f1197;
fma.rn.f32 %f1196, %f636, %f658, %f1196;
fma.rn.f32 %f1195, %f637, %f656, %f1195;
fma.rn.f32 %f1194, %f638, %f654, %f1194;
fma.rn.f32 %f1193, %f639, %f652, %f1193;
fma.rn.f32 %f1192, %f640, %f650, %f1192;
fma.rn.f32 %f1191, %f641, %f648, %f1191;
fma.rn.f32 %f1190, %f643, %f646, %f1190;
fma.rn.f32 %f1189, %f642, %f644, %f1189;
fma.rn.f32 %f1244, %f588, %f754, %f1244;
fma.rn.f32 %f1245, %f587, %f756, %f1245;
fma.rn.f32 %f1246, %f586, %f758, %f1246;
fma.rn.f32 %f1247, %f585, %f760, %f1247;
fma.rn.f32 %f1248, %f584, %f762, %f1248;
fma.rn.f32 %f1249, %f583, %f764, %f1249;
fma.rn.f32 %f1250, %f582, %f766, %f1250;
fma.rn.f32 %f1251, %f581, %f768, %f1251;
fma.rn.f32 %f1252, %f580, %f770, %f1252;
add.f32 %f1253, %f1253, %f644;
add.f32 %f1254, %f1254, %f646;
add.f32 %f1255, %f1255, %f648;
add.f32 %f1256, %f1256, %f650;
add.f32 %f1257, %f1257, %f652;
add.f32 %f1258, %f1258, %f654;
add.f32 %f1259, %f1259, %f656;
add.f32 %f1260, %f1260, %f658;
add.f32 %f1261, %f1261, %f660;
add.f32 %f1262, %f1262, %f662;
add.f32 %f1263, %f1263, %f664;
add.f32 %f1264, %f1264, %f666;
add.f32 %f1265, %f1265, %f668;
add.f32 %f1266, %f1266, %f670;
add.f32 %f1267, %f1267, %f672;
add.f32 %f1268, %f1268, %f674;
add.f32 %f1269, %f1269, %f676;
add.f32 %f1270, %f1270, %f678;
add.f32 %f1271, %f1271, %f680;
add.f32 %f1272, %f1272, %f682;
add.f32 %f1273, %f1273, %f684;
add.f32 %f1274, %f1274, %f686;
add.f32 %f1275, %f1275, %f688;
add.f32 %f1276, %f1276, %f690;
add.f32 %f1277, %f1277, %f692;
add.f32 %f1278, %f1278, %f694;
add.f32 %f1279, %f1279, %f696;
add.f32 %f1280, %f1280, %f698;
add.f32 %f1281, %f1281, %f700;
add.f32 %f1282, %f1282, %f702;
add.f32 %f1283, %f1283, %f704;
add.f32 %f1284, %f1284, %f706;
add.f32 %f1285, %f1285, %f708;
add.f32 %f1286, %f1286, %f710;
add.f32 %f1287, %f1287, %f712;
add.f32 %f1288, %f1288, %f714;
add.f32 %f1289, %f1289, %f716;
add.f32 %f1290, %f1290, %f718;
add.f32 %f1291, %f1291, %f720;
add.f32 %f1292, %f1292, %f722;
add.f32 %f1293, %f1293, %f724;
add.f32 %f1294, %f1294, %f726;
add.f32 %f1295, %f1295, %f728;
add.f32 %f1296, %f1296, %f730;
add.f32 %f1297, %f1297, %f732;
add.f32 %f1298, %f1298, %f734;
add.f32 %f1299, %f1299, %f736;
add.f32 %f1300, %f1300, %f738;
add.f32 %f1301, %f1301, %f740;
add.f32 %f1302, %f1302, %f742;
add.f32 %f1303, %f1303, %f744;
add.f32 %f1304, %f1304, %f746;
add.f32 %f1305, %f1305, %f748;
add.f32 %f1306, %f1306, %f750;
add.f32 %f1307, %f1307, %f752;
add.f32 %f1308, %f1308, %f754;
add.f32 %f1309, %f1309, %f756;
add.f32 %f1310, %f1310, %f758;
add.f32 %f1311, %f1311, %f760;
add.f32 %f1312, %f1312, %f762;
add.f32 %f1313, %f1313, %f764;
add.f32 %f1314, %f1314, %f766;
add.f32 %f1315, %f1315, %f768;
add.f32 %f1316, %f1316, %f770;
.loc 1 133 20 // layer_norm.py:133:20
add.s64 %rd167, %rd167, 4;
.loc 1 134 20 // layer_norm.py:134:20
add.s64 %rd166, %rd166, 4;
.loc 1 115 30 // layer_norm.py:115:30
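	// advance the row pointers by their per-iteration strides, decrement the remaining-iteration counter %r1055, and branch back to $L__BB0_2 while iterations remain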
add.s64 %rd165, %rd165, %rd26;
add.s64 %rd164, %rd164, %rd29;
add.s64 %rd163, %rd163, %rd30;
add.s32 %r1055, %r1055, -1;
setp.ne.s32 %p72, %r1055, 0;
@%p72 bra $L__BB0_2;
// %bb.3: // %._crit_edge.loopexit
.loc 1 138 55 // layer_norm.py:138:55
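	// move the final float accumulators into b32 registers for the column-wise stores after the loop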
mov.b32 %r1056, %f1189;
mov.b32 %r1057, %f1190;
mov.b32 %r1058, %f1191;
mov.b32 %r1059, %f1192;
mov.b32 %r1060, %f1193;
mov.b32 %r1061, %f1194;
mov.b32 %r1062, %f1195;
mov.b32 %r1063, %f1196;
mov.b32 %r1064, %f1197;
mov.b32 %r1065, %f1198;
mov.b32 %r1066, %f1199;
mov.b32 %r1067, %f1200;
mov.b32 %r1068, %f1201;
mov.b32 %r1069, %f1202;
mov.b32 %r1070, %f1203;
mov.b32 %r1071, %f1204;
mov.b32 %r1072, %f1205;
mov.b32 %r1073, %f1206;
mov.b32 %r1074, %f1207;
mov.b32 %r1075, %f1208;
mov.b32 %r1076, %f1209;
mov.b32 %r1077, %f1210;
mov.b32 %r1078, %f1211;
mov.b32 %r1079, %f1212;
mov.b32 %r1080, %f1213;
mov.b32 %r1081, %f1214;
mov.b32 %r1082, %f1215;
mov.b32 %r1083, %f1216;
mov.b32 %r1084, %f1217;
mov.b32 %r1085, %f1218;
mov.b32 %r1086, %f1219;
mov.b32 %r1087, %f1220;
mov.b32 %r1088, %f1221;
mov.b32 %r1089, %f1222;
mov.b32 %r1090, %f1223;
mov.b32 %r1091, %f1224;
mov.b32 %r1092, %f1225;
mov.b32 %r1093, %f1226;
mov.b32 %r1094, %f1227;
mov.b32 %r1095, %f1228;
mov.b32 %r1096, %f1229;
mov.b32 %r1097, %f1230;
mov.b32 %r1098, %f1231;
mov.b32 %r1099, %f1232;
mov.b32 %r1100, %f1233;
mov.b32 %r1101, %f1234;
mov.b32 %r1102, %f1235;
mov.b32 %r1103, %f1236;
mov.b32 %r1104, %f1237;
mov.b32 %r1105, %f1238;
mov.b32 %r1106, %f1239;
mov.b32 %r1107, %f1240;
mov.b32 %r1108, %f1241;
mov.b32 %r1109, %f1242;
mov.b32 %r1110, %f1243;
mov.b32 %r1111, %f1244;
mov.b32 %r1112, %f1245;
mov.b32 %r1113, %f1246;
mov.b32 %r1114, %f1247;
mov.b32 %r1115, %f1248;
mov.b32 %r1116, %f1249;
mov.b32 %r1117, %f1250;
mov.b32 %r1118, %f1251;
mov.b32 %r1119, %f1252;
mov.b32 %r1120, %f1253;
mov.b32 %r1121, %f1254;
mov.b32 %r1122, %f1255;
mov.b32 %r1123, %f1256;
mov.b32 %r1124, %f1257;
mov.b32 %r1125, %f1258;
mov.b32 %r1126, %f1259;
mov.b32 %r1127, %f1260;
mov.b32 %r1128, %f1261;
mov.b32 %r1129, %f1262;
mov.b32 %r1130, %f1263;
mov.b32 %r1131, %f1264;
mov.b32 %r1132, %f1265;
mov.b32 %r1133, %f1266;
mov.b32 %r1134, %f1267;
mov.b32 %r1135, %f1268;
mov.b32 %r1136, %f1269;
mov.b32 %r1137, %f1270;
mov.b32 %r1138, %f1271;
mov.b32 %r1139, %f1272;
mov.b32 %r1140, %f1273;
mov.b32 %r1141, %f1274;
mov.b32 %r1142, %f1275;
mov.b32 %r1143, %f1276;
mov.b32 %r1144, %f1277;
mov.b32 %r1145, %f1278;
mov.b32 %r1146, %f1279;
mov.b32 %r1147, %f1280;
mov.b32 %r1148, %f1281;
mov.b32 %r1149, %f1282;
mov.b32 %r1150, %f1283;
mov.b32 %r1151, %f1284;
mov.b32 %r1152, %f1285;
mov.b32 %r1153, %f1286;
mov.b32 %r1154, %f1287;
mov.b32 %r1155, %f1288;
mov.b32 %r1156, %f1289;
mov.b32 %r1157, %f1290;
mov.b32 %r1158, %f1291;
mov.b32 %r1159, %f1292;
mov.b32 %r1160, %f1293;
mov.b32 %r1161, %f1294;
mov.b32 %r1162, %f1295;
mov.b32 %r1163, %f1296;
mov.b32 %r1164, %f1297;
mov.b32 %r1165, %f1298;
mov.b32 %r1166, %f1299;
mov.b32 %r1167, %f1300;
mov.b32 %r1168, %f1301;
mov.b32 %r1169, %f1302;
mov.b32 %r1170, %f1303;
mov.b32 %r1171, %f1304;
mov.b32 %r1172, %f1305;
mov.b32 %r1173, %f1306;
mov.b32 %r1174, %f1307;
mov.b32 %r1175, %f1308;
mov.b32 %r1176, %f1309;
mov.b32 %r1177, %f1310;
mov.b32 %r1178, %f1311;
mov.b32 %r1179, %f1312;
mov.b32 %r1180, %f1313;
mov.b32 %r1181, %f1314;
mov.b32 %r1182, %f1315;
mov.b32 %r1183, %f1316;
$L__BB0_4: // %._crit_edge
.loc 1 104 18 // layer_norm.py:104:18
setp.lt.s32 %p88, %r1052, %r285;
setp.lt.s32 %p80, %r1053, %r285;
setp.lt.s32 %p73, %r1054, %r285;
.loc 1 138 37 // layer_norm.py:138:37
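	// compute the 16 column store addresses from base pointer %rd45 plus the row offset %r282 * %r1 (scaled by 4 bytes) and the per-thread column offsets (layer_norm.py:138)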
mul.lo.s32 %r1050, %r282, %r1;
.loc 1 138 22 // layer_norm.py:138:22
mul.wide.s32 %rd154, %r1050, 4;
add.s64 %rd155, %rd45, %rd154;
.loc 1 138 49 // layer_norm.py:138:49
shl.b64 %rd156, %rd1, 2;
add.s64 %rd122, %rd155, %rd156;
add.s64 %rd123, %rd122, 2048;
add.s64 %rd124, %rd122, 4096;
add.s64 %rd125, %rd122, 6144;
add.s64 %rd126, %rd122, 8192;
add.s64 %rd127, %rd122, 10240;
add.s64 %rd128, %rd122, 12288;
add.s64 %rd129, %rd155, %rd162;
add.s64 %rd130, %rd122, 16384;
add.s64 %rd131, %rd122, 18432;
add.s64 %rd132, %rd122, 20480;
add.s64 %rd133, %rd122, 22528;
add.s64 %rd134, %rd122, 24576;
add.s64 %rd135, %rd122, 26624;
add.s64 %rd136, %rd122, 28672;
add.s64 %rd137, %rd155, %rd161;
.loc 1 138 55 // layer_norm.py:138:55
// begin inline asm
@%p73 st.global.v4.b32 [ %rd122 + 0 ], { %r1056, %r1057, %r1058, %r1059 };
// end inline asm
// begin inline asm
@%p74 st.global.v4.b32 [ %rd123 + 0 ], { %r1060, %r1061, %r1062, %r1063 };
// end inline asm
// begin inline asm
@%p75 st.global.v4.b32 [ %rd124 + 0 ], { %r1064, %r1065, %r1066, %r1067 };
// end inline asm
// begin inline asm
@%p76 st.global.v4.b32 [ %rd125 + 0 ], { %r1068, %r1069, %r1070, %r1071 };
// end inline asm
// begin inline asm
@%p77 st.global.v4.b32 [ %rd126 + 0 ], { %r1072, %r1073, %r1074, %r1075 };
// end inline asm
// begin inline asm
@%p78 st.global.v4.b32 [ %rd127 + 0 ], { %r1076, %r1077, %r1078, %r1079 };
// end inline asm
// begin inline asm
@%p79 st.global.v4.b32 [ %rd128 + 0 ], { %r1080, %r1081, %r1082, %r1083 };
// end inline asm
// begin inline asm
@%p80 st.global.v4.b32 [ %rd129 + 0 ], { %r1084, %r1085, %r1086, %r1087 };
// end inline asm
// begin inline asm
@%p81 st.global.v4.b32 [ %rd130 + 0 ], { %r1088, %r1089, %r1090, %r1091 };
// end inline asm
// begin inline asm
@%p82 st.global.v4.b32 [ %rd131 + 0 ], { %r1092, %r1093, %r1094, %r1095 };
// end inline asm
// begin inline asm
@%p83 st.global.v4.b32 [ %rd132 + 0 ], { %r1096, %r1097, %r1098, %r1099 };
// end inline asm
// begin inline asm
@%p84 st.global.v4.b32 [ %rd133 + 0 ], { %r1100, %r1101, %r1102, %r1103 };
// end inline asm
// begin inline asm
@%p85 st.global.v4.b32 [ %rd134 + 0 ], { %r1104, %r1105, %r1106, %r1107 };
// end inline asm
// begin inline asm
@%p86 st.global.v4.b32 [ %rd135 + 0 ], { %r1108, %r1109, %r1110, %r1111 };
// end inline asm
// begin inline asm
@%p87 st.global.v4.b32 [ %rd136 + 0 ], { %r1112, %r1113, %r1114, %r1115 };
// end inline asm
// begin inline asm
@%p88 st.global.v4.b32 [ %rd137 + 0 ], { %r1116, %r1117, %r1118, %r1119 };
// end inline asm
.loc 1 139 37 // layer_norm.py:139:37
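	// same addressing pattern for the second output buffer: base pointer %rd46 plus row offset %r283 * %r1, followed by the matching predicated stores (layer_norm.py:139)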
mul.lo.s32 %r1051, %r283, %r1;
.loc 1 139 22 // layer_norm.py:139:22
mul.wide.s32 %rd159, %r1051, 4;
add.s64 %rd160, %rd46, %rd159;
.loc 1 139 49 // layer_norm.py:139:49
add.s64 %rd138, %rd160, %rd156;
add.s64 %rd139, %rd138, 2048;
add.s64 %rd140, %rd138, 4096;
add.s64 %rd141, %rd138, 6144;
add.s64 %rd142, %rd138, 8192;
add.s64 %rd143, %rd138, 10240;
add.s64 %rd144, %rd138, 12288;
add.s64 %rd145, %rd160, %rd162;
add.s64 %rd146, %rd138, 16384;
add.s64 %rd147, %rd138, 18432;
add.s64 %rd148, %rd138, 20480;
add.s64 %rd149, %rd138, 22528;
add.s64 %rd150, %rd138, 24576;
add.s64 %rd151, %rd138, 26624;
add.s64 %rd152, %rd138, 28672;
add.s64 %rd153, %rd160, %rd161;
.loc 1 139 55 // layer_norm.py:139:55
// begin inline asm
@%p73 st.global.v4.b32 [ %rd138 + 0 ], { %r1120, %r1121, %r1122, %r1123 };
// end inline asm
// begin inline asm
@%p74 st.global.v4.b32 [ %rd139 + 0 ], { %r1124, %r1125, %r1126, %r1127 };
// end inline asm
// begin inline asm
@%p75 st.global.v4.b32 [ %rd140 + 0 ], { %r1128, %r1129, %r1130, %r1131 };
// end inline asm
// begin inline asm
@%p76 st.global.v4.b32 [ %rd141 + 0 ], { %r1132, %r1133, %r1134, %r1135 };
// end inline asm
// begin inline asm
@%p77 st.global.v4.b32 [ %rd142 + 0 ], { %r1136, %r1137, %r1138, %r1139 };
// end inline asm
// begin inline asm
@%p78 st.global.v4.b32 [ %rd143 + 0 ], { %r1140, %r1141, %r1142, %r1143 };
// end inline asm
// begin inline asm
@%p79 st.global.v4.b32 [ %rd144 + 0 ], { %r1144, %r1145, %r1146, %r1147 };
// end inline asm
// begin inline asm
@%p80 st.global.v4.b32 [ %rd145 + 0 ], { %r1148, %r1149, %r1150, %r1151 };
// end inline asm
// begin inline asm
@%p81 st.global.v4.b32 [ %rd146 + 0 ], { %r1152, %r1153, %r1154, %r1155 };
// end inline asm
// begin inline asm
@%p82 st.global.v4.b32 [ %rd147 + 0 ], { %r1156, %r1157, %r1158, %r1159 };
// end inline asm
// begin inline asm
@%p83 st.global.v4.b32 [ %rd148 + 0 ], { %r1160, %r1161, %r1162, %r1163 };
// end inline asm
// begin inline asm
@%p84 st.global.v4.b32 [ %rd149 + 0 ], { %r1164, %r1165, %r1166, %r1167 };
// end inline asm
// begin inline asm
@%p85 st.global.v4.b32 [ %rd150 + 0 ], { %r1168, %r1169, %r1170, %r1171 };
// end inline asm
// begin inline asm
@%p86 st.global.v4.b32 [ %rd151 + 0 ], { %r1172, %r1173, %r1174, %r1175 };
// end inline asm
// begin inline asm
@%p87 st.global.v4.b32 [ %rd152 + 0 ], { %r1176, %r1177, %r1178, %r1179 };
// end inline asm
// begin inline asm
@%p88 st.global.v4.b32 [ %rd153 + 0 ], { %r1180, %r1181, %r1182, %r1183 };
// end inline asm
.loc 1 139 4 // layer_norm.py:139:4
ret;
$L__tmp7:
$L__func_end0:
// -- End function
}
.file 1 "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops/layer_norm.py"
.file 2 "/home/dberard/local/pytorch-env7/triton/python/triton/language/standard.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 203 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 108 // DW_AT_name
.b8 97
.b8 121
.b8 101
.b8 114
.b8 95
.b8 110
.b8 111
.b8 114
.b8 109
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 104
.b8 111
.b8 109
.b8 101
.b8 47
.b8 100
.b8 98
.b8 101
.b8 114
.b8 97
.b8 114
.b8 100
.b8 47
.b8 108
.b8 111
.b8 99
.b8 97
.b8 108
.b8 47
.b8 112
.b8 121
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 45
.b8 101
.b8 110
.b8 118
.b8 55
.b8 47
.b8 76
.b8 105
.b8 103
.b8 101
.b8 114
.b8 45
.b8 75
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 47
.b8 115
.b8 114
.b8 99
.b8 47
.b8 108
.b8 105
.b8 103
.b8 101
.b8 114
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 47
.b8 111
.b8 112
.b8 115
.b8 0
.b8 2 // Abbrev [2] 0x6a:0x1e DW_TAG_subprogram
.b8 95 // DW_AT_name
.b8 108
.b8 97
.b8 121
.b8 101
.b8 114
.b8 95
.b8 110
.b8 111
.b8 114
.b8 109
.b8 95
.b8 98
.b8 97
.b8 99
.b8 107
.b8 119
.b8 97
.b8 114
.b8 100
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x88:0x46 DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 106 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0x9d:0x18 DW_TAG_inlined_subroutine
.b32 106 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp4 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 124 // DW_AT_call_line
.b8 20 // DW_AT_call_column
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
.b32 106 // DW_AT_abstract_origin
.b64 $L__tmp5 // DW_AT_low_pc
.b64 $L__tmp6 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 125 // DW_AT_call_line
.b8 20 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }