Last active
July 9, 2025 23:42
-
-
Save davidberard98/9a4b820d39b7d2f89479d59def8f10c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// NOTE(review): every line in this listing ends with ` | |`, which looks like
// table-extraction residue from a web page rather than PTX syntax — confirm
// and strip it before feeding this file to an assembler.
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
// PTX ISA version this module is written against.
.version 8.7 | |
// Target architecture this module is compiled for.
.target sm_90a | |
// Addresses are 64 bits wide throughout this module.
.address_size 64 | |
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel | |
// Shared-state-space byte array, 16-byte aligned, declared extern with no size
// given here. The kernel takes its address via `mov.b32 ..., global_smem`.
// NOTE(review): the size is presumably supplied by the launcher — confirm.
.extern .shared .align 16 .b8 global_smem[]; | |
// @_layer_norm_backward_kernel | |
.visible .entry _layer_norm_backward_kernel( | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_0, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_1, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_2, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_3, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_4, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_5, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_6, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_7, | |
.param .u32 _layer_norm_backward_kernel_param_8, | |
.param .u32 _layer_norm_backward_kernel_param_9, | |
.param .u32 _layer_norm_backward_kernel_param_10, | |
.param .u32 _layer_norm_backward_kernel_param_11, | |
.param .u32 _layer_norm_backward_kernel_param_12, | |
.param .u32 _layer_norm_backward_kernel_param_13, | |
.param .u32 _layer_norm_backward_kernel_param_14, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_15 | |
) | |
.reqntid 128 | |
{ | |
.reg .pred %p<118>; | |
.reg .b32 %r<1184>; | |
.reg .b32 %f<1317>; | |
.reg .b64 %rd<168>; | |
.loc 1 73 0 // layer_norm.py:73:0 | |
$L__func_begin0: | |
.loc 1 73 0 // layer_norm.py:73:0 | |
// %bb.0: | |
ld.param.b32 %r285, [_layer_norm_backward_kernel_param_14]; | |
ld.param.b32 %r283, [_layer_norm_backward_kernel_param_11]; | |
ld.param.b32 %r282, [_layer_norm_backward_kernel_param_10]; | |
ld.param.b64 %rd46, [_layer_norm_backward_kernel_param_6]; | |
ld.param.b64 %rd45, [_layer_norm_backward_kernel_param_5]; | |
$L__tmp0: | |
.loc 1 100 33 // layer_norm.py:100:33 | |
mov.u32 %r1, %ctaid.x; | |
.loc 1 101 31 // layer_norm.py:101:31 | |
shl.b32 %r2, %r1, 5; | |
.loc 1 102 39 // layer_norm.py:102:39 | |
add.s32 %r414, %r2, 32; | |
ld.param.b32 %r415, [_layer_norm_backward_kernel_param_13]; | |
.loc 1 102 57 // layer_norm.py:102:57 | |
min.s32 %r3, %r414, %r415; | |
.loc 1 103 24 // layer_norm.py:103:24 | |
mov.u32 %r4, %tid.x; | |
shl.b32 %r416, %r4, 2; | |
and.b32 %r417, %r416, 508; | |
or.b32 %r5, %r417, 512; | |
or.b32 %r6, %r417, 1024; | |
or.b32 %r7, %r417, 1536; | |
or.b32 %r8, %r417, 2048; | |
or.b32 %r9, %r417, 2560; | |
or.b32 %r10, %r417, 3072; | |
or.b32 %r418, %r416, 3584; | |
or.b32 %r11, %r417, 4096; | |
or.b32 %r12, %r417, 4608; | |
or.b32 %r13, %r417, 5120; | |
or.b32 %r14, %r417, 5632; | |
or.b32 %r15, %r417, 6144; | |
or.b32 %r16, %r417, 6656; | |
or.b32 %r17, %r417, 7168; | |
or.b32 %r419, %r416, 7680; | |
.loc 1 117 28 // layer_norm.py:117:28 | |
cvt.u64.u32 %rd1, %r417; | |
cvt.u64.u32 %rd9, %r418; | |
cvt.u64.u32 %rd18, %r419; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
setp.le.s32 %p1, %r3, %r2; | |
mov.b32 %r1056, 0; | |
shl.b64 %rd161, %rd18, 2; | |
shl.b64 %rd162, %rd9, 2; | |
setp.lt.s32 %p87, %r17, %r285; | |
cvt.u32.u64 %r1052, %rd18; | |
cvt.u32.u64 %r1053, %rd9; | |
cvt.u32.u64 %r1054, %rd1; | |
setp.lt.s32 %p86, %r16, %r285; | |
setp.lt.s32 %p85, %r15, %r285; | |
setp.lt.s32 %p84, %r14, %r285; | |
setp.lt.s32 %p83, %r13, %r285; | |
setp.lt.s32 %p82, %r12, %r285; | |
setp.lt.s32 %p81, %r11, %r285; | |
setp.lt.s32 %p79, %r10, %r285; | |
setp.lt.s32 %p78, %r9, %r285; | |
setp.lt.s32 %p77, %r8, %r285; | |
setp.lt.s32 %p76, %r7, %r285; | |
setp.lt.s32 %p75, %r6, %r285; | |
setp.lt.s32 %p74, %r5, %r285; | |
mov.b32 %r1057, %r1056; | |
mov.b32 %r1058, %r1056; | |
mov.b32 %r1059, %r1056; | |
mov.b32 %r1060, %r1056; | |
mov.b32 %r1061, %r1056; | |
mov.b32 %r1062, %r1056; | |
mov.b32 %r1063, %r1056; | |
mov.b32 %r1064, %r1056; | |
mov.b32 %r1065, %r1056; | |
mov.b32 %r1066, %r1056; | |
mov.b32 %r1067, %r1056; | |
mov.b32 %r1068, %r1056; | |
mov.b32 %r1069, %r1056; | |
mov.b32 %r1070, %r1056; | |
mov.b32 %r1071, %r1056; | |
mov.b32 %r1072, %r1056; | |
mov.b32 %r1073, %r1056; | |
mov.b32 %r1074, %r1056; | |
mov.b32 %r1075, %r1056; | |
mov.b32 %r1076, %r1056; | |
mov.b32 %r1077, %r1056; | |
mov.b32 %r1078, %r1056; | |
mov.b32 %r1079, %r1056; | |
mov.b32 %r1080, %r1056; | |
mov.b32 %r1081, %r1056; | |
mov.b32 %r1082, %r1056; | |
mov.b32 %r1083, %r1056; | |
mov.b32 %r1084, %r1056; | |
mov.b32 %r1085, %r1056; | |
mov.b32 %r1086, %r1056; | |
mov.b32 %r1087, %r1056; | |
mov.b32 %r1088, %r1056; | |
mov.b32 %r1089, %r1056; | |
mov.b32 %r1090, %r1056; | |
mov.b32 %r1091, %r1056; | |
mov.b32 %r1092, %r1056; | |
mov.b32 %r1093, %r1056; | |
mov.b32 %r1094, %r1056; | |
mov.b32 %r1095, %r1056; | |
mov.b32 %r1096, %r1056; | |
mov.b32 %r1097, %r1056; | |
mov.b32 %r1098, %r1056; | |
mov.b32 %r1099, %r1056; | |
mov.b32 %r1100, %r1056; | |
mov.b32 %r1101, %r1056; | |
mov.b32 %r1102, %r1056; | |
mov.b32 %r1103, %r1056; | |
mov.b32 %r1104, %r1056; | |
mov.b32 %r1105, %r1056; | |
mov.b32 %r1106, %r1056; | |
mov.b32 %r1107, %r1056; | |
mov.b32 %r1108, %r1056; | |
mov.b32 %r1109, %r1056; | |
mov.b32 %r1110, %r1056; | |
mov.b32 %r1111, %r1056; | |
mov.b32 %r1112, %r1056; | |
mov.b32 %r1113, %r1056; | |
mov.b32 %r1114, %r1056; | |
mov.b32 %r1115, %r1056; | |
mov.b32 %r1116, %r1056; | |
mov.b32 %r1117, %r1056; | |
mov.b32 %r1118, %r1056; | |
mov.b32 %r1119, %r1056; | |
mov.b32 %r1120, %r1056; | |
mov.b32 %r1121, %r1056; | |
mov.b32 %r1122, %r1056; | |
mov.b32 %r1123, %r1056; | |
mov.b32 %r1124, %r1056; | |
mov.b32 %r1125, %r1056; | |
mov.b32 %r1126, %r1056; | |
mov.b32 %r1127, %r1056; | |
mov.b32 %r1128, %r1056; | |
mov.b32 %r1129, %r1056; | |
mov.b32 %r1130, %r1056; | |
mov.b32 %r1131, %r1056; | |
mov.b32 %r1132, %r1056; | |
mov.b32 %r1133, %r1056; | |
mov.b32 %r1134, %r1056; | |
mov.b32 %r1135, %r1056; | |
mov.b32 %r1136, %r1056; | |
mov.b32 %r1137, %r1056; | |
mov.b32 %r1138, %r1056; | |
mov.b32 %r1139, %r1056; | |
mov.b32 %r1140, %r1056; | |
mov.b32 %r1141, %r1056; | |
mov.b32 %r1142, %r1056; | |
mov.b32 %r1143, %r1056; | |
mov.b32 %r1144, %r1056; | |
mov.b32 %r1145, %r1056; | |
mov.b32 %r1146, %r1056; | |
mov.b32 %r1147, %r1056; | |
mov.b32 %r1148, %r1056; | |
mov.b32 %r1149, %r1056; | |
mov.b32 %r1150, %r1056; | |
mov.b32 %r1151, %r1056; | |
mov.b32 %r1152, %r1056; | |
mov.b32 %r1153, %r1056; | |
mov.b32 %r1154, %r1056; | |
mov.b32 %r1155, %r1056; | |
mov.b32 %r1156, %r1056; | |
mov.b32 %r1157, %r1056; | |
mov.b32 %r1158, %r1056; | |
mov.b32 %r1159, %r1056; | |
mov.b32 %r1160, %r1056; | |
mov.b32 %r1161, %r1056; | |
mov.b32 %r1162, %r1056; | |
mov.b32 %r1163, %r1056; | |
mov.b32 %r1164, %r1056; | |
mov.b32 %r1165, %r1056; | |
mov.b32 %r1166, %r1056; | |
mov.b32 %r1167, %r1056; | |
mov.b32 %r1168, %r1056; | |
mov.b32 %r1169, %r1056; | |
mov.b32 %r1170, %r1056; | |
mov.b32 %r1171, %r1056; | |
mov.b32 %r1172, %r1056; | |
mov.b32 %r1173, %r1056; | |
mov.b32 %r1174, %r1056; | |
mov.b32 %r1175, %r1056; | |
mov.b32 %r1176, %r1056; | |
mov.b32 %r1177, %r1056; | |
mov.b32 %r1178, %r1056; | |
mov.b32 %r1179, %r1056; | |
mov.b32 %r1180, %r1056; | |
mov.b32 %r1181, %r1056; | |
mov.b32 %r1182, %r1056; | |
mov.b32 %r1183, %r1056; | |
@%p1 bra $L__BB0_4; | |
// %bb.1: // %.lr.ph | |
.loc 1 0 30 // layer_norm.py:0:30 | |
ld.param.b32 %r284, [_layer_norm_backward_kernel_param_12]; | |
ld.param.b32 %r281, [_layer_norm_backward_kernel_param_9]; | |
ld.param.b32 %r280, [_layer_norm_backward_kernel_param_8]; | |
ld.param.b64 %rd47, [_layer_norm_backward_kernel_param_7]; | |
ld.param.b64 %rd44, [_layer_norm_backward_kernel_param_4]; | |
ld.param.b64 %rd43, [_layer_norm_backward_kernel_param_3]; | |
ld.param.b64 %rd42, [_layer_norm_backward_kernel_param_2]; | |
ld.param.b64 %rd41, [_layer_norm_backward_kernel_param_0]; | |
ld.param.b64 %rd48, [_layer_norm_backward_kernel_param_1]; | |
mul.wide.u32 %rd49, %r417, 4; | |
add.s64 %rd72, %rd48, %rd49; | |
add.s64 %rd73, %rd72, 2048; | |
add.s64 %rd74, %rd72, 4096; | |
add.s64 %rd75, %rd72, 6144; | |
add.s64 %rd76, %rd72, 8192; | |
add.s64 %rd77, %rd72, 10240; | |
add.s64 %rd78, %rd72, 12288; | |
mul.wide.u32 %rd50, %r418, 4; | |
add.s64 %rd79, %rd48, %rd50; | |
add.s64 %rd80, %rd72, 16384; | |
add.s64 %rd81, %rd72, 18432; | |
add.s64 %rd82, %rd72, 20480; | |
add.s64 %rd83, %rd72, 22528; | |
add.s64 %rd84, %rd72, 24576; | |
add.s64 %rd85, %rd72, 26624; | |
add.s64 %rd86, %rd72, 28672; | |
mul.wide.u32 %rd51, %r419, 4; | |
add.s64 %rd87, %rd48, %rd51; | |
cvt.rn.f32.s32 %f1, %r285; | |
.loc 1 113 26 // layer_norm.py:113:26 | |
mul.lo.s32 %r420, %r284, %r2; | |
.loc 1 113 14 // layer_norm.py:113:14 | |
mul.wide.s32 %rd52, %r420, 4; | |
add.s64 %rd165, %rd47, %rd52; | |
.loc 1 112 26 // layer_norm.py:112:26 | |
mul.lo.s32 %r421, %r281, %r2; | |
.loc 1 112 14 // layer_norm.py:112:14 | |
mul.wide.s32 %rd53, %r421, 4; | |
add.s64 %rd164, %rd44, %rd53; | |
.loc 1 111 16 // layer_norm.py:111:16 | |
mul.wide.s32 %rd54, %r2, 4; | |
add.s64 %rd166, %rd43, %rd54; | |
.loc 1 110 16 // layer_norm.py:110:16 | |
add.s64 %rd167, %rd42, %rd54; | |
.loc 1 109 25 // layer_norm.py:109:25 | |
mul.lo.s32 %r422, %r280, %r2; | |
.loc 1 109 13 // layer_norm.py:109:13 | |
mul.wide.s32 %rd55, %r422, 4; | |
add.s64 %rd163, %rd41, %rd55; | |
.loc 1 103 24 // layer_norm.py:103:24 | |
and.b32 %r18, %r4, 31; | |
shr.u32 %r423, %r4, 3; | |
and.b32 %r424, %r423, 12; | |
mov.b32 %r425, global_smem; | |
add.s32 %r814, %r425, %r424; | |
add.s32 %r823, %r425, %r416; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
mul.wide.s32 %rd26, %r284, 4; | |
and.b32 %r427, %r4, 127; | |
mul.wide.u32 %rd28, %r427, 16; | |
mul.wide.s32 %rd29, %r281, 4; | |
mul.wide.s32 %rd30, %r280, 4; | |
sub.s32 %r1055, %r3, %r2; | |
mov.b32 %f1189, 0f00000000; | |
setp.eq.s32 %p52, %r4, 0; | |
setp.lt.u32 %p51, %r4, 4; | |
setp.eq.s32 %p50, %r18, 0; | |
mov.b32 %f1190, %f1189; | |
mov.b32 %f1191, %f1189; | |
mov.b32 %f1192, %f1189; | |
mov.b32 %f1193, %f1189; | |
mov.b32 %f1194, %f1189; | |
mov.b32 %f1195, %f1189; | |
mov.b32 %f1196, %f1189; | |
mov.b32 %f1197, %f1189; | |
mov.b32 %f1198, %f1189; | |
mov.b32 %f1199, %f1189; | |
mov.b32 %f1200, %f1189; | |
mov.b32 %f1201, %f1189; | |
mov.b32 %f1202, %f1189; | |
mov.b32 %f1203, %f1189; | |
mov.b32 %f1204, %f1189; | |
mov.b32 %f1205, %f1189; | |
mov.b32 %f1206, %f1189; | |
mov.b32 %f1207, %f1189; | |
mov.b32 %f1208, %f1189; | |
mov.b32 %f1209, %f1189; | |
mov.b32 %f1210, %f1189; | |
mov.b32 %f1211, %f1189; | |
mov.b32 %f1212, %f1189; | |
mov.b32 %f1213, %f1189; | |
mov.b32 %f1214, %f1189; | |
mov.b32 %f1215, %f1189; | |
mov.b32 %f1216, %f1189; | |
mov.b32 %f1217, %f1189; | |
mov.b32 %f1218, %f1189; | |
mov.b32 %f1219, %f1189; | |
mov.b32 %f1220, %f1189; | |
mov.b32 %f1221, %f1189; | |
mov.b32 %f1222, %f1189; | |
mov.b32 %f1223, %f1189; | |
mov.b32 %f1224, %f1189; | |
mov.b32 %f1225, %f1189; | |
mov.b32 %f1226, %f1189; | |
mov.b32 %f1227, %f1189; | |
mov.b32 %f1228, %f1189; | |
mov.b32 %f1229, %f1189; | |
mov.b32 %f1230, %f1189; | |
mov.b32 %f1231, %f1189; | |
mov.b32 %f1232, %f1189; | |
mov.b32 %f1233, %f1189; | |
mov.b32 %f1234, %f1189; | |
mov.b32 %f1235, %f1189; | |
mov.b32 %f1236, %f1189; | |
mov.b32 %f1237, %f1189; | |
mov.b32 %f1238, %f1189; | |
mov.b32 %f1239, %f1189; | |
mov.b32 %f1240, %f1189; | |
mov.b32 %f1241, %f1189; | |
mov.b32 %f1242, %f1189; | |
mov.b32 %f1243, %f1189; | |
mov.b32 %f1244, %f1189; | |
mov.b32 %f1245, %f1189; | |
mov.b32 %f1246, %f1189; | |
mov.b32 %f1247, %f1189; | |
mov.b32 %f1248, %f1189; | |
mov.b32 %f1249, %f1189; | |
mov.b32 %f1250, %f1189; | |
mov.b32 %f1251, %f1189; | |
mov.b32 %f1252, %f1189; | |
mov.b32 %f1253, %f1189; | |
mov.b32 %f1254, %f1189; | |
mov.b32 %f1255, %f1189; | |
mov.b32 %f1256, %f1189; | |
mov.b32 %f1257, %f1189; | |
mov.b32 %f1258, %f1189; | |
mov.b32 %f1259, %f1189; | |
mov.b32 %f1260, %f1189; | |
mov.b32 %f1261, %f1189; | |
mov.b32 %f1262, %f1189; | |
mov.b32 %f1263, %f1189; | |
mov.b32 %f1264, %f1189; | |
mov.b32 %f1265, %f1189; | |
mov.b32 %f1266, %f1189; | |
mov.b32 %f1267, %f1189; | |
mov.b32 %f1268, %f1189; | |
mov.b32 %f1269, %f1189; | |
mov.b32 %f1270, %f1189; | |
mov.b32 %f1271, %f1189; | |
mov.b32 %f1272, %f1189; | |
mov.b32 %f1273, %f1189; | |
mov.b32 %f1274, %f1189; | |
mov.b32 %f1275, %f1189; | |
mov.b32 %f1276, %f1189; | |
mov.b32 %f1277, %f1189; | |
mov.b32 %f1278, %f1189; | |
mov.b32 %f1279, %f1189; | |
mov.b32 %f1280, %f1189; | |
mov.b32 %f1281, %f1189; | |
mov.b32 %f1282, %f1189; | |
mov.b32 %f1283, %f1189; | |
mov.b32 %f1284, %f1189; | |
mov.b32 %f1285, %f1189; | |
mov.b32 %f1286, %f1189; | |
mov.b32 %f1287, %f1189; | |
mov.b32 %f1288, %f1189; | |
mov.b32 %f1289, %f1189; | |
mov.b32 %f1290, %f1189; | |
mov.b32 %f1291, %f1189; | |
mov.b32 %f1292, %f1189; | |
mov.b32 %f1293, %f1189; | |
mov.b32 %f1294, %f1189; | |
mov.b32 %f1295, %f1189; | |
mov.b32 %f1296, %f1189; | |
mov.b32 %f1297, %f1189; | |
mov.b32 %f1298, %f1189; | |
mov.b32 %f1299, %f1189; | |
mov.b32 %f1300, %f1189; | |
mov.b32 %f1301, %f1189; | |
mov.b32 %f1302, %f1189; | |
mov.b32 %f1303, %f1189; | |
mov.b32 %f1304, %f1189; | |
mov.b32 %f1305, %f1189; | |
mov.b32 %f1306, %f1189; | |
mov.b32 %f1307, %f1189; | |
mov.b32 %f1308, %f1189; | |
mov.b32 %f1309, %f1189; | |
mov.b32 %f1310, %f1189; | |
mov.b32 %f1311, %f1189; | |
mov.b32 %f1312, %f1189; | |
mov.b32 %f1313, %f1189; | |
mov.b32 %f1314, %f1189; | |
mov.b32 %f1315, %f1189; | |
mov.b32 %f1316, %f1189; | |
$L__BB0_2: // =>This Inner Loop Header: Depth=1 | |
.loc 1 104 18 // layer_norm.py:104:18 | |
setp.lt.s32 %p17, %r1052, %r285; | |
setp.lt.s32 %p9, %r1053, %r285; | |
setp.lt.s32 %p2, %r1054, %r285; | |
.loc 1 116 28 // layer_norm.py:116:28 | |
add.s64 %rd56, %rd163, %rd28; | |
add.s64 %rd57, %rd56, 2048; | |
add.s64 %rd58, %rd56, 4096; | |
add.s64 %rd59, %rd56, 6144; | |
add.s64 %rd60, %rd56, 8192; | |
add.s64 %rd61, %rd56, 10240; | |
add.s64 %rd62, %rd56, 12288; | |
add.s64 %rd63, %rd163, %rd162; | |
add.s64 %rd64, %rd56, 16384; | |
add.s64 %rd65, %rd56, 18432; | |
add.s64 %rd66, %rd56, 20480; | |
add.s64 %rd67, %rd56, 22528; | |
add.s64 %rd68, %rd56, 24576; | |
add.s64 %rd69, %rd56, 26624; | |
add.s64 %rd70, %rd56, 28672; | |
.loc 1 116 20 // layer_norm.py:116:20 | |
add.s64 %rd71, %rd163, %rd161; | |
mov.b32 %r432, 0; | |
// begin inline asm | |
mov.u32 %r428, %r432; | |
mov.u32 %r429, %r432; | |
mov.u32 %r430, %r432; | |
mov.u32 %r431, %r432; | |
@%p2 ld.global.v4.b32 { %r428, %r429, %r430, %r431 }, [ %rd56 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r436, %r432; | |
mov.u32 %r437, %r432; | |
mov.u32 %r438, %r432; | |
mov.u32 %r439, %r432; | |
@%p74 ld.global.v4.b32 { %r436, %r437, %r438, %r439 }, [ %rd57 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r444, %r432; | |
mov.u32 %r445, %r432; | |
mov.u32 %r446, %r432; | |
mov.u32 %r447, %r432; | |
@%p75 ld.global.v4.b32 { %r444, %r445, %r446, %r447 }, [ %rd58 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r452, %r432; | |
mov.u32 %r453, %r432; | |
mov.u32 %r454, %r432; | |
mov.u32 %r455, %r432; | |
@%p76 ld.global.v4.b32 { %r452, %r453, %r454, %r455 }, [ %rd59 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r460, %r432; | |
mov.u32 %r461, %r432; | |
mov.u32 %r462, %r432; | |
mov.u32 %r463, %r432; | |
@%p77 ld.global.v4.b32 { %r460, %r461, %r462, %r463 }, [ %rd60 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r468, %r432; | |
mov.u32 %r469, %r432; | |
mov.u32 %r470, %r432; | |
mov.u32 %r471, %r432; | |
@%p78 ld.global.v4.b32 { %r468, %r469, %r470, %r471 }, [ %rd61 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r476, %r432; | |
mov.u32 %r477, %r432; | |
mov.u32 %r478, %r432; | |
mov.u32 %r479, %r432; | |
@%p79 ld.global.v4.b32 { %r476, %r477, %r478, %r479 }, [ %rd62 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r484, %r432; | |
mov.u32 %r485, %r432; | |
mov.u32 %r486, %r432; | |
mov.u32 %r487, %r432; | |
@%p9 ld.global.v4.b32 { %r484, %r485, %r486, %r487 }, [ %rd63 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r492, %r432; | |
mov.u32 %r493, %r432; | |
mov.u32 %r494, %r432; | |
mov.u32 %r495, %r432; | |
@%p81 ld.global.v4.b32 { %r492, %r493, %r494, %r495 }, [ %rd64 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r500, %r432; | |
mov.u32 %r501, %r432; | |
mov.u32 %r502, %r432; | |
mov.u32 %r503, %r432; | |
@%p82 ld.global.v4.b32 { %r500, %r501, %r502, %r503 }, [ %rd65 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r508, %r432; | |
mov.u32 %r509, %r432; | |
mov.u32 %r510, %r432; | |
mov.u32 %r511, %r432; | |
@%p83 ld.global.v4.b32 { %r508, %r509, %r510, %r511 }, [ %rd66 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r516, %r432; | |
mov.u32 %r517, %r432; | |
mov.u32 %r518, %r432; | |
mov.u32 %r519, %r432; | |
@%p84 ld.global.v4.b32 { %r516, %r517, %r518, %r519 }, [ %rd67 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r524, %r432; | |
mov.u32 %r525, %r432; | |
mov.u32 %r526, %r432; | |
mov.u32 %r527, %r432; | |
@%p85 ld.global.v4.b32 { %r524, %r525, %r526, %r527 }, [ %rd68 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r532, %r432; | |
mov.u32 %r533, %r432; | |
mov.u32 %r534, %r432; | |
mov.u32 %r535, %r432; | |
@%p86 ld.global.v4.b32 { %r532, %r533, %r534, %r535 }, [ %rd69 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r540, %r432; | |
mov.u32 %r541, %r432; | |
mov.u32 %r542, %r432; | |
mov.u32 %r543, %r432; | |
@%p87 ld.global.v4.b32 { %r540, %r541, %r542, %r543 }, [ %rd70 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r548, %r432; | |
mov.u32 %r549, %r432; | |
mov.u32 %r550, %r432; | |
mov.u32 %r551, %r432; | |
@%p17 ld.global.v4.b32 { %r548, %r549, %r550, %r551 }, [ %rd71 + 0 ]; | |
// end inline asm | |
.loc 1 117 20 // layer_norm.py:117:20 | |
// begin inline asm | |
mov.u32 %r556, %r432; | |
mov.u32 %r557, %r432; | |
mov.u32 %r558, %r432; | |
mov.u32 %r559, %r432; | |
@%p2 ld.global.v4.b32 { %r556, %r557, %r558, %r559 }, [ %rd72 + 0 ]; | |
// end inline asm | |
mov.b32 %f386, %r556; | |
mov.b32 %f387, %r557; | |
mov.b32 %f388, %r558; | |
mov.b32 %f389, %r559; | |
// begin inline asm | |
mov.u32 %r564, %r432; | |
mov.u32 %r565, %r432; | |
mov.u32 %r566, %r432; | |
mov.u32 %r567, %r432; | |
@%p74 ld.global.v4.b32 { %r564, %r565, %r566, %r567 }, [ %rd73 + 0 ]; | |
// end inline asm | |
mov.b32 %f390, %r564; | |
mov.b32 %f391, %r565; | |
mov.b32 %f392, %r566; | |
mov.b32 %f393, %r567; | |
// begin inline asm | |
mov.u32 %r572, %r432; | |
mov.u32 %r573, %r432; | |
mov.u32 %r574, %r432; | |
mov.u32 %r575, %r432; | |
@%p75 ld.global.v4.b32 { %r572, %r573, %r574, %r575 }, [ %rd74 + 0 ]; | |
// end inline asm | |
mov.b32 %f394, %r572; | |
mov.b32 %f395, %r573; | |
mov.b32 %f396, %r574; | |
mov.b32 %f397, %r575; | |
// begin inline asm | |
mov.u32 %r580, %r432; | |
mov.u32 %r581, %r432; | |
mov.u32 %r582, %r432; | |
mov.u32 %r583, %r432; | |
@%p76 ld.global.v4.b32 { %r580, %r581, %r582, %r583 }, [ %rd75 + 0 ]; | |
// end inline asm | |
mov.b32 %f398, %r580; | |
mov.b32 %f399, %r581; | |
mov.b32 %f400, %r582; | |
mov.b32 %f401, %r583; | |
// begin inline asm | |
mov.u32 %r588, %r432; | |
mov.u32 %r589, %r432; | |
mov.u32 %r590, %r432; | |
mov.u32 %r591, %r432; | |
@%p77 ld.global.v4.b32 { %r588, %r589, %r590, %r591 }, [ %rd76 + 0 ]; | |
// end inline asm | |
mov.b32 %f402, %r588; | |
mov.b32 %f403, %r589; | |
mov.b32 %f404, %r590; | |
mov.b32 %f405, %r591; | |
// begin inline asm | |
mov.u32 %r596, %r432; | |
mov.u32 %r597, %r432; | |
mov.u32 %r598, %r432; | |
mov.u32 %r599, %r432; | |
@%p78 ld.global.v4.b32 { %r596, %r597, %r598, %r599 }, [ %rd77 + 0 ]; | |
// end inline asm | |
mov.b32 %f406, %r596; | |
mov.b32 %f407, %r597; | |
mov.b32 %f408, %r598; | |
mov.b32 %f409, %r599; | |
// begin inline asm | |
mov.u32 %r604, %r432; | |
mov.u32 %r605, %r432; | |
mov.u32 %r606, %r432; | |
mov.u32 %r607, %r432; | |
@%p79 ld.global.v4.b32 { %r604, %r605, %r606, %r607 }, [ %rd78 + 0 ]; | |
// end inline asm | |
mov.b32 %f410, %r604; | |
mov.b32 %f411, %r605; | |
mov.b32 %f412, %r606; | |
mov.b32 %f413, %r607; | |
// begin inline asm | |
mov.u32 %r612, %r432; | |
mov.u32 %r613, %r432; | |
mov.u32 %r614, %r432; | |
mov.u32 %r615, %r432; | |
@%p9 ld.global.v4.b32 { %r612, %r613, %r614, %r615 }, [ %rd79 + 0 ]; | |
// end inline asm | |
mov.b32 %f414, %r612; | |
mov.b32 %f415, %r613; | |
mov.b32 %f416, %r614; | |
mov.b32 %f417, %r615; | |
// begin inline asm | |
mov.u32 %r620, %r432; | |
mov.u32 %r621, %r432; | |
mov.u32 %r622, %r432; | |
mov.u32 %r623, %r432; | |
@%p81 ld.global.v4.b32 { %r620, %r621, %r622, %r623 }, [ %rd80 + 0 ]; | |
// end inline asm | |
mov.b32 %f418, %r620; | |
mov.b32 %f419, %r621; | |
mov.b32 %f420, %r622; | |
mov.b32 %f421, %r623; | |
// begin inline asm | |
mov.u32 %r628, %r432; | |
mov.u32 %r629, %r432; | |
mov.u32 %r630, %r432; | |
mov.u32 %r631, %r432; | |
@%p82 ld.global.v4.b32 { %r628, %r629, %r630, %r631 }, [ %rd81 + 0 ]; | |
// end inline asm | |
mov.b32 %f422, %r628; | |
mov.b32 %f423, %r629; | |
mov.b32 %f424, %r630; | |
mov.b32 %f425, %r631; | |
// begin inline asm | |
mov.u32 %r636, %r432; | |
mov.u32 %r637, %r432; | |
mov.u32 %r638, %r432; | |
mov.u32 %r639, %r432; | |
@%p83 ld.global.v4.b32 { %r636, %r637, %r638, %r639 }, [ %rd82 + 0 ]; | |
// end inline asm | |
mov.b32 %f426, %r636; | |
mov.b32 %f427, %r637; | |
mov.b32 %f428, %r638; | |
mov.b32 %f429, %r639; | |
// begin inline asm | |
mov.u32 %r644, %r432; | |
mov.u32 %r645, %r432; | |
mov.u32 %r646, %r432; | |
mov.u32 %r647, %r432; | |
@%p84 ld.global.v4.b32 { %r644, %r645, %r646, %r647 }, [ %rd83 + 0 ]; | |
// end inline asm | |
mov.b32 %f430, %r644; | |
mov.b32 %f431, %r645; | |
mov.b32 %f432, %r646; | |
mov.b32 %f433, %r647; | |
// begin inline asm | |
mov.u32 %r652, %r432; | |
mov.u32 %r653, %r432; | |
mov.u32 %r654, %r432; | |
mov.u32 %r655, %r432; | |
@%p85 ld.global.v4.b32 { %r652, %r653, %r654, %r655 }, [ %rd84 + 0 ]; | |
// end inline asm | |
mov.b32 %f434, %r652; | |
mov.b32 %f435, %r653; | |
mov.b32 %f436, %r654; | |
mov.b32 %f437, %r655; | |
// begin inline asm | |
mov.u32 %r660, %r432; | |
mov.u32 %r661, %r432; | |
mov.u32 %r662, %r432; | |
mov.u32 %r663, %r432; | |
@%p86 ld.global.v4.b32 { %r660, %r661, %r662, %r663 }, [ %rd85 + 0 ]; | |
// end inline asm | |
mov.b32 %f438, %r660; | |
mov.b32 %f439, %r661; | |
mov.b32 %f440, %r662; | |
mov.b32 %f441, %r663; | |
// begin inline asm | |
mov.u32 %r668, %r432; | |
mov.u32 %r669, %r432; | |
mov.u32 %r670, %r432; | |
mov.u32 %r671, %r432; | |
@%p87 ld.global.v4.b32 { %r668, %r669, %r670, %r671 }, [ %rd86 + 0 ]; | |
// end inline asm | |
mov.b32 %f442, %r668; | |
mov.b32 %f443, %r669; | |
mov.b32 %f444, %r670; | |
mov.b32 %f445, %r671; | |
// begin inline asm | |
mov.u32 %r676, %r432; | |
mov.u32 %r677, %r432; | |
mov.u32 %r678, %r432; | |
mov.u32 %r679, %r432; | |
@%p17 ld.global.v4.b32 { %r676, %r677, %r678, %r679 }, [ %rd87 + 0 ]; | |
// end inline asm | |
mov.b32 %f446, %r676; | |
mov.b32 %f447, %r677; | |
mov.b32 %f448, %r678; | |
mov.b32 %f449, %r679; | |
.loc 1 118 30 // layer_norm.py:118:30 | |
add.s64 %rd88, %rd165, %rd28; | |
add.s64 %rd89, %rd88, 2048; | |
add.s64 %rd90, %rd88, 4096; | |
add.s64 %rd91, %rd88, 6144; | |
add.s64 %rd92, %rd88, 8192; | |
add.s64 %rd93, %rd88, 10240; | |
add.s64 %rd94, %rd88, 12288; | |
add.s64 %rd95, %rd165, %rd162; | |
add.s64 %rd96, %rd88, 16384; | |
add.s64 %rd97, %rd88, 18432; | |
add.s64 %rd98, %rd88, 20480; | |
add.s64 %rd99, %rd88, 22528; | |
add.s64 %rd100, %rd88, 24576; | |
add.s64 %rd101, %rd88, 26624; | |
add.s64 %rd102, %rd88, 28672; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
add.s64 %rd103, %rd165, %rd161; | |
// begin inline asm | |
mov.u32 %r684, %r432; | |
mov.u32 %r685, %r432; | |
mov.u32 %r686, %r432; | |
mov.u32 %r687, %r432; | |
@%p2 ld.global.v4.b32 { %r684, %r685, %r686, %r687 }, [ %rd88 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r692, %r432; | |
mov.u32 %r693, %r432; | |
mov.u32 %r694, %r432; | |
mov.u32 %r695, %r432; | |
@%p74 ld.global.v4.b32 { %r692, %r693, %r694, %r695 }, [ %rd89 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r700, %r432; | |
mov.u32 %r701, %r432; | |
mov.u32 %r702, %r432; | |
mov.u32 %r703, %r432; | |
@%p75 ld.global.v4.b32 { %r700, %r701, %r702, %r703 }, [ %rd90 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r708, %r432; | |
mov.u32 %r709, %r432; | |
mov.u32 %r710, %r432; | |
mov.u32 %r711, %r432; | |
@%p76 ld.global.v4.b32 { %r708, %r709, %r710, %r711 }, [ %rd91 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r716, %r432; | |
mov.u32 %r717, %r432; | |
mov.u32 %r718, %r432; | |
mov.u32 %r719, %r432; | |
@%p77 ld.global.v4.b32 { %r716, %r717, %r718, %r719 }, [ %rd92 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r724, %r432; | |
mov.u32 %r725, %r432; | |
mov.u32 %r726, %r432; | |
mov.u32 %r727, %r432; | |
@%p78 ld.global.v4.b32 { %r724, %r725, %r726, %r727 }, [ %rd93 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r732, %r432; | |
mov.u32 %r733, %r432; | |
mov.u32 %r734, %r432; | |
mov.u32 %r735, %r432; | |
@%p79 ld.global.v4.b32 { %r732, %r733, %r734, %r735 }, [ %rd94 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r740, %r432; | |
mov.u32 %r741, %r432; | |
mov.u32 %r742, %r432; | |
mov.u32 %r743, %r432; | |
@%p9 ld.global.v4.b32 { %r740, %r741, %r742, %r743 }, [ %rd95 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r748, %r432; | |
mov.u32 %r749, %r432; | |
mov.u32 %r750, %r432; | |
mov.u32 %r751, %r432; | |
@%p81 ld.global.v4.b32 { %r748, %r749, %r750, %r751 }, [ %rd96 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r756, %r432; | |
mov.u32 %r757, %r432; | |
mov.u32 %r758, %r432; | |
mov.u32 %r759, %r432; | |
@%p82 ld.global.v4.b32 { %r756, %r757, %r758, %r759 }, [ %rd97 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r764, %r432; | |
mov.u32 %r765, %r432; | |
mov.u32 %r766, %r432; | |
mov.u32 %r767, %r432; | |
@%p83 ld.global.v4.b32 { %r764, %r765, %r766, %r767 }, [ %rd98 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r772, %r432; | |
mov.u32 %r773, %r432; | |
mov.u32 %r774, %r432; | |
mov.u32 %r775, %r432; | |
@%p84 ld.global.v4.b32 { %r772, %r773, %r774, %r775 }, [ %rd99 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r780, %r432; | |
mov.u32 %r781, %r432; | |
mov.u32 %r782, %r432; | |
mov.u32 %r783, %r432; | |
@%p85 ld.global.v4.b32 { %r780, %r781, %r782, %r783 }, [ %rd100 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r788, %r432; | |
mov.u32 %r789, %r432; | |
mov.u32 %r790, %r432; | |
mov.u32 %r791, %r432; | |
@%p86 ld.global.v4.b32 { %r788, %r789, %r790, %r791 }, [ %rd101 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r796, %r432; | |
mov.u32 %r797, %r432; | |
mov.u32 %r798, %r432; | |
mov.u32 %r799, %r432; | |
@%p87 ld.global.v4.b32 { %r796, %r797, %r798, %r799 }, [ %rd102 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r804, %r432; | |
mov.u32 %r805, %r432; | |
mov.u32 %r806, %r432; | |
mov.u32 %r807, %r432; | |
@%p17 ld.global.v4.b32 { %r804, %r805, %r806, %r807 }, [ %rd103 + 0 ]; | |
// end inline asm | |
.loc 1 119 23 // layer_norm.py:119:23 | |
// begin inline asm | |
mov.u32 %r812, 0x0; | |
ld.global.b32 { %r812 }, [ %rd167 + 0 ]; | |
// end inline asm | |
.loc 1 120 23 // layer_norm.py:120:23 | |
// begin inline asm | |
mov.u32 %r813, 0x0; | |
ld.global.b32 { %r813 }, [ %rd166 + 0 ]; | |
// end inline asm | |
mov.b32 %f450, %r813; | |
$L__tmp1: | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
bar.sync 0; | |
$L__tmp2: | |
.loc 1 127 26 // layer_norm.py:127:26 | |
add.s64 %rd106, %rd164, %rd28; | |
add.s64 %rd107, %rd106, 2048; | |
add.s64 %rd108, %rd106, 4096; | |
add.s64 %rd109, %rd106, 6144; | |
add.s64 %rd110, %rd106, 8192; | |
add.s64 %rd111, %rd106, 10240; | |
add.s64 %rd112, %rd106, 12288; | |
add.s64 %rd113, %rd164, %rd162; | |
add.s64 %rd114, %rd106, 16384; | |
add.s64 %rd115, %rd106, 18432; | |
add.s64 %rd116, %rd106, 20480; | |
add.s64 %rd117, %rd106, 22528; | |
add.s64 %rd118, %rd106, 24576; | |
add.s64 %rd119, %rd106, 26624; | |
add.s64 %rd120, %rd106, 28672; | |
.loc 1 116 20 // layer_norm.py:116:20 | |
add.s64 %rd121, %rd164, %rd161; | |
mov.b32 %f451, %r551; | |
mov.b32 %f452, %r550; | |
mov.b32 %f453, %r549; | |
mov.b32 %f454, %r548; | |
mov.b32 %f455, %r543; | |
mov.b32 %f456, %r542; | |
mov.b32 %f457, %r541; | |
mov.b32 %f458, %r540; | |
mov.b32 %f459, %r535; | |
mov.b32 %f460, %r534; | |
mov.b32 %f461, %r533; | |
mov.b32 %f462, %r532; | |
mov.b32 %f463, %r527; | |
mov.b32 %f464, %r526; | |
mov.b32 %f465, %r525; | |
mov.b32 %f466, %r524; | |
mov.b32 %f467, %r519; | |
mov.b32 %f468, %r518; | |
mov.b32 %f469, %r517; | |
mov.b32 %f470, %r516; | |
mov.b32 %f471, %r511; | |
mov.b32 %f472, %r510; | |
mov.b32 %f473, %r509; | |
mov.b32 %f474, %r508; | |
mov.b32 %f475, %r503; | |
mov.b32 %f476, %r502; | |
mov.b32 %f477, %r501; | |
mov.b32 %f478, %r500; | |
mov.b32 %f479, %r495; | |
mov.b32 %f480, %r494; | |
mov.b32 %f481, %r493; | |
mov.b32 %f482, %r492; | |
mov.b32 %f483, %r487; | |
mov.b32 %f484, %r486; | |
mov.b32 %f485, %r485; | |
mov.b32 %f486, %r484; | |
mov.b32 %f487, %r479; | |
mov.b32 %f488, %r478; | |
mov.b32 %f489, %r477; | |
mov.b32 %f490, %r476; | |
mov.b32 %f491, %r471; | |
mov.b32 %f492, %r470; | |
mov.b32 %f493, %r469; | |
mov.b32 %f494, %r468; | |
mov.b32 %f495, %r463; | |
mov.b32 %f496, %r462; | |
mov.b32 %f497, %r461; | |
mov.b32 %f498, %r460; | |
mov.b32 %f499, %r455; | |
mov.b32 %f500, %r454; | |
mov.b32 %f501, %r453; | |
mov.b32 %f502, %r452; | |
mov.b32 %f503, %r447; | |
mov.b32 %f504, %r446; | |
mov.b32 %f505, %r445; | |
mov.b32 %f506, %r444; | |
mov.b32 %f507, %r439; | |
mov.b32 %f508, %r438; | |
mov.b32 %f509, %r437; | |
mov.b32 %f510, %r436; | |
mov.b32 %f511, %r431; | |
mov.b32 %f512, %r430; | |
mov.b32 %f513, %r428; | |
mov.b32 %f514, %r429; | |
.loc 1 122 21 // layer_norm.py:122:21 | |
mov.b32 %f515, %r812; | |
sub.f32 %f516, %f514, %f515; | |
sub.f32 %f517, %f513, %f515; | |
sub.f32 %f518, %f512, %f515; | |
sub.f32 %f519, %f511, %f515; | |
sub.f32 %f520, %f510, %f515; | |
sub.f32 %f521, %f509, %f515; | |
sub.f32 %f522, %f508, %f515; | |
sub.f32 %f523, %f507, %f515; | |
sub.f32 %f524, %f506, %f515; | |
sub.f32 %f525, %f505, %f515; | |
sub.f32 %f526, %f504, %f515; | |
sub.f32 %f527, %f503, %f515; | |
sub.f32 %f528, %f502, %f515; | |
sub.f32 %f529, %f501, %f515; | |
sub.f32 %f530, %f500, %f515; | |
sub.f32 %f531, %f499, %f515; | |
sub.f32 %f532, %f498, %f515; | |
sub.f32 %f533, %f497, %f515; | |
sub.f32 %f534, %f496, %f515; | |
sub.f32 %f535, %f495, %f515; | |
sub.f32 %f536, %f494, %f515; | |
sub.f32 %f537, %f493, %f515; | |
sub.f32 %f538, %f492, %f515; | |
sub.f32 %f539, %f491, %f515; | |
sub.f32 %f540, %f490, %f515; | |
sub.f32 %f541, %f489, %f515; | |
sub.f32 %f542, %f488, %f515; | |
sub.f32 %f543, %f487, %f515; | |
sub.f32 %f544, %f486, %f515; | |
sub.f32 %f545, %f485, %f515; | |
sub.f32 %f546, %f484, %f515; | |
sub.f32 %f547, %f483, %f515; | |
sub.f32 %f548, %f482, %f515; | |
sub.f32 %f549, %f481, %f515; | |
sub.f32 %f550, %f480, %f515; | |
sub.f32 %f551, %f479, %f515; | |
sub.f32 %f552, %f478, %f515; | |
sub.f32 %f553, %f477, %f515; | |
sub.f32 %f554, %f476, %f515; | |
sub.f32 %f555, %f475, %f515; | |
sub.f32 %f556, %f474, %f515; | |
sub.f32 %f557, %f473, %f515; | |
sub.f32 %f558, %f472, %f515; | |
sub.f32 %f559, %f471, %f515; | |
sub.f32 %f560, %f470, %f515; | |
sub.f32 %f561, %f469, %f515; | |
sub.f32 %f562, %f468, %f515; | |
sub.f32 %f563, %f467, %f515; | |
sub.f32 %f564, %f466, %f515; | |
sub.f32 %f565, %f465, %f515; | |
sub.f32 %f566, %f464, %f515; | |
sub.f32 %f567, %f463, %f515; | |
sub.f32 %f568, %f462, %f515; | |
sub.f32 %f569, %f461, %f515; | |
sub.f32 %f570, %f460, %f515; | |
sub.f32 %f571, %f459, %f515; | |
sub.f32 %f572, %f458, %f515; | |
sub.f32 %f573, %f457, %f515; | |
sub.f32 %f574, %f456, %f515; | |
sub.f32 %f575, %f455, %f515; | |
sub.f32 %f576, %f454, %f515; | |
sub.f32 %f577, %f453, %f515; | |
sub.f32 %f578, %f452, %f515; | |
sub.f32 %f579, %f451, %f515; | |
.loc 1 122 29 // layer_norm.py:122:29 | |
mul.f32 %f580, %f579, %f450; | |
mul.f32 %f581, %f578, %f450; | |
mul.f32 %f582, %f577, %f450; | |
mul.f32 %f583, %f576, %f450; | |
mul.f32 %f584, %f575, %f450; | |
mul.f32 %f585, %f574, %f450; | |
mul.f32 %f586, %f573, %f450; | |
mul.f32 %f587, %f572, %f450; | |
mul.f32 %f588, %f571, %f450; | |
mul.f32 %f589, %f570, %f450; | |
mul.f32 %f590, %f569, %f450; | |
mul.f32 %f591, %f568, %f450; | |
mul.f32 %f592, %f567, %f450; | |
mul.f32 %f593, %f566, %f450; | |
mul.f32 %f594, %f565, %f450; | |
mul.f32 %f595, %f564, %f450; | |
mul.f32 %f596, %f563, %f450; | |
mul.f32 %f597, %f562, %f450; | |
mul.f32 %f598, %f561, %f450; | |
mul.f32 %f599, %f560, %f450; | |
mul.f32 %f600, %f559, %f450; | |
mul.f32 %f601, %f558, %f450; | |
mul.f32 %f602, %f557, %f450; | |
mul.f32 %f603, %f556, %f450; | |
mul.f32 %f604, %f555, %f450; | |
mul.f32 %f605, %f554, %f450; | |
mul.f32 %f606, %f553, %f450; | |
mul.f32 %f607, %f552, %f450; | |
mul.f32 %f608, %f551, %f450; | |
mul.f32 %f609, %f550, %f450; | |
mul.f32 %f610, %f549, %f450; | |
mul.f32 %f611, %f548, %f450; | |
mul.f32 %f612, %f547, %f450; | |
mul.f32 %f613, %f546, %f450; | |
mul.f32 %f614, %f545, %f450; | |
mul.f32 %f615, %f544, %f450; | |
mul.f32 %f616, %f543, %f450; | |
mul.f32 %f617, %f542, %f450; | |
mul.f32 %f618, %f541, %f450; | |
mul.f32 %f619, %f540, %f450; | |
mul.f32 %f620, %f539, %f450; | |
mul.f32 %f621, %f538, %f450; | |
mul.f32 %f622, %f537, %f450; | |
mul.f32 %f623, %f536, %f450; | |
mul.f32 %f624, %f535, %f450; | |
mul.f32 %f625, %f534, %f450; | |
mul.f32 %f626, %f533, %f450; | |
mul.f32 %f627, %f532, %f450; | |
mul.f32 %f628, %f531, %f450; | |
mul.f32 %f629, %f530, %f450; | |
mul.f32 %f630, %f529, %f450; | |
mul.f32 %f631, %f528, %f450; | |
mul.f32 %f632, %f527, %f450; | |
mul.f32 %f633, %f526, %f450; | |
mul.f32 %f634, %f525, %f450; | |
mul.f32 %f635, %f524, %f450; | |
mul.f32 %f636, %f523, %f450; | |
mul.f32 %f637, %f522, %f450; | |
mul.f32 %f638, %f521, %f450; | |
mul.f32 %f639, %f520, %f450; | |
mul.f32 %f640, %f519, %f450; | |
mul.f32 %f641, %f518, %f450; | |
mul.f32 %f642, %f517, %f450; | |
mul.f32 %f643, %f516, %f450; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f644, %r684; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f645, %f386, %f644; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f646, %r685; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f647, %f387, %f646; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f648, %r686; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f649, %f388, %f648; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f650, %r687; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f651, %f389, %f650; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f652, %r692; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f653, %f390, %f652; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f654, %r693; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f655, %f391, %f654; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f656, %r694; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f657, %f392, %f656; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f658, %r695; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f659, %f393, %f658; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f660, %r700; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f661, %f394, %f660; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f662, %r701; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f663, %f395, %f662; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f664, %r702; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f665, %f396, %f664; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f666, %r703; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f667, %f397, %f666; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f668, %r708; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f669, %f398, %f668; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f670, %r709; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f671, %f399, %f670; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f672, %r710; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f673, %f400, %f672; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f674, %r711; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f675, %f401, %f674; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f676, %r716; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f677, %f402, %f676; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f678, %r717; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f679, %f403, %f678; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f680, %r718; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f681, %f404, %f680; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f682, %r719; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f683, %f405, %f682; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f684, %r724; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f685, %f406, %f684; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f686, %r725; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f687, %f407, %f686; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f688, %r726; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f689, %f408, %f688; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f690, %r727; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f691, %f409, %f690; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f692, %r732; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f693, %f410, %f692; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f694, %r733; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f695, %f411, %f694; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f696, %r734; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f697, %f412, %f696; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f698, %r735; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f699, %f413, %f698; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f700, %r740; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f701, %f414, %f700; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f702, %r741; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f703, %f415, %f702; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f704, %r742; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f705, %f416, %f704; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f706, %r743; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f707, %f417, %f706; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f708, %r748; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f709, %f418, %f708; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f710, %r749; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f711, %f419, %f710; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f712, %r750; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f713, %f420, %f712; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f714, %r751; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f715, %f421, %f714; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f716, %r756; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f717, %f422, %f716; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f718, %r757; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f719, %f423, %f718; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f720, %r758; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f721, %f424, %f720; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f722, %r759; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f723, %f425, %f722; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f724, %r764; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f725, %f426, %f724; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f726, %r765; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f727, %f427, %f726; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f728, %r766; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f729, %f428, %f728; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f730, %r767; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f731, %f429, %f730; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f732, %r772; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f733, %f430, %f732; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f734, %r773; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f735, %f431, %f734; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f736, %r774; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f737, %f432, %f736; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f738, %r775; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f739, %f433, %f738; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f740, %r780; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f741, %f434, %f740; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f742, %r781; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f743, %f435, %f742; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f744, %r782; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f745, %f436, %f744; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f746, %r783; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f747, %f437, %f746; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f748, %r788; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f749, %f438, %f748; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f750, %r789; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f751, %f439, %f750; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f752, %r790; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f753, %f440, %f752; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f754, %r791; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f755, %f441, %f754; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f756, %r796; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f757, %f442, %f756; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f758, %r797; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f759, %f443, %f758; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f760, %r798; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f761, %f444, %f760; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f762, %r799; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f763, %f445, %f762; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f764, %r804; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f765, %f446, %f764; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f766, %r805; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f767, %f447, %f766; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f768, %r806; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f769, %f448, %f768; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
mov.b32 %f770, %r807; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %f771, %f449, %f770; | |
.loc 1 124 28 // layer_norm.py:124:28 | |
mul.f32 %f772, %f647, %f643; | |
$L__tmp3: | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
fma.rn.f32 %f773, %f645, %f642, %f772; | |
fma.rn.f32 %f774, %f649, %f641, %f773; | |
fma.rn.f32 %f775, %f651, %f640, %f774; | |
fma.rn.f32 %f776, %f653, %f639, %f775; | |
fma.rn.f32 %f777, %f655, %f638, %f776; | |
fma.rn.f32 %f778, %f657, %f637, %f777; | |
fma.rn.f32 %f779, %f659, %f636, %f778; | |
fma.rn.f32 %f780, %f661, %f635, %f779; | |
fma.rn.f32 %f781, %f663, %f634, %f780; | |
fma.rn.f32 %f782, %f665, %f633, %f781; | |
fma.rn.f32 %f783, %f667, %f632, %f782; | |
fma.rn.f32 %f784, %f669, %f631, %f783; | |
fma.rn.f32 %f785, %f671, %f630, %f784; | |
fma.rn.f32 %f786, %f673, %f629, %f785; | |
fma.rn.f32 %f787, %f675, %f628, %f786; | |
fma.rn.f32 %f788, %f677, %f627, %f787; | |
fma.rn.f32 %f789, %f679, %f626, %f788; | |
fma.rn.f32 %f790, %f681, %f625, %f789; | |
fma.rn.f32 %f791, %f683, %f624, %f790; | |
fma.rn.f32 %f792, %f685, %f623, %f791; | |
fma.rn.f32 %f793, %f687, %f622, %f792; | |
fma.rn.f32 %f794, %f689, %f621, %f793; | |
fma.rn.f32 %f795, %f691, %f620, %f794; | |
fma.rn.f32 %f796, %f693, %f619, %f795; | |
fma.rn.f32 %f797, %f695, %f618, %f796; | |
fma.rn.f32 %f798, %f697, %f617, %f797; | |
fma.rn.f32 %f799, %f699, %f616, %f798; | |
fma.rn.f32 %f800, %f701, %f615, %f799; | |
fma.rn.f32 %f801, %f703, %f614, %f800; | |
fma.rn.f32 %f802, %f705, %f613, %f801; | |
fma.rn.f32 %f803, %f707, %f612, %f802; | |
fma.rn.f32 %f804, %f709, %f611, %f803; | |
fma.rn.f32 %f805, %f711, %f610, %f804; | |
fma.rn.f32 %f806, %f713, %f609, %f805; | |
fma.rn.f32 %f807, %f715, %f608, %f806; | |
fma.rn.f32 %f808, %f717, %f607, %f807; | |
fma.rn.f32 %f809, %f719, %f606, %f808; | |
fma.rn.f32 %f810, %f721, %f605, %f809; | |
fma.rn.f32 %f811, %f723, %f604, %f810; | |
fma.rn.f32 %f812, %f725, %f603, %f811; | |
fma.rn.f32 %f813, %f727, %f602, %f812; | |
fma.rn.f32 %f814, %f729, %f601, %f813; | |
fma.rn.f32 %f815, %f731, %f600, %f814; | |
fma.rn.f32 %f816, %f733, %f599, %f815; | |
fma.rn.f32 %f817, %f735, %f598, %f816; | |
fma.rn.f32 %f818, %f737, %f597, %f817; | |
fma.rn.f32 %f819, %f739, %f596, %f818; | |
fma.rn.f32 %f820, %f741, %f595, %f819; | |
fma.rn.f32 %f821, %f743, %f594, %f820; | |
fma.rn.f32 %f822, %f745, %f593, %f821; | |
fma.rn.f32 %f823, %f747, %f592, %f822; | |
fma.rn.f32 %f824, %f749, %f591, %f823; | |
fma.rn.f32 %f825, %f751, %f590, %f824; | |
fma.rn.f32 %f826, %f753, %f589, %f825; | |
fma.rn.f32 %f827, %f755, %f588, %f826; | |
fma.rn.f32 %f828, %f757, %f587, %f827; | |
fma.rn.f32 %f829, %f759, %f586, %f828; | |
fma.rn.f32 %f830, %f761, %f585, %f829; | |
fma.rn.f32 %f831, %f763, %f584, %f830; | |
fma.rn.f32 %f832, %f765, %f583, %f831; | |
fma.rn.f32 %f833, %f767, %f582, %f832; | |
fma.rn.f32 %f834, %f769, %f581, %f833; | |
fma.rn.f32 %f835, %f771, %f580, %f834; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r893, %f835; | |
shfl.sync.bfly.b32 %r894, %r893, 16, 31, -1; | |
mov.b32 %f836, %r894; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f837, %f835, %f836; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r895, %f837; | |
shfl.sync.bfly.b32 %r896, %r895, 8, 31, -1; | |
mov.b32 %f838, %r896; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f839, %f837, %f838; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r897, %f839; | |
shfl.sync.bfly.b32 %r898, %r897, 4, 31, -1; | |
mov.b32 %f840, %r898; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f841, %f839, %f840; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r899, %f841; | |
shfl.sync.bfly.b32 %r900, %r899, 2, 31, -1; | |
mov.b32 %f842, %r900; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f843, %f841, %f842; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r901, %f843; | |
shfl.sync.bfly.b32 %r902, %r901, 1, 31, -1; | |
mov.b32 %f844, %r902; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f845, %f843, %f844; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r815, %f845; | |
// begin inline asm | |
@%p50 st.shared.b32 [ %r814 + 0 ], %r815; | |
// end inline asm | |
bar.sync 0; | |
// begin inline asm | |
@%p51 ld.shared.b32 %r816, [ %r823 + 0 ]; | |
// end inline asm | |
mov.b32 %f846, %r816; | |
shfl.sync.bfly.b32 %r903, %r816, 2, 31, -1; | |
mov.b32 %f847, %r903; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f848, %f846, %f847; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r904, %f848; | |
shfl.sync.bfly.b32 %r905, %r904, 1, 31, -1; | |
mov.b32 %f849, %r905; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %f850, %f848, %f849; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
mov.b32 %r819, %f850; | |
// begin inline asm | |
@%p52 st.shared.b32 [ %r823 + 0 ], %r819; | |
// end inline asm | |
bar.sync 0; | |
ld.shared.b32 %f851, [global_smem]; | |
$L__tmp4: | |
.loc 1 124 43 // layer_norm.py:124:43 | |
div.full.f32 %f852, %f851, %f1; | |
$L__tmp5: | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
bar.sync 0; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
fma.rn.f32 %f853, %f386, %f644, %f647; | |
fma.rn.f32 %f854, %f388, %f648, %f853; | |
fma.rn.f32 %f855, %f389, %f650, %f854; | |
fma.rn.f32 %f856, %f390, %f652, %f855; | |
fma.rn.f32 %f857, %f391, %f654, %f856; | |
fma.rn.f32 %f858, %f392, %f656, %f857; | |
fma.rn.f32 %f859, %f393, %f658, %f858; | |
fma.rn.f32 %f860, %f394, %f660, %f859; | |
fma.rn.f32 %f861, %f395, %f662, %f860; | |
fma.rn.f32 %f862, %f396, %f664, %f861; | |
fma.rn.f32 %f863, %f397, %f666, %f862; | |
fma.rn.f32 %f864, %f398, %f668, %f863; | |
fma.rn.f32 %f865, %f399, %f670, %f864; | |
fma.rn.f32 %f866, %f400, %f672, %f865; | |
fma.rn.f32 %f867, %f401, %f674, %f866; | |
fma.rn.f32 %f868, %f402, %f676, %f867; | |
fma.rn.f32 %f869, %f403, %f678, %f868; | |
fma.rn.f32 %f870, %f404, %f680, %f869; | |
fma.rn.f32 %f871, %f405, %f682, %f870; | |
fma.rn.f32 %f872, %f406, %f684, %f871; | |
fma.rn.f32 %f873, %f407, %f686, %f872; | |
fma.rn.f32 %f874, %f408, %f688, %f873; | |
fma.rn.f32 %f875, %f409, %f690, %f874; | |
fma.rn.f32 %f876, %f410, %f692, %f875; | |
fma.rn.f32 %f877, %f411, %f694, %f876; | |
fma.rn.f32 %f878, %f412, %f696, %f877; | |
fma.rn.f32 %f879, %f413, %f698, %f878; | |
fma.rn.f32 %f880, %f414, %f700, %f879; | |
fma.rn.f32 %f881, %f415, %f702, %f880; | |
fma.rn.f32 %f882, %f416, %f704, %f881; | |
fma.rn.f32 %f883, %f417, %f706, %f882; | |
fma.rn.f32 %f884, %f418, %f708, %f883; | |
fma.rn.f32 %f885, %f419, %f710, %f884; | |
fma.rn.f32 %f886, %f420, %f712, %f885; | |
fma.rn.f32 %f887, %f421, %f714, %f886; | |
fma.rn.f32 %f888, %f422, %f716, %f887; | |
fma.rn.f32 %f889, %f423, %f718, %f888; | |
fma.rn.f32 %f890, %f424, %f720, %f889; | |
fma.rn.f32 %f891, %f425, %f722, %f890; | |
fma.rn.f32 %f892, %f426, %f724, %f891; | |
fma.rn.f32 %f893, %f427, %f726, %f892; | |
fma.rn.f32 %f894, %f428, %f728, %f893; | |
fma.rn.f32 %f895, %f429, %f730, %f894; | |
fma.rn.f32 %f896, %f430, %f732, %f895; | |
fma.rn.f32 %f897, %f431, %f734, %f896; | |
fma.rn.f32 %f898, %f432, %f736, %f897; | |
fma.rn.f32 %f899, %f433, %f738, %f898; | |
fma.rn.f32 %f900, %f434, %f740, %f899; | |
fma.rn.f32 %f901, %f435, %f742, %f900; | |
fma.rn.f32 %f902, %f436, %f744, %f901; | |
fma.rn.f32 %f903, %f437, %f746, %f902; | |
fma.rn.f32 %f904, %f438, %f748, %f903; | |
fma.rn.f32 %f905, %f439, %f750, %f904; | |
fma.rn.f32 %f906, %f440, %f752, %f905; | |
fma.rn.f32 %f907, %f441, %f754, %f906; | |
fma.rn.f32 %f908, %f442, %f756, %f907; | |
fma.rn.f32 %f909, %f443, %f758, %f908; | |
fma.rn.f32 %f910, %f444, %f760, %f909; | |
fma.rn.f32 %f911, %f445, %f762, %f910; | |
fma.rn.f32 %f912, %f446, %f764, %f911; | |
fma.rn.f32 %f913, %f447, %f766, %f912; | |
fma.rn.f32 %f914, %f448, %f768, %f913; | |
fma.rn.f32 %f915, %f449, %f770, %f914; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r906, %f915; | |
shfl.sync.bfly.b32 %r907, %r906, 16, 31, -1; | |
mov.b32 %f916, %r907; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f917, %f915, %f916; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r908, %f917; | |
shfl.sync.bfly.b32 %r909, %r908, 8, 31, -1; | |
mov.b32 %f918, %r909; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f919, %f917, %f918; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r910, %f919; | |
shfl.sync.bfly.b32 %r911, %r910, 4, 31, -1; | |
mov.b32 %f920, %r911; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f921, %f919, %f920; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r912, %f921; | |
shfl.sync.bfly.b32 %r913, %r912, 2, 31, -1; | |
mov.b32 %f922, %r913; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f923, %f921, %f922; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r914, %f923; | |
shfl.sync.bfly.b32 %r915, %r914, 1, 31, -1; | |
mov.b32 %f924, %r915; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f925, %f923, %f924; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r821, %f925; | |
// begin inline asm | |
@%p50 st.shared.b32 [ %r814 + 0 ], %r821; | |
// end inline asm | |
bar.sync 0; | |
// begin inline asm | |
@%p51 ld.shared.b32 %r822, [ %r823 + 0 ]; | |
// end inline asm | |
mov.b32 %f926, %r822; | |
shfl.sync.bfly.b32 %r916, %r822, 2, 31, -1; | |
mov.b32 %f927, %r916; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f928, %f926, %f927; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r917, %f928; | |
shfl.sync.bfly.b32 %r918, %r917, 1, 31, -1; | |
mov.b32 %f929, %r918; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %f930, %f928, %f929; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
mov.b32 %r825, %f930; | |
// begin inline asm | |
@%p52 st.shared.b32 [ %r823 + 0 ], %r825; | |
// end inline asm | |
bar.sync 0; | |
ld.shared.b32 %f931, [global_smem]; | |
$L__tmp6: | |
.loc 1 125 35 // layer_norm.py:125:35 | |
div.full.f32 %f932, %f931, %f1; | |
.loc 1 126 34 // layer_norm.py:126:34 | |
fma.rn.f32 %f933, %f642, %f852, %f932; | |
fma.rn.f32 %f934, %f643, %f852, %f932; | |
fma.rn.f32 %f935, %f641, %f852, %f932; | |
fma.rn.f32 %f936, %f640, %f852, %f932; | |
fma.rn.f32 %f937, %f639, %f852, %f932; | |
fma.rn.f32 %f938, %f638, %f852, %f932; | |
fma.rn.f32 %f939, %f637, %f852, %f932; | |
fma.rn.f32 %f940, %f636, %f852, %f932; | |
fma.rn.f32 %f941, %f635, %f852, %f932; | |
fma.rn.f32 %f942, %f634, %f852, %f932; | |
fma.rn.f32 %f943, %f633, %f852, %f932; | |
fma.rn.f32 %f944, %f632, %f852, %f932; | |
fma.rn.f32 %f945, %f631, %f852, %f932; | |
fma.rn.f32 %f946, %f630, %f852, %f932; | |
fma.rn.f32 %f947, %f629, %f852, %f932; | |
fma.rn.f32 %f948, %f628, %f852, %f932; | |
fma.rn.f32 %f949, %f627, %f852, %f932; | |
fma.rn.f32 %f950, %f626, %f852, %f932; | |
fma.rn.f32 %f951, %f625, %f852, %f932; | |
fma.rn.f32 %f952, %f624, %f852, %f932; | |
fma.rn.f32 %f953, %f623, %f852, %f932; | |
fma.rn.f32 %f954, %f622, %f852, %f932; | |
fma.rn.f32 %f955, %f621, %f852, %f932; | |
fma.rn.f32 %f956, %f620, %f852, %f932; | |
fma.rn.f32 %f957, %f619, %f852, %f932; | |
fma.rn.f32 %f958, %f618, %f852, %f932; | |
fma.rn.f32 %f959, %f617, %f852, %f932; | |
fma.rn.f32 %f960, %f616, %f852, %f932; | |
fma.rn.f32 %f961, %f615, %f852, %f932; | |
fma.rn.f32 %f962, %f614, %f852, %f932; | |
fma.rn.f32 %f963, %f613, %f852, %f932; | |
fma.rn.f32 %f964, %f612, %f852, %f932; | |
fma.rn.f32 %f965, %f611, %f852, %f932; | |
fma.rn.f32 %f966, %f610, %f852, %f932; | |
fma.rn.f32 %f967, %f609, %f852, %f932; | |
fma.rn.f32 %f968, %f608, %f852, %f932; | |
fma.rn.f32 %f969, %f607, %f852, %f932; | |
fma.rn.f32 %f970, %f606, %f852, %f932; | |
fma.rn.f32 %f971, %f605, %f852, %f932; | |
fma.rn.f32 %f972, %f604, %f852, %f932; | |
fma.rn.f32 %f973, %f603, %f852, %f932; | |
fma.rn.f32 %f974, %f602, %f852, %f932; | |
fma.rn.f32 %f975, %f601, %f852, %f932; | |
fma.rn.f32 %f976, %f600, %f852, %f932; | |
fma.rn.f32 %f977, %f599, %f852, %f932; | |
fma.rn.f32 %f978, %f598, %f852, %f932; | |
fma.rn.f32 %f979, %f597, %f852, %f932; | |
fma.rn.f32 %f980, %f596, %f852, %f932; | |
fma.rn.f32 %f981, %f595, %f852, %f932; | |
fma.rn.f32 %f982, %f594, %f852, %f932; | |
fma.rn.f32 %f983, %f593, %f852, %f932; | |
fma.rn.f32 %f984, %f592, %f852, %f932; | |
fma.rn.f32 %f985, %f591, %f852, %f932; | |
fma.rn.f32 %f986, %f590, %f852, %f932; | |
fma.rn.f32 %f987, %f589, %f852, %f932; | |
fma.rn.f32 %f988, %f588, %f852, %f932; | |
fma.rn.f32 %f989, %f587, %f852, %f932; | |
fma.rn.f32 %f990, %f586, %f852, %f932; | |
fma.rn.f32 %f991, %f585, %f852, %f932; | |
fma.rn.f32 %f992, %f584, %f852, %f932; | |
fma.rn.f32 %f993, %f583, %f852, %f932; | |
fma.rn.f32 %f994, %f582, %f852, %f932; | |
fma.rn.f32 %f995, %f581, %f852, %f932; | |
fma.rn.f32 %f996, %f580, %f852, %f932; | |
.loc 1 126 21 // layer_norm.py:126:21 | |
neg.f32 %f997, %f933; | |
fma.rn.f32 %f998, %f386, %f644, %f997; | |
neg.f32 %f999, %f934; | |
fma.rn.f32 %f1000, %f387, %f646, %f999; | |
neg.f32 %f1001, %f935; | |
fma.rn.f32 %f1002, %f388, %f648, %f1001; | |
neg.f32 %f1003, %f936; | |
fma.rn.f32 %f1004, %f389, %f650, %f1003; | |
neg.f32 %f1005, %f937; | |
fma.rn.f32 %f1006, %f390, %f652, %f1005; | |
neg.f32 %f1007, %f938; | |
fma.rn.f32 %f1008, %f391, %f654, %f1007; | |
neg.f32 %f1009, %f939; | |
fma.rn.f32 %f1010, %f392, %f656, %f1009; | |
neg.f32 %f1011, %f940; | |
fma.rn.f32 %f1012, %f393, %f658, %f1011; | |
neg.f32 %f1013, %f941; | |
fma.rn.f32 %f1014, %f394, %f660, %f1013; | |
neg.f32 %f1015, %f942; | |
fma.rn.f32 %f1016, %f395, %f662, %f1015; | |
neg.f32 %f1017, %f943; | |
fma.rn.f32 %f1018, %f396, %f664, %f1017; | |
neg.f32 %f1019, %f944; | |
fma.rn.f32 %f1020, %f397, %f666, %f1019; | |
neg.f32 %f1021, %f945; | |
fma.rn.f32 %f1022, %f398, %f668, %f1021; | |
neg.f32 %f1023, %f946; | |
fma.rn.f32 %f1024, %f399, %f670, %f1023; | |
neg.f32 %f1025, %f947; | |
fma.rn.f32 %f1026, %f400, %f672, %f1025; | |
neg.f32 %f1027, %f948; | |
fma.rn.f32 %f1028, %f401, %f674, %f1027; | |
neg.f32 %f1029, %f949; | |
fma.rn.f32 %f1030, %f402, %f676, %f1029; | |
neg.f32 %f1031, %f950; | |
fma.rn.f32 %f1032, %f403, %f678, %f1031; | |
neg.f32 %f1033, %f951; | |
fma.rn.f32 %f1034, %f404, %f680, %f1033; | |
neg.f32 %f1035, %f952; | |
fma.rn.f32 %f1036, %f405, %f682, %f1035; | |
neg.f32 %f1037, %f953; | |
fma.rn.f32 %f1038, %f406, %f684, %f1037; | |
neg.f32 %f1039, %f954; | |
fma.rn.f32 %f1040, %f407, %f686, %f1039; | |
neg.f32 %f1041, %f955; | |
fma.rn.f32 %f1042, %f408, %f688, %f1041; | |
neg.f32 %f1043, %f956; | |
fma.rn.f32 %f1044, %f409, %f690, %f1043; | |
neg.f32 %f1045, %f957; | |
fma.rn.f32 %f1046, %f410, %f692, %f1045; | |
neg.f32 %f1047, %f958; | |
fma.rn.f32 %f1048, %f411, %f694, %f1047; | |
neg.f32 %f1049, %f959; | |
fma.rn.f32 %f1050, %f412, %f696, %f1049; | |
neg.f32 %f1051, %f960; | |
fma.rn.f32 %f1052, %f413, %f698, %f1051; | |
neg.f32 %f1053, %f961; | |
fma.rn.f32 %f1054, %f414, %f700, %f1053; | |
neg.f32 %f1055, %f962; | |
fma.rn.f32 %f1056, %f415, %f702, %f1055; | |
neg.f32 %f1057, %f963; | |
fma.rn.f32 %f1058, %f416, %f704, %f1057; | |
neg.f32 %f1059, %f964; | |
fma.rn.f32 %f1060, %f417, %f706, %f1059; | |
neg.f32 %f1061, %f965; | |
fma.rn.f32 %f1062, %f418, %f708, %f1061; | |
neg.f32 %f1063, %f966; | |
fma.rn.f32 %f1064, %f419, %f710, %f1063; | |
neg.f32 %f1065, %f967; | |
fma.rn.f32 %f1066, %f420, %f712, %f1065; | |
neg.f32 %f1067, %f968; | |
fma.rn.f32 %f1068, %f421, %f714, %f1067; | |
neg.f32 %f1069, %f969; | |
fma.rn.f32 %f1070, %f422, %f716, %f1069; | |
neg.f32 %f1071, %f970; | |
fma.rn.f32 %f1072, %f423, %f718, %f1071; | |
neg.f32 %f1073, %f971; | |
fma.rn.f32 %f1074, %f424, %f720, %f1073; | |
neg.f32 %f1075, %f972; | |
fma.rn.f32 %f1076, %f425, %f722, %f1075; | |
neg.f32 %f1077, %f973; | |
fma.rn.f32 %f1078, %f426, %f724, %f1077; | |
neg.f32 %f1079, %f974; | |
fma.rn.f32 %f1080, %f427, %f726, %f1079; | |
neg.f32 %f1081, %f975; | |
fma.rn.f32 %f1082, %f428, %f728, %f1081; | |
neg.f32 %f1083, %f976; | |
fma.rn.f32 %f1084, %f429, %f730, %f1083; | |
neg.f32 %f1085, %f977; | |
fma.rn.f32 %f1086, %f430, %f732, %f1085; | |
neg.f32 %f1087, %f978; | |
fma.rn.f32 %f1088, %f431, %f734, %f1087; | |
neg.f32 %f1089, %f979; | |
fma.rn.f32 %f1090, %f432, %f736, %f1089; | |
neg.f32 %f1091, %f980; | |
fma.rn.f32 %f1092, %f433, %f738, %f1091; | |
neg.f32 %f1093, %f981; | |
fma.rn.f32 %f1094, %f434, %f740, %f1093; | |
neg.f32 %f1095, %f982; | |
fma.rn.f32 %f1096, %f435, %f742, %f1095; | |
neg.f32 %f1097, %f983; | |
fma.rn.f32 %f1098, %f436, %f744, %f1097; | |
neg.f32 %f1099, %f984; | |
fma.rn.f32 %f1100, %f437, %f746, %f1099; | |
neg.f32 %f1101, %f985; | |
fma.rn.f32 %f1102, %f438, %f748, %f1101; | |
neg.f32 %f1103, %f986; | |
fma.rn.f32 %f1104, %f439, %f750, %f1103; | |
neg.f32 %f1105, %f987; | |
fma.rn.f32 %f1106, %f440, %f752, %f1105; | |
neg.f32 %f1107, %f988; | |
fma.rn.f32 %f1108, %f441, %f754, %f1107; | |
neg.f32 %f1109, %f989; | |
fma.rn.f32 %f1110, %f442, %f756, %f1109; | |
neg.f32 %f1111, %f990; | |
fma.rn.f32 %f1112, %f443, %f758, %f1111; | |
neg.f32 %f1113, %f991; | |
fma.rn.f32 %f1114, %f444, %f760, %f1113; | |
neg.f32 %f1115, %f992; | |
fma.rn.f32 %f1116, %f445, %f762, %f1115; | |
neg.f32 %f1117, %f993; | |
fma.rn.f32 %f1118, %f446, %f764, %f1117; | |
neg.f32 %f1119, %f994; | |
fma.rn.f32 %f1120, %f447, %f766, %f1119; | |
neg.f32 %f1121, %f995; | |
fma.rn.f32 %f1122, %f448, %f768, %f1121; | |
neg.f32 %f1123, %f996; | |
fma.rn.f32 %f1124, %f449, %f770, %f1123; | |
.loc 1 126 41 // layer_norm.py:126:41 | |
mul.f32 %f1125, %f998, %f450; | |
mul.f32 %f1126, %f1000, %f450; | |
mul.f32 %f1127, %f1002, %f450; | |
mul.f32 %f1128, %f1004, %f450; | |
mul.f32 %f1129, %f1006, %f450; | |
mul.f32 %f1130, %f1008, %f450; | |
mul.f32 %f1131, %f1010, %f450; | |
mul.f32 %f1132, %f1012, %f450; | |
mul.f32 %f1133, %f1014, %f450; | |
mul.f32 %f1134, %f1016, %f450; | |
mul.f32 %f1135, %f1018, %f450; | |
mul.f32 %f1136, %f1020, %f450; | |
mul.f32 %f1137, %f1022, %f450; | |
mul.f32 %f1138, %f1024, %f450; | |
mul.f32 %f1139, %f1026, %f450; | |
mul.f32 %f1140, %f1028, %f450; | |
mul.f32 %f1141, %f1030, %f450; | |
mul.f32 %f1142, %f1032, %f450; | |
mul.f32 %f1143, %f1034, %f450; | |
mul.f32 %f1144, %f1036, %f450; | |
mul.f32 %f1145, %f1038, %f450; | |
mul.f32 %f1146, %f1040, %f450; | |
mul.f32 %f1147, %f1042, %f450; | |
mul.f32 %f1148, %f1044, %f450; | |
mul.f32 %f1149, %f1046, %f450; | |
mul.f32 %f1150, %f1048, %f450; | |
mul.f32 %f1151, %f1050, %f450; | |
mul.f32 %f1152, %f1052, %f450; | |
mul.f32 %f1153, %f1054, %f450; | |
mul.f32 %f1154, %f1056, %f450; | |
mul.f32 %f1155, %f1058, %f450; | |
mul.f32 %f1156, %f1060, %f450; | |
mul.f32 %f1157, %f1062, %f450; | |
mul.f32 %f1158, %f1064, %f450; | |
mul.f32 %f1159, %f1066, %f450; | |
mul.f32 %f1160, %f1068, %f450; | |
mul.f32 %f1161, %f1070, %f450; | |
mul.f32 %f1162, %f1072, %f450; | |
mul.f32 %f1163, %f1074, %f450; | |
mul.f32 %f1164, %f1076, %f450; | |
mul.f32 %f1165, %f1078, %f450; | |
mul.f32 %f1166, %f1080, %f450; | |
mul.f32 %f1167, %f1082, %f450; | |
mul.f32 %f1168, %f1084, %f450; | |
mul.f32 %f1169, %f1086, %f450; | |
mul.f32 %f1170, %f1088, %f450; | |
mul.f32 %f1171, %f1090, %f450; | |
mul.f32 %f1172, %f1092, %f450; | |
mul.f32 %f1173, %f1094, %f450; | |
mul.f32 %f1174, %f1096, %f450; | |
mul.f32 %f1175, %f1098, %f450; | |
mul.f32 %f1176, %f1100, %f450; | |
mul.f32 %f1177, %f1102, %f450; | |
mul.f32 %f1178, %f1104, %f450; | |
mul.f32 %f1179, %f1106, %f450; | |
mul.f32 %f1180, %f1108, %f450; | |
mul.f32 %f1181, %f1110, %f450; | |
mul.f32 %f1182, %f1112, %f450; | |
mul.f32 %f1183, %f1114, %f450; | |
mul.f32 %f1184, %f1116, %f450; | |
mul.f32 %f1185, %f1118, %f450; | |
mul.f32 %f1186, %f1120, %f450; | |
mul.f32 %f1187, %f1122, %f450; | |
mul.f32 %f1188, %f1124, %f450; | |
.loc 1 127 32 // layer_norm.py:127:32 | |
mov.b32 %r826, %f1125; | |
mov.b32 %r827, %f1126; | |
mov.b32 %r828, %f1127; | |
mov.b32 %r829, %f1128; | |
// begin inline asm | |
@%p2 st.global.v4.b32 [ %rd106 + 0 ], { %r826, %r827, %r828, %r829 }; | |
// end inline asm | |
mov.b32 %r830, %f1129; | |
mov.b32 %r831, %f1130; | |
mov.b32 %r832, %f1131; | |
mov.b32 %r833, %f1132; | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd107 + 0 ], { %r830, %r831, %r832, %r833 }; | |
// end inline asm | |
mov.b32 %r834, %f1133; | |
mov.b32 %r835, %f1134; | |
mov.b32 %r836, %f1135; | |
mov.b32 %r837, %f1136; | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd108 + 0 ], { %r834, %r835, %r836, %r837 }; | |
// end inline asm | |
mov.b32 %r838, %f1137; | |
mov.b32 %r839, %f1138; | |
mov.b32 %r840, %f1139; | |
mov.b32 %r841, %f1140; | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd109 + 0 ], { %r838, %r839, %r840, %r841 }; | |
// end inline asm | |
mov.b32 %r842, %f1141; | |
mov.b32 %r843, %f1142; | |
mov.b32 %r844, %f1143; | |
mov.b32 %r845, %f1144; | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd110 + 0 ], { %r842, %r843, %r844, %r845 }; | |
// end inline asm | |
mov.b32 %r846, %f1145; | |
mov.b32 %r847, %f1146; | |
mov.b32 %r848, %f1147; | |
mov.b32 %r849, %f1148; | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd111 + 0 ], { %r846, %r847, %r848, %r849 }; | |
// end inline asm | |
mov.b32 %r850, %f1149; | |
mov.b32 %r851, %f1150; | |
mov.b32 %r852, %f1151; | |
mov.b32 %r853, %f1152; | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd112 + 0 ], { %r850, %r851, %r852, %r853 }; | |
// end inline asm | |
mov.b32 %r854, %f1153; | |
mov.b32 %r855, %f1154; | |
mov.b32 %r856, %f1155; | |
mov.b32 %r857, %f1156; | |
// begin inline asm | |
@%p9 st.global.v4.b32 [ %rd113 + 0 ], { %r854, %r855, %r856, %r857 }; | |
// end inline asm | |
mov.b32 %r858, %f1157; | |
mov.b32 %r859, %f1158; | |
mov.b32 %r860, %f1159; | |
mov.b32 %r861, %f1160; | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd114 + 0 ], { %r858, %r859, %r860, %r861 }; | |
// end inline asm | |
mov.b32 %r862, %f1161; | |
mov.b32 %r863, %f1162; | |
mov.b32 %r864, %f1163; | |
mov.b32 %r865, %f1164; | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd115 + 0 ], { %r862, %r863, %r864, %r865 }; | |
// end inline asm | |
mov.b32 %r866, %f1165; | |
mov.b32 %r867, %f1166; | |
mov.b32 %r868, %f1167; | |
mov.b32 %r869, %f1168; | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd116 + 0 ], { %r866, %r867, %r868, %r869 }; | |
// end inline asm | |
mov.b32 %r870, %f1169; | |
mov.b32 %r871, %f1170; | |
mov.b32 %r872, %f1171; | |
mov.b32 %r873, %f1172; | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd117 + 0 ], { %r870, %r871, %r872, %r873 }; | |
// end inline asm | |
mov.b32 %r874, %f1173; | |
mov.b32 %r875, %f1174; | |
mov.b32 %r876, %f1175; | |
mov.b32 %r877, %f1176; | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd118 + 0 ], { %r874, %r875, %r876, %r877 }; | |
// end inline asm | |
mov.b32 %r878, %f1177; | |
mov.b32 %r879, %f1178; | |
mov.b32 %r880, %f1179; | |
mov.b32 %r881, %f1180; | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd119 + 0 ], { %r878, %r879, %r880, %r881 }; | |
// end inline asm | |
mov.b32 %r882, %f1181; | |
mov.b32 %r883, %f1182; | |
mov.b32 %r884, %f1183; | |
mov.b32 %r885, %f1184; | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd120 + 0 ], { %r882, %r883, %r884, %r885 }; | |
// end inline asm | |
mov.b32 %r886, %f1185; | |
mov.b32 %r887, %f1186; | |
mov.b32 %r888, %f1187; | |
mov.b32 %r889, %f1188; | |
// begin inline asm | |
@%p17 st.global.v4.b32 [ %rd121 + 0 ], { %r886, %r887, %r888, %r889 }; | |
// end inline asm | |
.loc 1 129 18 // layer_norm.py:129:18 | |
fma.rn.f32 %f1243, %f589, %f752, %f1243; | |
fma.rn.f32 %f1242, %f590, %f750, %f1242; | |
fma.rn.f32 %f1241, %f591, %f748, %f1241; | |
fma.rn.f32 %f1240, %f592, %f746, %f1240; | |
fma.rn.f32 %f1239, %f593, %f744, %f1239; | |
fma.rn.f32 %f1238, %f594, %f742, %f1238; | |
fma.rn.f32 %f1237, %f595, %f740, %f1237; | |
fma.rn.f32 %f1236, %f596, %f738, %f1236; | |
fma.rn.f32 %f1235, %f597, %f736, %f1235; | |
fma.rn.f32 %f1234, %f598, %f734, %f1234; | |
fma.rn.f32 %f1233, %f599, %f732, %f1233; | |
fma.rn.f32 %f1232, %f600, %f730, %f1232; | |
fma.rn.f32 %f1231, %f601, %f728, %f1231; | |
fma.rn.f32 %f1230, %f602, %f726, %f1230; | |
fma.rn.f32 %f1229, %f603, %f724, %f1229; | |
fma.rn.f32 %f1228, %f604, %f722, %f1228; | |
fma.rn.f32 %f1227, %f605, %f720, %f1227; | |
fma.rn.f32 %f1226, %f606, %f718, %f1226; | |
fma.rn.f32 %f1225, %f607, %f716, %f1225; | |
fma.rn.f32 %f1224, %f608, %f714, %f1224; | |
fma.rn.f32 %f1223, %f609, %f712, %f1223; | |
fma.rn.f32 %f1222, %f610, %f710, %f1222; | |
fma.rn.f32 %f1221, %f611, %f708, %f1221; | |
fma.rn.f32 %f1220, %f612, %f706, %f1220; | |
fma.rn.f32 %f1219, %f613, %f704, %f1219; | |
fma.rn.f32 %f1218, %f614, %f702, %f1218; | |
fma.rn.f32 %f1217, %f615, %f700, %f1217; | |
fma.rn.f32 %f1216, %f616, %f698, %f1216; | |
fma.rn.f32 %f1215, %f617, %f696, %f1215; | |
fma.rn.f32 %f1214, %f618, %f694, %f1214; | |
fma.rn.f32 %f1213, %f619, %f692, %f1213; | |
fma.rn.f32 %f1212, %f620, %f690, %f1212; | |
fma.rn.f32 %f1211, %f621, %f688, %f1211; | |
fma.rn.f32 %f1210, %f622, %f686, %f1210; | |
fma.rn.f32 %f1209, %f623, %f684, %f1209; | |
fma.rn.f32 %f1208, %f624, %f682, %f1208; | |
fma.rn.f32 %f1207, %f625, %f680, %f1207; | |
fma.rn.f32 %f1206, %f626, %f678, %f1206; | |
fma.rn.f32 %f1205, %f627, %f676, %f1205; | |
fma.rn.f32 %f1204, %f628, %f674, %f1204; | |
fma.rn.f32 %f1203, %f629, %f672, %f1203; | |
fma.rn.f32 %f1202, %f630, %f670, %f1202; | |
fma.rn.f32 %f1201, %f631, %f668, %f1201; | |
fma.rn.f32 %f1200, %f632, %f666, %f1200; | |
fma.rn.f32 %f1199, %f633, %f664, %f1199; | |
fma.rn.f32 %f1198, %f634, %f662, %f1198; | |
fma.rn.f32 %f1197, %f635, %f660, %f1197; | |
fma.rn.f32 %f1196, %f636, %f658, %f1196; | |
fma.rn.f32 %f1195, %f637, %f656, %f1195; | |
fma.rn.f32 %f1194, %f638, %f654, %f1194; | |
fma.rn.f32 %f1193, %f639, %f652, %f1193; | |
fma.rn.f32 %f1192, %f640, %f650, %f1192; | |
fma.rn.f32 %f1191, %f641, %f648, %f1191; | |
fma.rn.f32 %f1190, %f643, %f646, %f1190; | |
fma.rn.f32 %f1189, %f642, %f644, %f1189; | |
fma.rn.f32 %f1244, %f588, %f754, %f1244; | |
fma.rn.f32 %f1245, %f587, %f756, %f1245; | |
fma.rn.f32 %f1246, %f586, %f758, %f1246; | |
fma.rn.f32 %f1247, %f585, %f760, %f1247; | |
fma.rn.f32 %f1248, %f584, %f762, %f1248; | |
fma.rn.f32 %f1249, %f583, %f764, %f1249; | |
fma.rn.f32 %f1250, %f582, %f766, %f1250; | |
fma.rn.f32 %f1251, %f581, %f768, %f1251; | |
fma.rn.f32 %f1252, %f580, %f770, %f1252; | |
add.f32 %f1253, %f1253, %f644; | |
add.f32 %f1254, %f1254, %f646; | |
add.f32 %f1255, %f1255, %f648; | |
add.f32 %f1256, %f1256, %f650; | |
add.f32 %f1257, %f1257, %f652; | |
add.f32 %f1258, %f1258, %f654; | |
add.f32 %f1259, %f1259, %f656; | |
add.f32 %f1260, %f1260, %f658; | |
add.f32 %f1261, %f1261, %f660; | |
add.f32 %f1262, %f1262, %f662; | |
add.f32 %f1263, %f1263, %f664; | |
add.f32 %f1264, %f1264, %f666; | |
add.f32 %f1265, %f1265, %f668; | |
add.f32 %f1266, %f1266, %f670; | |
add.f32 %f1267, %f1267, %f672; | |
add.f32 %f1268, %f1268, %f674; | |
add.f32 %f1269, %f1269, %f676; | |
add.f32 %f1270, %f1270, %f678; | |
add.f32 %f1271, %f1271, %f680; | |
add.f32 %f1272, %f1272, %f682; | |
add.f32 %f1273, %f1273, %f684; | |
add.f32 %f1274, %f1274, %f686; | |
add.f32 %f1275, %f1275, %f688; | |
add.f32 %f1276, %f1276, %f690; | |
add.f32 %f1277, %f1277, %f692; | |
add.f32 %f1278, %f1278, %f694; | |
add.f32 %f1279, %f1279, %f696; | |
add.f32 %f1280, %f1280, %f698; | |
add.f32 %f1281, %f1281, %f700; | |
add.f32 %f1282, %f1282, %f702; | |
add.f32 %f1283, %f1283, %f704; | |
add.f32 %f1284, %f1284, %f706; | |
add.f32 %f1285, %f1285, %f708; | |
add.f32 %f1286, %f1286, %f710; | |
add.f32 %f1287, %f1287, %f712; | |
add.f32 %f1288, %f1288, %f714; | |
add.f32 %f1289, %f1289, %f716; | |
add.f32 %f1290, %f1290, %f718; | |
add.f32 %f1291, %f1291, %f720; | |
add.f32 %f1292, %f1292, %f722; | |
add.f32 %f1293, %f1293, %f724; | |
add.f32 %f1294, %f1294, %f726; | |
add.f32 %f1295, %f1295, %f728; | |
add.f32 %f1296, %f1296, %f730; | |
add.f32 %f1297, %f1297, %f732; | |
add.f32 %f1298, %f1298, %f734; | |
add.f32 %f1299, %f1299, %f736; | |
add.f32 %f1300, %f1300, %f738; | |
add.f32 %f1301, %f1301, %f740; | |
add.f32 %f1302, %f1302, %f742; | |
add.f32 %f1303, %f1303, %f744; | |
add.f32 %f1304, %f1304, %f746; | |
add.f32 %f1305, %f1305, %f748; | |
add.f32 %f1306, %f1306, %f750; | |
add.f32 %f1307, %f1307, %f752; | |
add.f32 %f1308, %f1308, %f754; | |
add.f32 %f1309, %f1309, %f756; | |
add.f32 %f1310, %f1310, %f758; | |
add.f32 %f1311, %f1311, %f760; | |
add.f32 %f1312, %f1312, %f762; | |
add.f32 %f1313, %f1313, %f764; | |
add.f32 %f1314, %f1314, %f766; | |
add.f32 %f1315, %f1315, %f768; | |
add.f32 %f1316, %f1316, %f770; | |
.loc 1 133 20 // layer_norm.py:133:20 | |
add.s64 %rd167, %rd167, 4; | |
.loc 1 134 20 // layer_norm.py:134:20 | |
add.s64 %rd166, %rd166, 4; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
add.s64 %rd165, %rd165, %rd26; | |
add.s64 %rd164, %rd164, %rd29; | |
add.s64 %rd163, %rd163, %rd30; | |
add.s32 %r1055, %r1055, -1; | |
setp.ne.s32 %p72, %r1055, 0; | |
@%p72 bra $L__BB0_2; | |
// %bb.3: // %._crit_edge.loopexit | |
.loc 1 138 55 // layer_norm.py:138:55 | |
mov.b32 %r1056, %f1189; | |
mov.b32 %r1057, %f1190; | |
mov.b32 %r1058, %f1191; | |
mov.b32 %r1059, %f1192; | |
mov.b32 %r1060, %f1193; | |
mov.b32 %r1061, %f1194; | |
mov.b32 %r1062, %f1195; | |
mov.b32 %r1063, %f1196; | |
mov.b32 %r1064, %f1197; | |
mov.b32 %r1065, %f1198; | |
mov.b32 %r1066, %f1199; | |
mov.b32 %r1067, %f1200; | |
mov.b32 %r1068, %f1201; | |
mov.b32 %r1069, %f1202; | |
mov.b32 %r1070, %f1203; | |
mov.b32 %r1071, %f1204; | |
mov.b32 %r1072, %f1205; | |
mov.b32 %r1073, %f1206; | |
mov.b32 %r1074, %f1207; | |
mov.b32 %r1075, %f1208; | |
mov.b32 %r1076, %f1209; | |
mov.b32 %r1077, %f1210; | |
mov.b32 %r1078, %f1211; | |
mov.b32 %r1079, %f1212; | |
mov.b32 %r1080, %f1213; | |
mov.b32 %r1081, %f1214; | |
mov.b32 %r1082, %f1215; | |
mov.b32 %r1083, %f1216; | |
mov.b32 %r1084, %f1217; | |
mov.b32 %r1085, %f1218; | |
mov.b32 %r1086, %f1219; | |
mov.b32 %r1087, %f1220; | |
mov.b32 %r1088, %f1221; | |
mov.b32 %r1089, %f1222; | |
mov.b32 %r1090, %f1223; | |
mov.b32 %r1091, %f1224; | |
mov.b32 %r1092, %f1225; | |
mov.b32 %r1093, %f1226; | |
mov.b32 %r1094, %f1227; | |
mov.b32 %r1095, %f1228; | |
mov.b32 %r1096, %f1229; | |
mov.b32 %r1097, %f1230; | |
mov.b32 %r1098, %f1231; | |
mov.b32 %r1099, %f1232; | |
mov.b32 %r1100, %f1233; | |
mov.b32 %r1101, %f1234; | |
mov.b32 %r1102, %f1235; | |
mov.b32 %r1103, %f1236; | |
mov.b32 %r1104, %f1237; | |
mov.b32 %r1105, %f1238; | |
mov.b32 %r1106, %f1239; | |
mov.b32 %r1107, %f1240; | |
mov.b32 %r1108, %f1241; | |
mov.b32 %r1109, %f1242; | |
mov.b32 %r1110, %f1243; | |
mov.b32 %r1111, %f1244; | |
mov.b32 %r1112, %f1245; | |
mov.b32 %r1113, %f1246; | |
mov.b32 %r1114, %f1247; | |
mov.b32 %r1115, %f1248; | |
mov.b32 %r1116, %f1249; | |
mov.b32 %r1117, %f1250; | |
mov.b32 %r1118, %f1251; | |
mov.b32 %r1119, %f1252; | |
mov.b32 %r1120, %f1253; | |
mov.b32 %r1121, %f1254; | |
mov.b32 %r1122, %f1255; | |
mov.b32 %r1123, %f1256; | |
mov.b32 %r1124, %f1257; | |
mov.b32 %r1125, %f1258; | |
mov.b32 %r1126, %f1259; | |
mov.b32 %r1127, %f1260; | |
mov.b32 %r1128, %f1261; | |
mov.b32 %r1129, %f1262; | |
mov.b32 %r1130, %f1263; | |
mov.b32 %r1131, %f1264; | |
mov.b32 %r1132, %f1265; | |
mov.b32 %r1133, %f1266; | |
mov.b32 %r1134, %f1267; | |
mov.b32 %r1135, %f1268; | |
mov.b32 %r1136, %f1269; | |
mov.b32 %r1137, %f1270; | |
mov.b32 %r1138, %f1271; | |
mov.b32 %r1139, %f1272; | |
mov.b32 %r1140, %f1273; | |
mov.b32 %r1141, %f1274; | |
mov.b32 %r1142, %f1275; | |
mov.b32 %r1143, %f1276; | |
mov.b32 %r1144, %f1277; | |
mov.b32 %r1145, %f1278; | |
mov.b32 %r1146, %f1279; | |
mov.b32 %r1147, %f1280; | |
mov.b32 %r1148, %f1281; | |
mov.b32 %r1149, %f1282; | |
mov.b32 %r1150, %f1283; | |
mov.b32 %r1151, %f1284; | |
mov.b32 %r1152, %f1285; | |
mov.b32 %r1153, %f1286; | |
mov.b32 %r1154, %f1287; | |
mov.b32 %r1155, %f1288; | |
mov.b32 %r1156, %f1289; | |
mov.b32 %r1157, %f1290; | |
mov.b32 %r1158, %f1291; | |
mov.b32 %r1159, %f1292; | |
mov.b32 %r1160, %f1293; | |
mov.b32 %r1161, %f1294; | |
mov.b32 %r1162, %f1295; | |
mov.b32 %r1163, %f1296; | |
mov.b32 %r1164, %f1297; | |
mov.b32 %r1165, %f1298; | |
mov.b32 %r1166, %f1299; | |
mov.b32 %r1167, %f1300; | |
mov.b32 %r1168, %f1301; | |
mov.b32 %r1169, %f1302; | |
mov.b32 %r1170, %f1303; | |
mov.b32 %r1171, %f1304; | |
mov.b32 %r1172, %f1305; | |
mov.b32 %r1173, %f1306; | |
mov.b32 %r1174, %f1307; | |
mov.b32 %r1175, %f1308; | |
mov.b32 %r1176, %f1309; | |
mov.b32 %r1177, %f1310; | |
mov.b32 %r1178, %f1311; | |
mov.b32 %r1179, %f1312; | |
mov.b32 %r1180, %f1313; | |
mov.b32 %r1181, %f1314; | |
mov.b32 %r1182, %f1315; | |
mov.b32 %r1183, %f1316; | |
$L__BB0_4: // %._crit_edge | |
.loc 1 104 18 // layer_norm.py:104:18 | |
setp.lt.s32 %p88, %r1052, %r285; | |
setp.lt.s32 %p80, %r1053, %r285; | |
setp.lt.s32 %p73, %r1054, %r285; | |
.loc 1 138 37 // layer_norm.py:138:37 | |
mul.lo.s32 %r1050, %r282, %r1; | |
.loc 1 138 22 // layer_norm.py:138:22 | |
mul.wide.s32 %rd154, %r1050, 4; | |
add.s64 %rd155, %rd45, %rd154; | |
.loc 1 138 49 // layer_norm.py:138:49 | |
shl.b64 %rd156, %rd1, 2; | |
add.s64 %rd122, %rd155, %rd156; | |
add.s64 %rd123, %rd122, 2048; | |
add.s64 %rd124, %rd122, 4096; | |
add.s64 %rd125, %rd122, 6144; | |
add.s64 %rd126, %rd122, 8192; | |
add.s64 %rd127, %rd122, 10240; | |
add.s64 %rd128, %rd122, 12288; | |
add.s64 %rd129, %rd155, %rd162; | |
add.s64 %rd130, %rd122, 16384; | |
add.s64 %rd131, %rd122, 18432; | |
add.s64 %rd132, %rd122, 20480; | |
add.s64 %rd133, %rd122, 22528; | |
add.s64 %rd134, %rd122, 24576; | |
add.s64 %rd135, %rd122, 26624; | |
add.s64 %rd136, %rd122, 28672; | |
add.s64 %rd137, %rd155, %rd161; | |
.loc 1 138 55 // layer_norm.py:138:55 | |
// begin inline asm | |
@%p73 st.global.v4.b32 [ %rd122 + 0 ], { %r1056, %r1057, %r1058, %r1059 }; | |
// end inline asm | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd123 + 0 ], { %r1060, %r1061, %r1062, %r1063 }; | |
// end inline asm | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd124 + 0 ], { %r1064, %r1065, %r1066, %r1067 }; | |
// end inline asm | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd125 + 0 ], { %r1068, %r1069, %r1070, %r1071 }; | |
// end inline asm | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd126 + 0 ], { %r1072, %r1073, %r1074, %r1075 }; | |
// end inline asm | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd127 + 0 ], { %r1076, %r1077, %r1078, %r1079 }; | |
// end inline asm | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd128 + 0 ], { %r1080, %r1081, %r1082, %r1083 }; | |
// end inline asm | |
// begin inline asm | |
@%p80 st.global.v4.b32 [ %rd129 + 0 ], { %r1084, %r1085, %r1086, %r1087 }; | |
// end inline asm | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd130 + 0 ], { %r1088, %r1089, %r1090, %r1091 }; | |
// end inline asm | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd131 + 0 ], { %r1092, %r1093, %r1094, %r1095 }; | |
// end inline asm | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd132 + 0 ], { %r1096, %r1097, %r1098, %r1099 }; | |
// end inline asm | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd133 + 0 ], { %r1100, %r1101, %r1102, %r1103 }; | |
// end inline asm | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd134 + 0 ], { %r1104, %r1105, %r1106, %r1107 }; | |
// end inline asm | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd135 + 0 ], { %r1108, %r1109, %r1110, %r1111 }; | |
// end inline asm | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd136 + 0 ], { %r1112, %r1113, %r1114, %r1115 }; | |
// end inline asm | |
// begin inline asm | |
@%p88 st.global.v4.b32 [ %rd137 + 0 ], { %r1116, %r1117, %r1118, %r1119 }; | |
// end inline asm | |
.loc 1 139 37 // layer_norm.py:139:37 | |
mul.lo.s32 %r1051, %r283, %r1; | |
.loc 1 139 22 // layer_norm.py:139:22 | |
mul.wide.s32 %rd159, %r1051, 4; | |
add.s64 %rd160, %rd46, %rd159; | |
.loc 1 139 49 // layer_norm.py:139:49 | |
add.s64 %rd138, %rd160, %rd156; | |
add.s64 %rd139, %rd138, 2048; | |
add.s64 %rd140, %rd138, 4096; | |
add.s64 %rd141, %rd138, 6144; | |
add.s64 %rd142, %rd138, 8192; | |
add.s64 %rd143, %rd138, 10240; | |
add.s64 %rd144, %rd138, 12288; | |
add.s64 %rd145, %rd160, %rd162; | |
add.s64 %rd146, %rd138, 16384; | |
add.s64 %rd147, %rd138, 18432; | |
add.s64 %rd148, %rd138, 20480; | |
add.s64 %rd149, %rd138, 22528; | |
add.s64 %rd150, %rd138, 24576; | |
add.s64 %rd151, %rd138, 26624; | |
add.s64 %rd152, %rd138, 28672; | |
add.s64 %rd153, %rd160, %rd161; | |
.loc 1 139 55 // layer_norm.py:139:55 | |
// begin inline asm | |
@%p73 st.global.v4.b32 [ %rd138 + 0 ], { %r1120, %r1121, %r1122, %r1123 }; | |
// end inline asm | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd139 + 0 ], { %r1124, %r1125, %r1126, %r1127 }; | |
// end inline asm | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd140 + 0 ], { %r1128, %r1129, %r1130, %r1131 }; | |
// end inline asm | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd141 + 0 ], { %r1132, %r1133, %r1134, %r1135 }; | |
// end inline asm | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd142 + 0 ], { %r1136, %r1137, %r1138, %r1139 }; | |
// end inline asm | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd143 + 0 ], { %r1140, %r1141, %r1142, %r1143 }; | |
// end inline asm | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd144 + 0 ], { %r1144, %r1145, %r1146, %r1147 }; | |
// end inline asm | |
// begin inline asm | |
@%p80 st.global.v4.b32 [ %rd145 + 0 ], { %r1148, %r1149, %r1150, %r1151 }; | |
// end inline asm | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd146 + 0 ], { %r1152, %r1153, %r1154, %r1155 }; | |
// end inline asm | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd147 + 0 ], { %r1156, %r1157, %r1158, %r1159 }; | |
// end inline asm | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd148 + 0 ], { %r1160, %r1161, %r1162, %r1163 }; | |
// end inline asm | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd149 + 0 ], { %r1164, %r1165, %r1166, %r1167 }; | |
// end inline asm | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd150 + 0 ], { %r1168, %r1169, %r1170, %r1171 }; | |
// end inline asm | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd151 + 0 ], { %r1172, %r1173, %r1174, %r1175 }; | |
// end inline asm | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd152 + 0 ], { %r1176, %r1177, %r1178, %r1179 }; | |
// end inline asm | |
// begin inline asm | |
@%p88 st.global.v4.b32 [ %rd153 + 0 ], { %r1180, %r1181, %r1182, %r1183 }; | |
// end inline asm | |
.loc 1 139 4 // layer_norm.py:139:4 | |
ret; | |
$L__tmp7: | |
$L__func_end0: | |
// -- End function | |
} | |
.file 1 "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops/layer_norm.py" | |
.file 2 "/home/dberard/local/pytorch-env7/triton/python/triton/language/standard.py" | |
// DWARF abbreviation table. Each declaration below is laid out as:
//   abbreviation code, tag, has-children flag, then (attribute, form) pairs,
//   closed by a pair of zero bytes (EOM(1)/EOM(2)).
// Four abbreviations (codes 1-4) are declared; the entries in .debug_info
// select one of them by its code ("Abbrev [n]" in that section's comments).
.section .debug_abbrev | |
{ | |
// Code 1: compile unit, has children. Attributes: producer, language, name,
// statement-list (line table) offset, and compilation directory.
.b8 1 // Abbreviation Code | |
.b8 17 // DW_TAG_compile_unit | |
.b8 1 // DW_CHILDREN_yes | |
.b8 37 // DW_AT_producer | |
.b8 8 // DW_FORM_string | |
.b8 19 // DW_AT_language | |
.b8 5 // DW_FORM_data2 | |
.b8 3 // DW_AT_name | |
.b8 8 // DW_FORM_string | |
.b8 16 // DW_AT_stmt_list | |
.b8 6 // DW_FORM_data4 | |
.b8 27 // DW_AT_comp_dir | |
.b8 8 // DW_FORM_string | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// Code 2: subprogram without children. Carries only a name and a one-byte
// DW_AT_inline value; it has no address-range attributes.
.b8 2 // Abbreviation Code | |
.b8 46 // DW_TAG_subprogram | |
.b8 0 // DW_CHILDREN_no | |
.b8 3 // DW_AT_name | |
.b8 8 // DW_FORM_string | |
.b8 32 // DW_AT_inline | |
.b8 11 // DW_FORM_data1 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// Code 3: subprogram with children. Carries a low_pc/high_pc address range
// and a 4-byte reference (abstract_origin) to another entry.
.b8 3 // Abbreviation Code | |
.b8 46 // DW_TAG_subprogram | |
.b8 1 // DW_CHILDREN_yes | |
.b8 17 // DW_AT_low_pc | |
.b8 1 // DW_FORM_addr | |
.b8 18 // DW_AT_high_pc | |
.b8 1 // DW_FORM_addr | |
.b8 49 // DW_AT_abstract_origin | |
.b8 19 // DW_FORM_ref4 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// Code 4: inlined subroutine without children. Carries an abstract_origin
// reference, an address range, and the call site's file/line/column, each of
// the latter three stored in a single byte (DW_FORM_data1).
.b8 4 // Abbreviation Code | |
.b8 29 // DW_TAG_inlined_subroutine | |
.b8 0 // DW_CHILDREN_no | |
.b8 49 // DW_AT_abstract_origin | |
.b8 19 // DW_FORM_ref4 | |
.b8 17 // DW_AT_low_pc | |
.b8 1 // DW_FORM_addr | |
.b8 18 // DW_AT_high_pc | |
.b8 1 // DW_FORM_addr | |
.b8 88 // DW_AT_call_file | |
.b8 11 // DW_FORM_data1 | |
.b8 89 // DW_AT_call_line | |
.b8 11 // DW_FORM_data1 | |
.b8 87 // DW_AT_call_column | |
.b8 11 // DW_FORM_data1 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// The extra zero byte after the last declaration ends the table itself.
.b8 0 // EOM(3) | |
} | |
// DWARF debugging-information entries for this module: one compile unit that
// contains two subprogram entries, the second of which has two
// inlined-subroutine children. Strings are emitted one byte per .b8 as
// NUL-terminated ASCII; the decoded text is given in the comments below.
.section .debug_info | |
{ | |
// Unit header. The compile-unit entry starts at offset 0xb and is 0xc4 bytes
// long, so the section is 207 bytes in total: the 4-byte length field plus
// the 203 bytes it announces.
.b32 203 // Length of Unit | |
.b8 2 // DWARF version number | |
.b8 0 | |
.b32 .debug_abbrev // Offset Into Abbrev. Section | |
.b8 8 // Address Size (in bytes) | |
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit | |
// Producer string: "triton".
.b8 116 // DW_AT_producer | |
.b8 114 | |
.b8 105 | |
.b8 116 | |
.b8 111 | |
.b8 110 | |
.b8 0 | |
// Two-byte language code, bytes 2 then 0 (value 2 when read low byte first;
// presumably DW_LANG_C in the DWARF constants - confirm against the spec).
.b8 2 // DW_AT_language | |
.b8 0 | |
// Compile-unit name string: "layer_norm.py".
.b8 108 // DW_AT_name | |
.b8 97 | |
.b8 121 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 110 | |
.b8 111 | |
.b8 114 | |
.b8 109 | |
.b8 46 | |
.b8 112 | |
.b8 121 | |
.b8 0 | |
.b32 .debug_line // DW_AT_stmt_list | |
// Compilation directory string:
// "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops"
// (the directory part of the path given by the ".file 1" directive).
.b8 47 // DW_AT_comp_dir | |
.b8 104 | |
.b8 111 | |
.b8 109 | |
.b8 101 | |
.b8 47 | |
.b8 100 | |
.b8 98 | |
.b8 101 | |
.b8 114 | |
.b8 97 | |
.b8 114 | |
.b8 100 | |
.b8 47 | |
.b8 108 | |
.b8 111 | |
.b8 99 | |
.b8 97 | |
.b8 108 | |
.b8 47 | |
.b8 112 | |
.b8 121 | |
.b8 116 | |
.b8 111 | |
.b8 114 | |
.b8 99 | |
.b8 104 | |
.b8 45 | |
.b8 101 | |
.b8 110 | |
.b8 118 | |
.b8 55 | |
.b8 47 | |
.b8 76 | |
.b8 105 | |
.b8 103 | |
.b8 101 | |
.b8 114 | |
.b8 45 | |
.b8 75 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 47 | |
.b8 115 | |
.b8 114 | |
.b8 99 | |
.b8 47 | |
.b8 108 | |
.b8 105 | |
.b8 103 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 107 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 47 | |
.b8 111 | |
.b8 112 | |
.b8 115 | |
.b8 0 | |
// Subprogram entry at offset 0x6a (decimal 106): name and DW_AT_inline only.
// The DW_AT_abstract_origin values of 106 further down point back to it.
.b8 2 // Abbrev [2] 0x6a:0x1e DW_TAG_subprogram | |
// Subprogram name string: "_layer_norm_backward_kernel".
.b8 95 // DW_AT_name | |
.b8 108 | |
.b8 97 | |
.b8 121 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 110 | |
.b8 111 | |
.b8 114 | |
.b8 109 | |
.b8 95 | |
.b8 98 | |
.b8 97 | |
.b8 99 | |
.b8 107 | |
.b8 119 | |
.b8 97 | |
.b8 114 | |
.b8 100 | |
.b8 95 | |
.b8 107 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 0 | |
.b8 1 // DW_AT_inline | |
// Subprogram entry carrying the address range of the emitted function
// ($L__func_begin0 .. $L__func_end0); its origin is the entry at offset 106.
.b8 3 // Abbrev [3] 0x88:0x46 DW_TAG_subprogram | |
.b64 $L__func_begin0 // DW_AT_low_pc | |
.b64 $L__func_end0 // DW_AT_high_pc | |
.b32 106 // DW_AT_abstract_origin | |
// First inlined range: $L__tmp1 .. $L__tmp4, call site at file 1, line 124,
// column 20. Its origin is recorded as offset 106 as well, i.e. the same
// entry the enclosing subprogram references.
.b8 4 // Abbrev [4] 0x9d:0x18 DW_TAG_inlined_subroutine | |
.b32 106 // DW_AT_abstract_origin | |
.b64 $L__tmp1 // DW_AT_low_pc | |
.b64 $L__tmp4 // DW_AT_high_pc | |
.b8 1 // DW_AT_call_file | |
.b8 124 // DW_AT_call_line | |
.b8 20 // DW_AT_call_column | |
// Second inlined range: $L__tmp5 .. $L__tmp6, call site at file 1, line 125,
// column 20, again with origin offset 106.
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine | |
.b32 106 // DW_AT_abstract_origin | |
.b64 $L__tmp5 // DW_AT_low_pc | |
.b64 $L__tmp6 // DW_AT_high_pc | |
.b8 1 // DW_AT_call_file | |
.b8 125 // DW_AT_call_line | |
.b8 20 // DW_AT_call_column | |
// The first mark closes the children of the Abbrev [3] subprogram, the second
// closes the children of the compile unit.
.b8 0 // End Of Children Mark | |
.b8 0 // End Of Children Mark | |
} | |
// Macro-information section: declared with an empty body, so no entries are emitted.
.section .debug_macinfo { } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment