Created
August 17, 2024 16:22
-
-
Save Jokeren/8e5f8fe123fe7c8a40bbe9ae99bc165b to your computer and use it in GitHub Desktop.
AMD vec problem
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Module header: identifies the module, names the target, and declares the
; scratch buffer that @flip_kernel uses to exchange data between work-items.
; ModuleID = 'LLVMDialectModule' | |
source_filename = "LLVMDialectModule" | |
; Target triple: amdgcn architecture, amd vendor, amdhsa OS.
target triple = "amdgcn-amd-amdhsa" | |
; Externally defined, zero-length byte array in addrspace(3), 16-byte aligned.
; Its real size is not visible in this module; every access below is a GEP off
; this symbol.
; NOTE(review): addrspace(3) is presumably the AMDGPU LDS/"local" address space
; -- confirm against the AMDGPU usage guide.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 | |
; flip_kernel(%0, %1): per work-item, loads 8 i32 values from %0 (two
; <4 x i32> vectors, 2048 elements apart), re-distributes them through
; @global_smem, runs a sequence of "select value-or-zero, then add" stages
; (five ds.swizzle lane exchanges followed by three in-register stages), and
; writes 8 i32 results to %1 at the same element indices that were read.
;   %0 - input pointer, addrspace(1), readonly, nocapture
;   %1 - output pointer, addrspace(1), writeonly, nocapture
; The debug metadata attributes the body to test_standard.py line 67, with
; inlined locations in triton/language/standard.py (lines 420-427, 256, 267).
; Every fence/barrier triple below is "release fence; s.barrier; acquire fence"
; at syncscope("workgroup"); the order of these relative to the @global_smem
; stores/loads must not be changed.
; Function Attrs: mustprogress nofree norecurse nounwind willreturn | |
define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 { | |
; ---- Index computation -------------------------------------------------
; %3 = work-item id in x ("tid" in the comments below).
%3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7 | |
; %4 = tid * 4; %5 keeps bits 8..10 of that (mask 1792 = 0x700);
; %6 = %5 + 2048 (second half of the element range).
%4 = shl i32 %3, 2, !dbg !8 | |
%5 = and i32 %4, 1792, !dbg !8 | |
%6 = or disjoint i32 %5, 2048, !dbg !8 | |
; %7 = tid bit 0 (used later for the predicate and for a flag offset).
%7 = and i32 %3, 1, !dbg !9 | |
; %8 = bits 2..7 of tid*4 (mask 252); %9..%11 = %8 + 1, 2, 3.
%8 = and i32 %4, 252, !dbg !9 | |
%9 = or disjoint i32 %8, 1, !dbg !9 | |
%10 = or disjoint i32 %8, 2, !dbg !9 | |
%11 = or disjoint i32 %8, 3, !dbg !9 | |
; %12 = (tid*4) & 2044 is the base element index of the first group of four;
; %13..%15 are the next three elements. %16..%19 are the same four indices
; offset by 2048 (second group).
%12 = and i32 %4, 2044, !dbg !10 | |
%13 = or disjoint i32 %9, %5, !dbg !10 | |
%14 = or disjoint i32 %10, %5, !dbg !10 | |
%15 = or disjoint i32 %11, %5, !dbg !10 | |
%16 = or disjoint i32 %6, %8, !dbg !10 | |
%17 = or disjoint i32 %9, %6, !dbg !10 | |
%18 = or disjoint i32 %10, %6, !dbg !10 | |
%19 = or disjoint i32 %11, %6, !dbg !10 | |
; ---- Input loads -------------------------------------------------------
; Element pointers into %0 for the two groups (indices %12 and %16).
%20 = zext nneg i32 %12 to i64, !dbg !11 | |
%21 = getelementptr i32, ptr addrspace(1) %0, i64 %20, !dbg !11 | |
%22 = zext nneg i32 %16 to i64, !dbg !11 | |
%23 = getelementptr i32, ptr addrspace(1) %0, i64 %22, !dbg !11 | |
; Each pointer is cast from addrspace(1) to the default address space and a
; 16-byte-aligned <4 x i32> is loaded through it: %25 (first group) and %27
; (second group).
%24 = addrspacecast ptr addrspace(1) %21 to ptr, !dbg !12 | |
%25 = load <4 x i32>, ptr %24, align 16, !dbg !12 | |
%26 = addrspacecast ptr addrspace(1) %23 to ptr, !dbg !12 | |
%27 = load <4 x i32>, ptr %26, align 16, !dbg !12 | |
; ---- Exchange of the first vector through @global_smem -----------------
; %28 = tid * 8. %30 = ((tid*8) & 248) | %5 is the read-side index (used after
; the barrier); %31 = (tid*8) & 4088 is the write-side i32 index.
%28 = shl i32 %3, 3, !dbg !13 | |
%29 = and i32 %28, 248, !dbg !13 | |
%30 = or disjoint i32 %29, %5, !dbg !13 | |
%31 = and i32 %28, 4088, !dbg !13 | |
%32 = zext nneg i32 %31 to i64, !dbg !13 | |
%33 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %32, !dbg !13 | |
; Lanes 0..1 of %25 go to i32 index %31; lanes 2..3 go to i32 index %31 + 4.
; Both stores are <2 x i32> with align 16.
%34 = shufflevector <4 x i32> %25, <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg !13 | |
store <2 x i32> %34, ptr addrspace(3) %33, align 16, !dbg !13 | |
%35 = or disjoint i32 %31, 4, !dbg !13 | |
%36 = zext nneg i32 %35 to i64, !dbg !13 | |
%37 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %36, !dbg !13 | |
%38 = shufflevector <4 x i32> %25, <4 x i32> poison, <2 x i32> <i32 2, i32 3>, !dbg !13 | |
store <2 x i32> %38, ptr addrspace(3) %37, align 16, !dbg !13 | |
; Workgroup barrier: make the stores above visible before reading back.
fence syncscope("workgroup") release, !dbg !13 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !13 | |
fence syncscope("workgroup") acquire, !dbg !13 | |
; Read back eight scalars at i32 indices %39 + {0, 4, 8, 12, 1, 5, 9, 13},
; where %39 = %30 * 2. Results: %42, %46, %50, %54, %58, %62, %66, %70.
; The pointers %41/%45/%49/%53/%57/%61/%65/%69 are reused for the second
; vector below.
%39 = shl nuw nsw i32 %30, 1, !dbg !13 | |
%40 = zext nneg i32 %39 to i64, !dbg !13 | |
%41 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %40, !dbg !13 | |
%42 = load i32, ptr addrspace(3) %41, align 16, !dbg !13 | |
%43 = or disjoint i32 %39, 4, !dbg !13 | |
%44 = zext nneg i32 %43 to i64, !dbg !13 | |
%45 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %44, !dbg !13 | |
%46 = load i32, ptr addrspace(3) %45, align 16, !dbg !13 | |
%47 = or disjoint i32 %39, 8, !dbg !13 | |
%48 = zext nneg i32 %47 to i64, !dbg !13 | |
%49 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %48, !dbg !13 | |
%50 = load i32, ptr addrspace(3) %49, align 16, !dbg !13 | |
%51 = or disjoint i32 %39, 12, !dbg !13 | |
%52 = zext nneg i32 %51 to i64, !dbg !13 | |
%53 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %52, !dbg !13 | |
%54 = load i32, ptr addrspace(3) %53, align 16, !dbg !13 | |
%55 = or disjoint i32 %39, 1, !dbg !13 | |
%56 = zext nneg i32 %55 to i64, !dbg !13 | |
%57 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %56, !dbg !13 | |
%58 = load i32, ptr addrspace(3) %57, align 4, !dbg !13 | |
%59 = or disjoint i32 %39, 5, !dbg !13 | |
%60 = zext nneg i32 %59 to i64, !dbg !13 | |
%61 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %60, !dbg !13 | |
%62 = load i32, ptr addrspace(3) %61, align 4, !dbg !13 | |
%63 = or disjoint i32 %39, 9, !dbg !13 | |
%64 = zext nneg i32 %63 to i64, !dbg !13 | |
%65 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %64, !dbg !13 | |
%66 = load i32, ptr addrspace(3) %65, align 4, !dbg !13 | |
%67 = or disjoint i32 %39, 13, !dbg !13 | |
%68 = zext nneg i32 %67 to i64, !dbg !13 | |
%69 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %68, !dbg !13 | |
%70 = load i32, ptr addrspace(3) %69, align 4, !dbg !13 | |
; Barrier before the same @global_smem slots are overwritten.
fence syncscope("workgroup") release, !dbg !13 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !13 | |
fence syncscope("workgroup") acquire, !dbg !13 | |
; ---- Exchange of the second vector (%27) through the same slots --------
%71 = shufflevector <4 x i32> %27, <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg !13 | |
store <2 x i32> %71, ptr addrspace(3) %33, align 16, !dbg !13 | |
%72 = shufflevector <4 x i32> %27, <4 x i32> poison, <2 x i32> <i32 2, i32 3>, !dbg !13 | |
store <2 x i32> %72, ptr addrspace(3) %37, align 16, !dbg !13 | |
fence syncscope("workgroup") release, !dbg !13 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !13 | |
fence syncscope("workgroup") acquire, !dbg !13 | |
; Read back through the same eight pointers: %73..%80.
%73 = load i32, ptr addrspace(3) %41, align 16, !dbg !13 | |
%74 = load i32, ptr addrspace(3) %45, align 16, !dbg !13 | |
%75 = load i32, ptr addrspace(3) %49, align 16, !dbg !13 | |
%76 = load i32, ptr addrspace(3) %53, align 16, !dbg !13 | |
%77 = load i32, ptr addrspace(3) %57, align 4, !dbg !13 | |
%78 = load i32, ptr addrspace(3) %61, align 4, !dbg !13 | |
%79 = load i32, ptr addrspace(3) %65, align 4, !dbg !13 | |
%80 = load i32, ptr addrspace(3) %69, align 4, !dbg !13 | |
; ---- Predicate ---------------------------------------------------------
; %82 = (tid bit 0) != (tid bit 1). This single i1 is the only flag value the
; kernel ever publishes (as the byte %83 / vector %88 below).
%81 = lshr i32 %3, 1, !dbg !17 | |
%.lobit = and i32 %81, 1, !dbg !17 | |
%82 = icmp ne i32 %7, %.lobit, !dbg !18 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; Publish the flag: one byte (0 or 1) at byte offset (tid & 3) * 2 of
; @global_smem. %87 and %88 are reused by every later re-publication.
%83 = zext i1 %82 to i8, !dbg !19 | |
%84 = and i32 %3, 3, !dbg !19 | |
%85 = shl nuw nsw i32 %84, 1, !dbg !19 | |
%86 = zext nneg i32 %85 to i64, !dbg !19 | |
%87 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %86, !dbg !19 | |
%88 = insertelement <1 x i8> poison, i8 %83, i64 0, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; ---- Stage 1: flag from byte offset (tid >> 3) & 6, swizzle 16415 ------
; The flag byte is loaded as <8 x i1> and lane 0 is extracted (%94).
%89 = lshr i32 %3, 3, !dbg !19 | |
%90 = and i32 %89, 6, !dbg !19 | |
%91 = zext nneg i32 %90 to i64, !dbg !19 | |
%92 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %91, !dbg !19 | |
%93 = load <8 x i1>, ptr addrspace(3) %92, align 2, !dbg !19 | |
%94 = extractelement <8 x i1> %93, i64 0, !dbg !19 | |
; Keep each of the 16 exchanged values if %94 is set, otherwise use 0.
%95 = select i1 %94, i32 %42, i32 0, !dbg !19 | |
%96 = select i1 %94, i32 %46, i32 0, !dbg !19 | |
%97 = select i1 %94, i32 %50, i32 0, !dbg !19 | |
%98 = select i1 %94, i32 %54, i32 0, !dbg !19 | |
%99 = select i1 %94, i32 %58, i32 0, !dbg !19 | |
%100 = select i1 %94, i32 %62, i32 0, !dbg !19 | |
%101 = select i1 %94, i32 %66, i32 0, !dbg !19 | |
%102 = select i1 %94, i32 %70, i32 0, !dbg !19 | |
%103 = select i1 %94, i32 %73, i32 0, !dbg !19 | |
%104 = select i1 %94, i32 %74, i32 0, !dbg !19 | |
%105 = select i1 %94, i32 %75, i32 0, !dbg !19 | |
%106 = select i1 %94, i32 %76, i32 0, !dbg !19 | |
%107 = select i1 %94, i32 %77, i32 0, !dbg !19 | |
%108 = select i1 %94, i32 %78, i32 0, !dbg !19 | |
%109 = select i1 %94, i32 %79, i32 0, !dbg !19 | |
%110 = select i1 %94, i32 %80, i32 0, !dbg !19 | |
; Each masked value is added to the value ds.swizzle returns for it.
; Swizzle immediate 16415 = 0x401F.
; NOTE(review): under the ds_swizzle bit-mask encoding (and = bits 4..0,
; or = bits 9..5, xor = bits 14..10) this would be and=0x1F, or=0, xor=0x10,
; i.e. an exchange with lane (id ^ 16) -- confirm against the ISA manual.
%111 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %95, i32 16415), !dbg !20 | |
%112 = add i32 %95, %111, !dbg !22 | |
%113 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %99, i32 16415), !dbg !20 | |
%114 = add i32 %99, %113, !dbg !22 | |
%115 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %96, i32 16415), !dbg !20 | |
%116 = add i32 %115, %96, !dbg !22 | |
%117 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %100, i32 16415), !dbg !20 | |
%118 = add i32 %117, %100, !dbg !22 | |
%119 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %97, i32 16415), !dbg !20 | |
%120 = add i32 %119, %97, !dbg !22 | |
%121 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %101, i32 16415), !dbg !20 | |
%122 = add i32 %121, %101, !dbg !22 | |
%123 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %98, i32 16415), !dbg !20 | |
%124 = add i32 %123, %98, !dbg !22 | |
%125 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %102, i32 16415), !dbg !20 | |
%126 = add i32 %125, %102, !dbg !22 | |
%127 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %103, i32 16415), !dbg !20 | |
%128 = add i32 %127, %103, !dbg !22 | |
%129 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %107, i32 16415), !dbg !20 | |
%130 = add i32 %129, %107, !dbg !22 | |
%131 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %104, i32 16415), !dbg !20 | |
%132 = add i32 %131, %104, !dbg !22 | |
%133 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %108, i32 16415), !dbg !20 | |
%134 = add i32 %133, %108, !dbg !22 | |
%135 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %105, i32 16415), !dbg !20 | |
%136 = add i32 %135, %105, !dbg !22 | |
%137 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %109, i32 16415), !dbg !20 | |
%138 = add i32 %137, %109, !dbg !22 | |
%139 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %106, i32 16415), !dbg !20 | |
%140 = add i32 %139, %106, !dbg !22 | |
%141 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %110, i32 16415), !dbg !20 | |
%142 = add i32 %141, %110, !dbg !22 | |
; ---- Stage 2: flag from byte offset (tid >> 2) & 6, swizzle 8223 -------
; Barrier, re-publish the same flag byte at the same address, barrier.
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
%143 = lshr i32 %3, 2, !dbg !19 | |
%144 = and i32 %143, 6, !dbg !19 | |
%145 = zext nneg i32 %144 to i64, !dbg !19 | |
%146 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %145, !dbg !19 | |
%147 = load <8 x i1>, ptr addrspace(3) %146, align 2, !dbg !19 | |
%148 = extractelement <8 x i1> %147, i64 0, !dbg !19 | |
; Mask the 16 stage-1 sums with %148.
%149 = select i1 %148, i32 %112, i32 0, !dbg !19 | |
%150 = select i1 %148, i32 %116, i32 0, !dbg !19 | |
%151 = select i1 %148, i32 %120, i32 0, !dbg !19 | |
%152 = select i1 %148, i32 %124, i32 0, !dbg !19 | |
%153 = select i1 %148, i32 %114, i32 0, !dbg !19 | |
%154 = select i1 %148, i32 %118, i32 0, !dbg !19 | |
%155 = select i1 %148, i32 %122, i32 0, !dbg !19 | |
%156 = select i1 %148, i32 %126, i32 0, !dbg !19 | |
%157 = select i1 %148, i32 %128, i32 0, !dbg !19 | |
%158 = select i1 %148, i32 %132, i32 0, !dbg !19 | |
%159 = select i1 %148, i32 %136, i32 0, !dbg !19 | |
%160 = select i1 %148, i32 %140, i32 0, !dbg !19 | |
%161 = select i1 %148, i32 %130, i32 0, !dbg !19 | |
%162 = select i1 %148, i32 %134, i32 0, !dbg !19 | |
%163 = select i1 %148, i32 %138, i32 0, !dbg !19 | |
%164 = select i1 %148, i32 %142, i32 0, !dbg !19 | |
; Swizzle immediate 8223 = 0x201F.
; NOTE(review): presumably xor mask 0x08 (lane id ^ 8) -- confirm.
%165 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %149, i32 8223), !dbg !20 | |
%166 = add i32 %149, %165, !dbg !22 | |
%167 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %153, i32 8223), !dbg !20 | |
%168 = add i32 %153, %167, !dbg !22 | |
%169 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %150, i32 8223), !dbg !20 | |
%170 = add i32 %169, %150, !dbg !22 | |
%171 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %154, i32 8223), !dbg !20 | |
%172 = add i32 %171, %154, !dbg !22 | |
%173 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %151, i32 8223), !dbg !20 | |
%174 = add i32 %173, %151, !dbg !22 | |
%175 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %155, i32 8223), !dbg !20 | |
%176 = add i32 %175, %155, !dbg !22 | |
%177 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %152, i32 8223), !dbg !20 | |
%178 = add i32 %177, %152, !dbg !22 | |
%179 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %156, i32 8223), !dbg !20 | |
%180 = add i32 %179, %156, !dbg !22 | |
%181 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %157, i32 8223), !dbg !20 | |
%182 = add i32 %181, %157, !dbg !22 | |
%183 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %161, i32 8223), !dbg !20 | |
%184 = add i32 %183, %161, !dbg !22 | |
%185 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %158, i32 8223), !dbg !20 | |
%186 = add i32 %185, %158, !dbg !22 | |
%187 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %162, i32 8223), !dbg !20 | |
%188 = add i32 %187, %162, !dbg !22 | |
%189 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %159, i32 8223), !dbg !20 | |
%190 = add i32 %189, %159, !dbg !22 | |
%191 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %163, i32 8223), !dbg !20 | |
%192 = add i32 %191, %163, !dbg !22 | |
%193 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %160, i32 8223), !dbg !20 | |
%194 = add i32 %193, %160, !dbg !22 | |
%195 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %164, i32 8223), !dbg !20 | |
%196 = add i32 %195, %164, !dbg !22 | |
; ---- Stage 3: flag from byte offset (tid >> 1) & 6, swizzle 4127 -------
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
%197 = and i32 %81, 6, !dbg !19 | |
%198 = zext nneg i32 %197 to i64, !dbg !19 | |
%199 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %198, !dbg !19 | |
%200 = load <8 x i1>, ptr addrspace(3) %199, align 2, !dbg !19 | |
%201 = extractelement <8 x i1> %200, i64 0, !dbg !19 | |
; Mask the 16 stage-2 sums with %201.
%202 = select i1 %201, i32 %166, i32 0, !dbg !19 | |
%203 = select i1 %201, i32 %170, i32 0, !dbg !19 | |
%204 = select i1 %201, i32 %174, i32 0, !dbg !19 | |
%205 = select i1 %201, i32 %178, i32 0, !dbg !19 | |
%206 = select i1 %201, i32 %168, i32 0, !dbg !19 | |
%207 = select i1 %201, i32 %172, i32 0, !dbg !19 | |
%208 = select i1 %201, i32 %176, i32 0, !dbg !19 | |
%209 = select i1 %201, i32 %180, i32 0, !dbg !19 | |
%210 = select i1 %201, i32 %182, i32 0, !dbg !19 | |
%211 = select i1 %201, i32 %186, i32 0, !dbg !19 | |
%212 = select i1 %201, i32 %190, i32 0, !dbg !19 | |
%213 = select i1 %201, i32 %194, i32 0, !dbg !19 | |
%214 = select i1 %201, i32 %184, i32 0, !dbg !19 | |
%215 = select i1 %201, i32 %188, i32 0, !dbg !19 | |
%216 = select i1 %201, i32 %192, i32 0, !dbg !19 | |
%217 = select i1 %201, i32 %196, i32 0, !dbg !19 | |
; Swizzle immediate 4127 = 0x101F.
; NOTE(review): presumably xor mask 0x04 (lane id ^ 4) -- confirm.
%218 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %202, i32 4127), !dbg !20 | |
%219 = add i32 %202, %218, !dbg !22 | |
%220 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %206, i32 4127), !dbg !20 | |
%221 = add i32 %206, %220, !dbg !22 | |
%222 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %203, i32 4127), !dbg !20 | |
%223 = add i32 %222, %203, !dbg !22 | |
%224 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %207, i32 4127), !dbg !20 | |
%225 = add i32 %224, %207, !dbg !22 | |
%226 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %204, i32 4127), !dbg !20 | |
%227 = add i32 %226, %204, !dbg !22 | |
%228 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %208, i32 4127), !dbg !20 | |
%229 = add i32 %228, %208, !dbg !22 | |
%230 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %205, i32 4127), !dbg !20 | |
%231 = add i32 %230, %205, !dbg !22 | |
%232 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %209, i32 4127), !dbg !20 | |
%233 = add i32 %232, %209, !dbg !22 | |
%234 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %210, i32 4127), !dbg !20 | |
%235 = add i32 %234, %210, !dbg !22 | |
%236 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %214, i32 4127), !dbg !20 | |
%237 = add i32 %236, %214, !dbg !22 | |
%238 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %211, i32 4127), !dbg !20 | |
%239 = add i32 %238, %211, !dbg !22 | |
%240 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %215, i32 4127), !dbg !20 | |
%241 = add i32 %240, %215, !dbg !22 | |
%242 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %212, i32 4127), !dbg !20 | |
%243 = add i32 %242, %212, !dbg !22 | |
%244 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %216, i32 4127), !dbg !20 | |
%245 = add i32 %244, %216, !dbg !22 | |
%246 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %213, i32 4127), !dbg !20 | |
%247 = add i32 %246, %213, !dbg !22 | |
%248 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %217, i32 4127), !dbg !20 | |
%249 = add i32 %248, %217, !dbg !22 | |
; ---- Stage 4: flag from byte offset tid & 6, swizzle 2079 --------------
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; %250 = (tid bit 1) << 1 and %.mask = tid & 4, so %251 = tid & 6.
%250 = shl nuw nsw i32 %.lobit, 1, !dbg !19 | |
%.mask = and i32 %3, 4, !dbg !19 | |
%251 = or disjoint i32 %250, %.mask, !dbg !19 | |
%252 = zext nneg i32 %251 to i64, !dbg !19 | |
%253 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %252, !dbg !19 | |
%254 = load <8 x i1>, ptr addrspace(3) %253, align 2, !dbg !19 | |
%255 = extractelement <8 x i1> %254, i64 0, !dbg !19 | |
; Mask the 16 stage-3 sums with %255.
%256 = select i1 %255, i32 %219, i32 0, !dbg !19 | |
%257 = select i1 %255, i32 %223, i32 0, !dbg !19 | |
%258 = select i1 %255, i32 %227, i32 0, !dbg !19 | |
%259 = select i1 %255, i32 %231, i32 0, !dbg !19 | |
%260 = select i1 %255, i32 %221, i32 0, !dbg !19 | |
%261 = select i1 %255, i32 %225, i32 0, !dbg !19 | |
%262 = select i1 %255, i32 %229, i32 0, !dbg !19 | |
%263 = select i1 %255, i32 %233, i32 0, !dbg !19 | |
%264 = select i1 %255, i32 %235, i32 0, !dbg !19 | |
%265 = select i1 %255, i32 %239, i32 0, !dbg !19 | |
%266 = select i1 %255, i32 %243, i32 0, !dbg !19 | |
%267 = select i1 %255, i32 %247, i32 0, !dbg !19 | |
%268 = select i1 %255, i32 %237, i32 0, !dbg !19 | |
%269 = select i1 %255, i32 %241, i32 0, !dbg !19 | |
%270 = select i1 %255, i32 %245, i32 0, !dbg !19 | |
%271 = select i1 %255, i32 %249, i32 0, !dbg !19 | |
; Swizzle immediate 2079 = 0x081F.
; NOTE(review): presumably xor mask 0x02 (lane id ^ 2) -- confirm.
%272 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %256, i32 2079), !dbg !20 | |
%273 = add i32 %256, %272, !dbg !22 | |
%274 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %260, i32 2079), !dbg !20 | |
%275 = add i32 %260, %274, !dbg !22 | |
%276 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %257, i32 2079), !dbg !20 | |
%277 = add i32 %276, %257, !dbg !22 | |
%278 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %261, i32 2079), !dbg !20 | |
%279 = add i32 %278, %261, !dbg !22 | |
%280 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %258, i32 2079), !dbg !20 | |
%281 = add i32 %280, %258, !dbg !22 | |
%282 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %262, i32 2079), !dbg !20 | |
%283 = add i32 %282, %262, !dbg !22 | |
%284 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %259, i32 2079), !dbg !20 | |
%285 = add i32 %284, %259, !dbg !22 | |
%286 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %263, i32 2079), !dbg !20 | |
%287 = add i32 %286, %263, !dbg !22 | |
%288 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %264, i32 2079), !dbg !20 | |
%289 = add i32 %288, %264, !dbg !22 | |
%290 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %268, i32 2079), !dbg !20 | |
%291 = add i32 %290, %268, !dbg !22 | |
%292 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %265, i32 2079), !dbg !20 | |
%293 = add i32 %292, %265, !dbg !22 | |
%294 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %269, i32 2079), !dbg !20 | |
%295 = add i32 %294, %269, !dbg !22 | |
%296 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %266, i32 2079), !dbg !20 | |
%297 = add i32 %296, %266, !dbg !22 | |
%298 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %270, i32 2079), !dbg !20 | |
%299 = add i32 %298, %270, !dbg !22 | |
%300 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %267, i32 2079), !dbg !20 | |
%301 = add i32 %300, %267, !dbg !22 | |
%302 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %271, i32 2079), !dbg !20 | |
%303 = add i32 %302, %271, !dbg !22 | |
; ---- Stage 5: flag from the work-item's own slot (%87), swizzle 1055 ---
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; Here the flag is read from the same address it was just written to.
%304 = load <8 x i1>, ptr addrspace(3) %87, align 2, !dbg !19 | |
%305 = extractelement <8 x i1> %304, i64 0, !dbg !19 | |
; Mask the 16 stage-4 sums with %305.
%306 = select i1 %305, i32 %273, i32 0, !dbg !19 | |
%307 = select i1 %305, i32 %277, i32 0, !dbg !19 | |
%308 = select i1 %305, i32 %281, i32 0, !dbg !19 | |
%309 = select i1 %305, i32 %285, i32 0, !dbg !19 | |
%310 = select i1 %305, i32 %275, i32 0, !dbg !19 | |
%311 = select i1 %305, i32 %279, i32 0, !dbg !19 | |
%312 = select i1 %305, i32 %283, i32 0, !dbg !19 | |
%313 = select i1 %305, i32 %287, i32 0, !dbg !19 | |
%314 = select i1 %305, i32 %289, i32 0, !dbg !19 | |
%315 = select i1 %305, i32 %293, i32 0, !dbg !19 | |
%316 = select i1 %305, i32 %297, i32 0, !dbg !19 | |
%317 = select i1 %305, i32 %301, i32 0, !dbg !19 | |
%318 = select i1 %305, i32 %291, i32 0, !dbg !19 | |
%319 = select i1 %305, i32 %295, i32 0, !dbg !19 | |
%320 = select i1 %305, i32 %299, i32 0, !dbg !19 | |
%321 = select i1 %305, i32 %303, i32 0, !dbg !19 | |
; Swizzle immediate 1055 = 0x041F.
; NOTE(review): presumably xor mask 0x01 (lane id ^ 1) -- confirm.
%322 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %306, i32 1055), !dbg !20 | |
%323 = add i32 %306, %322, !dbg !22 | |
%324 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %310, i32 1055), !dbg !20 | |
%325 = add i32 %310, %324, !dbg !22 | |
%326 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %307, i32 1055), !dbg !20 | |
%327 = add i32 %326, %307, !dbg !22 | |
%328 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %311, i32 1055), !dbg !20 | |
%329 = add i32 %328, %311, !dbg !22 | |
%330 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %308, i32 1055), !dbg !20 | |
%331 = add i32 %330, %308, !dbg !22 | |
%332 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %312, i32 1055), !dbg !20 | |
%333 = add i32 %332, %312, !dbg !22 | |
%334 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %309, i32 1055), !dbg !20 | |
%335 = add i32 %334, %309, !dbg !22 | |
%336 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %313, i32 1055), !dbg !20 | |
%337 = add i32 %336, %313, !dbg !22 | |
%338 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %314, i32 1055), !dbg !20 | |
%339 = add i32 %338, %314, !dbg !22 | |
%340 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %318, i32 1055), !dbg !20 | |
%341 = add i32 %340, %318, !dbg !22 | |
%342 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %315, i32 1055), !dbg !20 | |
%343 = add i32 %342, %315, !dbg !22 | |
%344 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %319, i32 1055), !dbg !20 | |
%345 = add i32 %344, %319, !dbg !22 | |
%346 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %316, i32 1055), !dbg !20 | |
%347 = add i32 %346, %316, !dbg !22 | |
%348 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %320, i32 1055), !dbg !20 | |
%349 = add i32 %348, %320, !dbg !22 | |
%350 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %317, i32 1055), !dbg !20 | |
%351 = add i32 %350, %317, !dbg !22 | |
%352 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %321, i32 1055), !dbg !20 | |
%353 = add i32 %352, %321, !dbg !22 | |
; ---- Stage 6: in-register pairing, 16 values -> 8 ----------------------
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; %354 = (tid & 1) * 4 is the byte offset of the first of two flags read below.
%354 = shl nuw nsw i32 %7, 2, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; Two flags: %362 from byte offset %354, %363 from byte offset %354 + 2.
%355 = zext nneg i32 %354 to i64, !dbg !19 | |
%356 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %355, !dbg !19 | |
%357 = or disjoint i32 %354, 2, !dbg !19 | |
%358 = zext nneg i32 %357 to i64, !dbg !19 | |
%359 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %358, !dbg !19 | |
%360 = load <8 x i1>, ptr addrspace(3) %356, align 4, !dbg !19 | |
%361 = load <8 x i1>, ptr addrspace(3) %359, align 2, !dbg !19 | |
%362 = extractelement <8 x i1> %360, i64 0, !dbg !19 | |
%363 = extractelement <8 x i1> %361, i64 0, !dbg !19 | |
; Within each group of four stage-5 sums, the first two are masked with %362
; and the last two with %363.
%364 = select i1 %362, i32 %323, i32 0, !dbg !19 | |
%365 = select i1 %362, i32 %327, i32 0, !dbg !19 | |
%366 = select i1 %363, i32 %331, i32 0, !dbg !19 | |
%367 = select i1 %363, i32 %335, i32 0, !dbg !19 | |
%368 = select i1 %362, i32 %325, i32 0, !dbg !19 | |
%369 = select i1 %362, i32 %329, i32 0, !dbg !19 | |
%370 = select i1 %363, i32 %333, i32 0, !dbg !19 | |
%371 = select i1 %363, i32 %337, i32 0, !dbg !19 | |
%372 = select i1 %362, i32 %339, i32 0, !dbg !19 | |
%373 = select i1 %362, i32 %343, i32 0, !dbg !19 | |
%374 = select i1 %363, i32 %347, i32 0, !dbg !19 | |
%375 = select i1 %363, i32 %351, i32 0, !dbg !19 | |
%376 = select i1 %362, i32 %341, i32 0, !dbg !19 | |
%377 = select i1 %362, i32 %345, i32 0, !dbg !19 | |
%378 = select i1 %363, i32 %349, i32 0, !dbg !19 | |
%379 = select i1 %363, i32 %353, i32 0, !dbg !19 | |
; Add each %363-masked value to its %362-masked partner: eight sums %380..%387.
%380 = add i32 %366, %364, !dbg !22 | |
%381 = add i32 %367, %365, !dbg !22 | |
%382 = add i32 %370, %368, !dbg !22 | |
%383 = add i32 %371, %369, !dbg !22 | |
%384 = add i32 %374, %372, !dbg !22 | |
%385 = add i32 %375, %373, !dbg !22 | |
%386 = add i32 %378, %376, !dbg !22 | |
%387 = add i32 %379, %377, !dbg !22 | |
; ---- Stage 7: flags re-published at byte offset (tid & 2) + (tid & 3) --
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; %388 = (tid & 3) >> 1 is only consumed by stage 8 below.
%388 = lshr i32 %84, 1, !dbg !19 | |
%389 = and i32 %3, 2, !dbg !19 | |
%390 = add nuw nsw i32 %389, %84, !dbg !19 | |
%391 = zext nneg i32 %390 to i64, !dbg !19 | |
%392 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %391, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %392, align 1, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; Two <16 x i1> loads at byte offsets 0 and 4; lanes 0 and 8 of each are used
; as flags (%395/%396 from offset 0, %397/%398 from offset 4).
; NOTE(review): lane 8 presumably maps to bit 0 of the following byte
; (offsets 1 and 5) -- confirm the i1-vector memory layout for this target.
%393 = load <16 x i1>, ptr addrspace(3) @global_smem, align 16, !dbg !19 | |
%394 = load <16 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 4), align 4, !dbg !19 | |
%395 = extractelement <16 x i1> %393, i64 0, !dbg !19 | |
%396 = extractelement <16 x i1> %393, i64 8, !dbg !19 | |
%397 = extractelement <16 x i1> %394, i64 0, !dbg !19 | |
%398 = extractelement <16 x i1> %394, i64 8, !dbg !19 | |
; Each pair of stage-6 sums (e.g. %380, %381) is masked twice: once with
; (%395, %396) and once with (%397, %398).
%399 = select i1 %395, i32 %380, i32 0, !dbg !19 | |
%400 = select i1 %396, i32 %381, i32 0, !dbg !19 | |
%401 = select i1 %397, i32 %380, i32 0, !dbg !19 | |
%402 = select i1 %398, i32 %381, i32 0, !dbg !19 | |
%403 = select i1 %395, i32 %382, i32 0, !dbg !19 | |
%404 = select i1 %396, i32 %383, i32 0, !dbg !19 | |
%405 = select i1 %397, i32 %382, i32 0, !dbg !19 | |
%406 = select i1 %398, i32 %383, i32 0, !dbg !19 | |
%407 = select i1 %395, i32 %384, i32 0, !dbg !19 | |
%408 = select i1 %396, i32 %385, i32 0, !dbg !19 | |
%409 = select i1 %397, i32 %384, i32 0, !dbg !19 | |
%410 = select i1 %398, i32 %385, i32 0, !dbg !19 | |
%411 = select i1 %395, i32 %386, i32 0, !dbg !19 | |
%412 = select i1 %396, i32 %387, i32 0, !dbg !19 | |
%413 = select i1 %397, i32 %386, i32 0, !dbg !19 | |
%414 = select i1 %398, i32 %387, i32 0, !dbg !19 | |
; Sum adjacent masked pairs: eight values %415..%422.
%415 = add i32 %399, %400, !dbg !22 | |
%416 = add i32 %401, %402, !dbg !22 | |
%417 = add i32 %403, %404, !dbg !22 | |
%418 = add i32 %405, %406, !dbg !22 | |
%419 = add i32 %407, %408, !dbg !22 | |
%420 = add i32 %409, %410, !dbg !22 | |
%421 = add i32 %411, %412, !dbg !22 | |
%422 = add i32 %413, %414, !dbg !22 | |
; ---- Stage 8: flags re-published at ((tid & 3) >> 1) + (tid & 3) -------
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
%423 = add nuw nsw i32 %388, %84, !dbg !19 | |
%424 = zext nneg i32 %423 to i64, !dbg !19 | |
%425 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %424, !dbg !19 | |
store <1 x i8> %88, ptr addrspace(3) %425, align 1, !dbg !19 | |
fence syncscope("workgroup") release, !dbg !19 | |
tail call void @llvm.amdgcn.s.barrier(), !dbg !19 | |
fence syncscope("workgroup") acquire, !dbg !19 | |
; Four flags, lane 0 of <8 x i1> loads at byte offsets 0, 3, 1 and 4:
; %430 (offset 0), %431 (offset 3), %432 (offset 1), %433 (offset 4).
%426 = load <8 x i1>, ptr addrspace(3) @global_smem, align 16, !dbg !19 | |
%427 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 3), align 1, !dbg !19 | |
%428 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 1), align 1, !dbg !19 | |
%429 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 4), align 4, !dbg !19 | |
%430 = extractelement <8 x i1> %426, i64 0, !dbg !19 | |
%431 = extractelement <8 x i1> %427, i64 0, !dbg !19 | |
%432 = extractelement <8 x i1> %428, i64 0, !dbg !19 | |
%433 = extractelement <8 x i1> %429, i64 0, !dbg !19 | |
; Every stage-7 sum is masked twice; %415/%416/%419/%420 use (%430, %431) and
; %417/%418/%421/%422 use (%432, %433).
%434 = select i1 %430, i32 %415, i32 0, !dbg !19 | |
%435 = select i1 %431, i32 %415, i32 0, !dbg !19 | |
%436 = select i1 %430, i32 %416, i32 0, !dbg !19 | |
%437 = select i1 %431, i32 %416, i32 0, !dbg !19 | |
%438 = select i1 %432, i32 %417, i32 0, !dbg !19 | |
%439 = select i1 %433, i32 %417, i32 0, !dbg !19 | |
%440 = select i1 %432, i32 %418, i32 0, !dbg !19 | |
%441 = select i1 %433, i32 %418, i32 0, !dbg !19 | |
%442 = select i1 %430, i32 %419, i32 0, !dbg !19 | |
%443 = select i1 %431, i32 %419, i32 0, !dbg !19 | |
%444 = select i1 %430, i32 %420, i32 0, !dbg !19 | |
%445 = select i1 %431, i32 %420, i32 0, !dbg !19 | |
%446 = select i1 %432, i32 %421, i32 0, !dbg !19 | |
%447 = select i1 %433, i32 %421, i32 0, !dbg !19 | |
%448 = select i1 %432, i32 %422, i32 0, !dbg !19 | |
%449 = select i1 %433, i32 %422, i32 0, !dbg !19 | |
; Final eight results %450..%457.
%450 = add i32 %438, %434, !dbg !22 | |
%451 = add i32 %439, %435, !dbg !22 | |
%452 = add i32 %440, %436, !dbg !22 | |
%453 = add i32 %441, %437, !dbg !22 | |
%454 = add i32 %446, %442, !dbg !22 | |
%455 = add i32 %447, %443, !dbg !22 | |
%456 = add i32 %448, %444, !dbg !22 | |
%457 = add i32 %449, %445, !dbg !22 | |
; ---- Output stores -----------------------------------------------------
; Element pointers into %1 at indices %12, %13, %14, %15 (first group) and
; %16, %17, %18, %19 (second group); %20 and %22 are the zero-extended
; %12 and %16 computed for the input loads.
%458 = getelementptr i32, ptr addrspace(1) %1, i64 %20, !dbg !24 | |
%459 = zext nneg i32 %13 to i64, !dbg !24 | |
%460 = getelementptr i32, ptr addrspace(1) %1, i64 %459, !dbg !24 | |
%461 = zext nneg i32 %14 to i64, !dbg !24 | |
%462 = getelementptr i32, ptr addrspace(1) %1, i64 %461, !dbg !24 | |
%463 = zext nneg i32 %15 to i64, !dbg !24 | |
%464 = getelementptr i32, ptr addrspace(1) %1, i64 %463, !dbg !24 | |
%465 = getelementptr i32, ptr addrspace(1) %1, i64 %22, !dbg !24 | |
%466 = zext nneg i32 %17 to i64, !dbg !24 | |
%467 = getelementptr i32, ptr addrspace(1) %1, i64 %466, !dbg !24 | |
%468 = zext nneg i32 %18 to i64, !dbg !24 | |
%469 = getelementptr i32, ptr addrspace(1) %1, i64 %468, !dbg !24 | |
%470 = zext nneg i32 %19 to i64, !dbg !24 | |
%471 = getelementptr i32, ptr addrspace(1) %1, i64 %470, !dbg !24 | |
; Eight scalar i32 stores, each with align 4 (the input side used two
; <4 x i32> loads with align 16).
store i32 %450, ptr addrspace(1) %458, align 4, !dbg !25 | |
store i32 %451, ptr addrspace(1) %460, align 4, !dbg !25 | |
store i32 %452, ptr addrspace(1) %462, align 4, !dbg !25 | |
store i32 %453, ptr addrspace(1) %464, align 4, !dbg !25 | |
store i32 %454, ptr addrspace(1) %465, align 4, !dbg !25 | |
store i32 %455, ptr addrspace(1) %467, align 4, !dbg !25 | |
store i32 %456, ptr addrspace(1) %469, align 4, !dbg !25 | |
store i32 %457, ptr addrspace(1) %471, align 4, !dbg !25 | |
ret void, !dbg !26 | |
} | |
; Declarations of the three AMDGPU intrinsics called by @flip_kernel.
; workitem.id.x: takes no arguments and returns a noundef i32; attribute
; group #1 marks it memory(none) and speculatable.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare noundef i32 @llvm.amdgcn.workitem.id.x() #1 | |
; s.barrier: void, no arguments; attribute group #2 marks it convergent and
; carries no memory(none), unlike the other two intrinsics.
; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn | |
declare void @llvm.amdgcn.s.barrier() #2 | |
; ds.swizzle: (i32 value, i32 immediate pattern) -> i32. The second operand is
; immarg, so it must be a compile-time constant at every call site.
; Attribute group #3 marks it convergent and memory(none).
; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none) | |
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32 immarg) #3 | |
; Attribute groups referenced above.
; #0 is attached to @flip_kernel: flat work-group size range "1,512",
; "amdgpu-waves-per-eu"="1", and IEEE denormal handling for f32.
attributes #0 = { mustprogress nofree norecurse nounwind willreturn "amdgpu-flat-work-group-size"="1,512" "amdgpu-waves-per-eu"="1" "denormal-fp-math-f32"="ieee" } | |
; #1 is attached to @llvm.amdgcn.workitem.id.x.
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } | |
; #2 is attached to @llvm.amdgcn.s.barrier.
attributes #2 = { convergent mustprogress nocallback nofree nounwind willreturn } | |
; #3 is attached to @llvm.amdgcn.ds.swizzle.
attributes #3 = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) } | |
; Module flags and line-table debug info (emissionKind: LineTablesOnly).
!llvm.module.flags = !{!0, !1} | |
!llvm.dbg.cu = !{!2} | |
; !0: debug-info version 3. !1: amdhsa_code_object_version = 400.
!0 = !{i32 2, !"Debug Info Version", i32 3} | |
!1 = !{i32 1, !"amdhsa_code_object_version", i32 400} | |
; Compile unit: producer "triton", primary file !3 (test_standard.py).
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) | |
!3 = !DIFile(filename: "test_standard.py", directory: "/home/openai/triton/python/test/unit/language") | |
; !4: subprogram for flip_kernel, defined at test_standard.py line 62.
!4 = distinct !DISubprogram(name: "flip_kernel", linkageName: "flip_kernel", scope: !3, file: !3, line: 62, type: !5, scopeLine: 62, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) | |
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) | |
!6 = !{} | |
; !7..!12: locations in test_standard.py lines 64-66 (index computation and
; the input loads in @flip_kernel).
!7 = !DILocation(line: 64, column: 24, scope: !4) | |
!8 = !DILocation(line: 64, column: 29, scope: !4) | |
!9 = !DILocation(line: 65, column: 17, scope: !4) | |
!10 = !DILocation(line: 65, column: 28, scope: !4) | |
!11 = !DILocation(line: 66, column: 20, scope: !4) | |
!12 = !DILocation(line: 66, column: 16, scope: !4) | |
; !13..!23: locations in standard.py (file !15), all inlined at !16, which is
; test_standard.py line 67. !13 tags the @global_smem data exchange, !17/!18
; the predicate, !19 the flag traffic and selects, !20 the ds.swizzle calls,
; and !22 the adds.
!13 = !DILocation(line: 420, column: 28, scope: !14, inlinedAt: !16) | |
!14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0) | |
!15 = !DIFile(filename: "standard.py", directory: "/home/openai/triton/python/triton/language") | |
!16 = !DILocation(line: 67, column: 16, scope: !4) | |
!17 = !DILocation(line: 421, column: 30, scope: !14, inlinedAt: !16) | |
!18 = !DILocation(line: 421, column: 42, scope: !14, inlinedAt: !16) | |
!19 = !DILocation(line: 427, column: 20, scope: !14, inlinedAt: !16) | |
!20 = !DILocation(line: 267, column: 36, scope: !21, inlinedAt: !16) | |
!21 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0) | |
!22 = !DILocation(line: 256, column: 15, scope: !23, inlinedAt: !16) | |
!23 = distinct !DILexicalBlockFile(scope: !21, file: !15, discriminator: 0) | |
; !24..!26: locations in test_standard.py line 68 (output pointers, output
; stores, and the return).
!24 = !DILocation(line: 68, column: 17, scope: !4) | |
!25 = !DILocation(line: 68, column: 24, scope: !4) | |
!26 = !DILocation(line: 68, column: 4, scope: !4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment