Created
May 12, 2025 20:54
-
-
Save TheBlackPlague/35fc4091e77191b16e3ba7cba38cdbf6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// AArch64 listing (Apple/LLVM-style mnemonics such as `movi.2d`) for the
// 256-element instantiation of MantaRay::ActivateFlattenAndForward.
//
// Register use on entry, following the parameter order in the symbol below
// (standard AArch64 argument registers):
//   x0 = first  array<short, 256>  (streamed through x11)
//   x1 = second array<short, 256>  (streamed through x9)
//   x2 = array<short, 512>         (multiplied element-wise against both inputs)
//   x3 = array<short, 1>           (its single element is added at the end)
// NOTE(review): x2 / x3 are presumably the layer weights and bias; the
// parameter names are not visible in this listing -- confirm against the source.
//
// Result: a single 32-bit integer returned in w0.
//
// Computation: every input halfword is clamped to [0, 255] (smax with 0, then
// smin with 255), multiplied by the matching halfword of x2 (first input
// against elements 0..255, second input against elements 256..511),
// accumulated in 32-bit lanes, reduced to one scalar, and finally the
// sign-extended element of x3 is added.
std::__1::array<int, 1ul> MantaRay::ActivateFlattenAndForward<&MantaRay::ClippedReLU<short, (short)0, (short)255>::Activate(short), short, int, 256ul, 1ul>(std::__1::array<short, 256ul> const&, std::__1::array<short, 256ul> const&, std::__1::array<short, 256ul * 2 * 1ul> const&, std::__1::array<short, 1ul> const&): | |
// Prologue: reserve 128 bytes and save d8/d9, x19-x28 and x29/x30. All of
// these are overwritten below (v8/v9 as loop temporaries, x19-x28 and x30 as
// constant holders).
sub sp, sp, #128 | |
stp d9, d8, [sp, #16] | |
stp x28, x27, [sp, #32] | |
stp x26, x25, [sp, #48] | |
stp x24, x23, [sp, #64] | |
stp x22, x21, [sp, #80] | |
stp x20, x19, [sp, #96] | |
stp x29, x30, [sp, #112] | |
// x3 is reused further down for the constant 562, so the fourth-argument
// pointer is spilled here and reloaded just before the final add.
str x3, [sp, #8] | |
// x8 = element index; it advances by 32 per iteration up to 256.
mov x8, #0 | |
// Streaming pointers, each biased by +32 bytes so the loop can load with
// `ldp [ptr, #-32]` followed by `ldp [ptr], #64` (post-increment).
add x9, x1, #32 | |
add x10, x2, #32 | |
add x11, x0, #32 | |
// v0 = all zeros (lower clamp bound); v1 = 0x00ff in every halfword, i.e. 255
// (upper clamp bound).
movi.2d v0, #0000000000000000 | |
movi.2d v1, #0xff00ff00ff00ff | |
// Byte offsets 532..574 (step 2) are parked in general registers for the
// per-halfword loads in the loop. Interleaved with them, the eight 4x32-bit
// accumulators v2, v3, v4, v6, v7, v5, v17, v16 are zeroed.
// x30, x3 and x0 are among the registers overwritten here; x0 is no longer
// needed as a pointer because x11 was derived from it above.
mov w5, #532 | |
mov w6, #534 | |
mov w7, #536 | |
mov w19, #538 | |
mov w20, #540 | |
mov w21, #542 | |
mov w22, #544 | |
mov w23, #546 | |
mov w24, #548 | |
mov w25, #550 | |
mov w26, #552 | |
mov w27, #554 | |
movi.2d v2, #0000000000000000 | |
mov w28, #556 | |
movi.2d v3, #0000000000000000 | |
mov w30, #558 | |
mov w12, #560 | |
movi.2d v4, #0000000000000000 | |
mov w3, #562 | |
movi.2d v6, #0000000000000000 | |
movi.2d v7, #0000000000000000 | |
movi.2d v5, #0000000000000000 | |
movi.2d v17, #0000000000000000 | |
movi.2d v16, #0000000000000000 | |
mov w13, #564 | |
mov w14, #566 | |
mov w15, #568 | |
mov w16, #570 | |
mov w17, #572 | |
mov w0, #574 | |
// Main loop: 32 elements of each input per iteration, 8 iterations.
LBB25_1: | |
// x1 = byte offset of the current element (index * 2); this overwrites the
// second-argument pointer, which already lives on in x9.
// Each `orr` below combines x1 with a constant in 512..574. x1 is always a
// multiple of 64 below 512, so the bit patterns never overlap and the OR acts
// as an addition: the address is x2 + 512 + x1 + k, i.e. halfword
// (256 + index + k/2) of the third argument.
// The 32 second-half halfwords are gathered one at a time, four per vector
// register (lanes 0..3 of v18..v25). Offsets 512..530 are materialised in w4
// on the spot; 532..574 come from the registers set up before the loop.
// NOTE(review): the 384-element instantiation in this same listing reads its
// second-half operands with plain `ldp` loads instead; why this variant was
// emitted as scalar lane inserts is not visible here.
lsl x1, x8, #1 | |
orr x4, x1, #0x200 | |
// v18 lanes 0..3 = second-half elements +0..+3.
ldr h18, [x2, x4] | |
mov w4, #514 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v18 }[1], [x4] | |
mov w4, #516 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v18 }[2], [x4] | |
mov w4, #518 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v18 }[3], [x4] | |
// v19 lanes 0..3 = second-half elements +4..+7.
mov w4, #520 | |
orr x4, x1, x4 | |
ldr h19, [x2, x4] | |
mov w4, #522 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v19 }[1], [x4] | |
mov w4, #524 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v19 }[2], [x4] | |
mov w4, #526 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v19 }[3], [x4] | |
// v20 lanes 0..3 = second-half elements +8..+11.
mov w4, #528 | |
orr x4, x1, x4 | |
ldr h20, [x2, x4] | |
mov w4, #530 | |
orr x4, x1, x4 | |
add x4, x2, x4 | |
ld1.h { v20 }[1], [x4] | |
orr x4, x1, x5 | |
add x4, x2, x4 | |
ld1.h { v20 }[2], [x4] | |
orr x4, x1, x6 | |
add x4, x2, x4 | |
ld1.h { v20 }[3], [x4] | |
// v21 lanes 0..3 = second-half elements +12..+15.
orr x4, x1, x7 | |
ldr h21, [x2, x4] | |
orr x4, x1, x19 | |
add x4, x2, x4 | |
ld1.h { v21 }[1], [x4] | |
orr x4, x1, x20 | |
add x4, x2, x4 | |
ld1.h { v21 }[2], [x4] | |
orr x4, x1, x21 | |
add x4, x2, x4 | |
ld1.h { v21 }[3], [x4] | |
// v22 lanes 0..3 = second-half elements +16..+19.
orr x4, x1, x22 | |
ldr h22, [x2, x4] | |
orr x4, x1, x23 | |
add x4, x2, x4 | |
ld1.h { v22 }[1], [x4] | |
orr x4, x1, x24 | |
add x4, x2, x4 | |
ld1.h { v22 }[2], [x4] | |
orr x4, x1, x25 | |
add x4, x2, x4 | |
ld1.h { v22 }[3], [x4] | |
// v23 lanes 0..3 = second-half elements +20..+23.
orr x4, x1, x26 | |
ldr h23, [x2, x4] | |
orr x4, x1, x27 | |
add x4, x2, x4 | |
ld1.h { v23 }[1], [x4] | |
orr x4, x1, x28 | |
add x4, x2, x4 | |
ld1.h { v23 }[2], [x4] | |
orr x4, x1, x30 | |
add x4, x2, x4 | |
ld1.h { v23 }[3], [x4] | |
// v24 lanes 0..3 = second-half elements +24..+27.
orr x4, x1, x12 | |
ldr h24, [x2, x4] | |
orr x4, x1, x3 | |
add x4, x2, x4 | |
ld1.h { v24 }[1], [x4] | |
orr x4, x1, x13 | |
add x4, x2, x4 | |
ld1.h { v24 }[2], [x4] | |
orr x4, x1, x14 | |
add x4, x2, x4 | |
ld1.h { v24 }[3], [x4] | |
// v25 lanes 0..3 = second-half elements +28..+31 (the last insert reuses x1
// as the scratch register, since the byte offset is not needed afterwards).
orr x4, x1, x15 | |
ldr h25, [x2, x4] | |
orr x4, x1, x16 | |
add x4, x2, x4 | |
ld1.h { v25 }[1], [x4] | |
orr x4, x1, x17 | |
add x4, x2, x4 | |
ld1.h { v25 }[2], [x4] | |
orr x1, x1, x0 | |
add x1, x2, x1 | |
ld1.h { v25 }[3], [x1] | |
// First input: load 32 halfwords (4 x 128-bit) and advance x11 by 64 bytes.
ldp q26, q27, [x11, #-32] | |
ldp q28, q29, [x11], #64 | |
// Clamp each halfword to [0, 255]: max with 0, then min with 255.
smax.8h v26, v26, v0 | |
smax.8h v27, v27, v0 | |
smax.8h v28, v28, v0 | |
smax.8h v29, v29, v0 | |
smin.8h v26, v26, v1 | |
smin.8h v27, v27, v1 | |
smin.8h v28, v28, v1 | |
smin.8h v29, v29, v1 | |
// Matching 32 halfwords from the first half of the third argument
// (x10 advances by 64 bytes).
ldp q30, q31, [x10, #-32] | |
ldp q8, q9, [x10], #64 | |
// Widening multiply-accumulate: smlal handles the low four halfwords of each
// register, smlal2 the high four; products are summed into 32-bit lanes.
smlal.4s v2, v26, v30 | |
smlal2.4s v3, v26, v30 | |
smlal.4s v4, v27, v31 | |
smlal2.4s v6, v27, v31 | |
smlal.4s v7, v28, v8 | |
smlal2.4s v5, v28, v8 | |
smlal.4s v17, v29, v9 | |
smlal2.4s v16, v29, v9 | |
// Second input: same load + clamp sequence, streamed through x9.
ldp q26, q27, [x9, #-32] | |
ldp q28, q29, [x9], #64 | |
smax.8h v26, v26, v0 | |
smax.8h v27, v27, v0 | |
smax.8h v28, v28, v0 | |
smax.8h v29, v29, v0 | |
smin.8h v26, v26, v1 | |
smin.8h v27, v27, v1 | |
smin.8h v28, v28, v1 | |
smin.8h v29, v29, v1 | |
// The gathered operands v18..v25 only occupy the low four halfword lanes, so
// the high four activations of each register are rotated down into the low
// half (ext by 8 bytes) and multiplied with the low-half form smlal.
ext.16b v30, v26, v26, #8 | |
ext.16b v31, v27, v27, #8 | |
ext.16b v8, v28, v28, #8 | |
ext.16b v9, v29, v29, #8 | |
// Accumulate into the same eight accumulators as the first input:
// low halves pair with v18/v20/v22/v24, rotated high halves with
// v19/v21/v23/v25.
smlal.4s v3, v30, v19 | |
smlal.4s v2, v26, v18 | |
smlal.4s v6, v31, v21 | |
smlal.4s v4, v27, v20 | |
smlal.4s v5, v8, v23 | |
smlal.4s v7, v28, v22 | |
smlal.4s v16, v9, v25 | |
// Advance the element index by 32 and loop until all 256 are consumed.
add x8, x8, #32 | |
smlal.4s v17, v29, v24 | |
cmp x8, #256 | |
b.ne LBB25_1 | |
// Reduction: add the eight accumulators pairwise into v0, then sum its four
// 32-bit lanes into s0 and move the scalar to w8.
add.4s v0, v4, v2 | |
add.4s v1, v6, v3 | |
add.4s v2, v17, v7 | |
add.4s v0, v2, v0 | |
add.4s v2, v16, v5 | |
add.4s v1, v2, v1 | |
add.4s v0, v0, v1 | |
addv.4s s0, v0 | |
fmov w8, s0 | |
// Reload the spilled fourth-argument pointer, sign-extend its halfword and
// add it to the sum; the result is left in w0.
ldr x9, [sp, #8] | |
ldrsh w9, [x9] | |
add w0, w8, w9 | |
// Epilogue: restore the saved registers, release the frame and return.
ldp x29, x30, [sp, #112] | |
ldp x20, x19, [sp, #96] | |
ldp x22, x21, [sp, #80] | |
ldp x24, x23, [sp, #64] | |
ldp x26, x25, [sp, #48] | |
ldp x28, x27, [sp, #32] | |
ldp d9, d8, [sp, #16] | |
add sp, sp, #128 | |
ret | |
// AArch64 listing (Apple/LLVM-style mnemonics) for the 384-element
// instantiation of MantaRay::ActivateFlattenAndForward.
//
// Register use on entry, following the parameter order in the symbol below
// (standard AArch64 argument registers):
//   x0 = first  array<short, 384>  (streamed through x9)
//   x1 = second array<short, 384>  (streamed through x8)
//   x2 = array<short, 768>         (multiplied element-wise against both inputs)
//   x3 = array<short, 1>           (its single element is added at the end)
// NOTE(review): x2 / x3 are presumably the layer weights and bias; the
// parameter names are not visible in this listing -- confirm against the source.
//
// Result: a single 32-bit integer returned in w0.
//
// Computation: every input halfword is clamped to [0, 255], multiplied by the
// matching halfword of x2 (first input against elements 0..383, second input
// against elements 384..767), accumulated in 32-bit lanes, reduced to one
// scalar, and finally the sign-extended element of x3 is added.
// No stack frame is set up; the function uses no stack and touches only
// x8-x10, x2 and v0-v7 / v16-v25.
std::__1::array<int, 1ul> MantaRay::ActivateFlattenAndForward<&MantaRay::ClippedReLU<short, (short)0, (short)255>::Activate(short), short, int, 384ul, 1ul>(std::__1::array<short, 384ul> const&, std::__1::array<short, 384ul> const&, std::__1::array<short, 384ul * 2 * 1ul> const&, std::__1::array<short, 1ul> const&): | |
// v0 = all zeros (lower clamp bound); v1 = 0x00ff in every halfword, i.e. 255
// (upper clamp bound). v2 and v3 are the first two accumulators.
movi.2d v0, #0000000000000000 | |
movi.2d v1, #0xff00ff00ff00ff | |
movi.2d v2, #0000000000000000 | |
movi.2d v3, #0000000000000000 | |
// Streaming pointers, each biased by +32 bytes so the loop can load with
// `ldp [ptr, #-32]` followed by `ldp [ptr], #64` (post-increment).
add x8, x1, #32 | |
add x9, x0, #32 | |
// x10 = elements remaining; it drops by 32 per iteration (12 iterations).
mov w10, #384 | |
// Remaining six 4x32-bit accumulators.
movi.2d v5, #0000000000000000 | |
movi.2d v6, #0000000000000000 | |
movi.2d v7, #0000000000000000 | |
movi.2d v4, #0000000000000000 | |
movi.2d v17, #0000000000000000 | |
movi.2d v16, #0000000000000000 | |
// Main loop: 32 elements of each input per iteration.
LBB26_1: | |
// First input: load 32 halfwords and advance x9 by 64 bytes.
ldp q18, q19, [x9, #-32] | |
ldp q20, q21, [x9], #64 | |
// Clamp each halfword to [0, 255]: max with 0, then min with 255.
smax.8h v18, v18, v0 | |
smax.8h v19, v19, v0 | |
smax.8h v20, v20, v0 | |
smax.8h v21, v21, v0 | |
smin.8h v18, v18, v1 | |
smin.8h v19, v19, v1 | |
smin.8h v20, v20, v1 | |
smin.8h v21, v21, v1 | |
// Matching 32 halfwords from the first half of the third argument, read at
// the current x2 position.
ldp q22, q23, [x2] | |
ldp q24, q25, [x2, #32] | |
// Widening multiply-accumulate: smlal handles the low four halfwords of each
// register, smlal2 the high four; products are summed into 32-bit lanes.
smlal.4s v2, v18, v22 | |
smlal2.4s v3, v18, v22 | |
smlal.4s v5, v19, v23 | |
smlal2.4s v6, v19, v23 | |
smlal.4s v7, v20, v24 | |
smlal2.4s v4, v20, v24 | |
smlal.4s v17, v21, v25 | |
smlal2.4s v16, v21, v25 | |
// Second input: same load + clamp sequence, streamed through x8.
ldp q18, q19, [x8, #-32] | |
ldp q20, q21, [x8], #64 | |
smax.8h v18, v18, v0 | |
smax.8h v19, v19, v0 | |
smax.8h v20, v20, v0 | |
smax.8h v21, v21, v0 | |
smin.8h v18, v18, v1 | |
smin.8h v19, v19, v1 | |
smin.8h v20, v20, v1 | |
smin.8h v21, v21, v1 | |
// Second-half operands: 768 bytes (= 384 halfwords) past the current x2
// position, loaded as full vectors.
ldp q22, q23, [x2, #768] | |
ldp q24, q25, [x2, #800] | |
// Accumulate into the same eight accumulators as the first input.
smlal2.4s v3, v18, v22 | |
smlal.4s v2, v18, v22 | |
smlal2.4s v6, v19, v23 | |
smlal.4s v5, v19, v23 | |
smlal2.4s v4, v20, v24 | |
smlal.4s v7, v20, v24 | |
smlal2.4s v16, v21, v25 | |
smlal.4s v17, v21, v25 | |
// Step x2 forward by 32 halfwords, decrement the remaining count and loop
// until it reaches zero.
add x2, x2, #64 | |
subs x10, x10, #32 | |
b.ne LBB26_1 | |
// Reduction: add the eight accumulators pairwise into v0, then sum its four
// 32-bit lanes into s0 and move the scalar to w8.
add.4s v0, v5, v2 | |
add.4s v1, v6, v3 | |
add.4s v2, v17, v7 | |
add.4s v0, v2, v0 | |
add.4s v2, v16, v4 | |
add.4s v1, v2, v1 | |
add.4s v0, v0, v1 | |
addv.4s s0, v0 | |
fmov w8, s0 | |
// Sign-extend the single halfword behind x3 and add it to the sum; the result
// is left in w0.
ldrsh w9, [x3] | |
add w0, w8, w9 | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment