#if defined(__has_feature) #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) #define OPENSSL_NO_ASM #endif #endif #if defined(__arm__) && !defined(OPENSSL_NO_ASM) && !defined(__APPLE__) #if defined(BORINGSSL_PREFIX) #include #endif # This implementation was taken from the public domain, neon2 version in # SUPERCOP by D. J. Bernstein and Peter Schwabe. # qhasm: int32 input_0 # qhasm: int32 input_1 # qhasm: int32 input_2 # qhasm: int32 input_3 # qhasm: stack32 input_4 # qhasm: stack32 input_5 # qhasm: stack32 input_6 # qhasm: stack32 input_7 # qhasm: int32 caller_r4 # qhasm: int32 caller_r5 # qhasm: int32 caller_r6 # qhasm: int32 caller_r7 # qhasm: int32 caller_r8 # qhasm: int32 caller_r9 # qhasm: int32 caller_r10 # qhasm: int32 caller_r11 # qhasm: int32 caller_r12 # qhasm: int32 caller_r14 # qhasm: reg128 caller_q4 # qhasm: reg128 caller_q5 # qhasm: reg128 caller_q6 # qhasm: reg128 caller_q7 # qhasm: startcode .fpu neon .text # qhasm: reg128 r0 # qhasm: reg128 r1 # qhasm: reg128 r2 # qhasm: reg128 r3 # qhasm: reg128 r4 # qhasm: reg128 x01 # qhasm: reg128 x23 # qhasm: reg128 x4 # qhasm: reg128 y0 # qhasm: reg128 y12 # qhasm: reg128 y34 # qhasm: reg128 5y12 # qhasm: reg128 5y34 # qhasm: stack128 y0_stack # qhasm: stack128 y12_stack # qhasm: stack128 y34_stack # qhasm: stack128 5y12_stack # qhasm: stack128 5y34_stack # qhasm: reg128 z0 # qhasm: reg128 z12 # qhasm: reg128 z34 # qhasm: reg128 5z12 # qhasm: reg128 5z34 # qhasm: stack128 z0_stack # qhasm: stack128 z12_stack # qhasm: stack128 z34_stack # qhasm: stack128 5z12_stack # qhasm: stack128 5z34_stack # qhasm: stack128 two24 # qhasm: int32 ptr # qhasm: reg128 c01 # qhasm: reg128 c23 # qhasm: reg128 d01 # qhasm: reg128 d23 # qhasm: reg128 t0 # qhasm: reg128 t1 # qhasm: reg128 t2 # qhasm: reg128 t3 # qhasm: reg128 t4 # qhasm: reg128 mask # qhasm: reg128 u0 # qhasm: reg128 u1 # qhasm: reg128 u2 # qhasm: reg128 u3 # qhasm: reg128 u4 # qhasm: reg128 v01 # qhasm: reg128 mid # qhasm: reg128 v23 # qhasm: reg128 v4 # qhasm: int32 len # qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks .align 4 .global openssl_poly1305_neon2_blocks .hidden openssl_poly1305_neon2_blocks .type openssl_poly1305_neon2_blocks STT_FUNC openssl_poly1305_neon2_blocks: vpush {q4,q5,q6,q7} mov r12,sp sub sp,sp,#192 bic sp,sp,#31 # qhasm: len = input_3 # asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff # asm 2: vmov.i64 >mask=q6,#0xffffffff vmov.i64 q6,#0xffffffff # qhasm: 2x u4 = 0xff # asm 1: vmov.i64 >u4=reg128#8,#0xff # asm 2: vmov.i64 >u4=q7,#0xff vmov.i64 q7,#0xff # qhasm: x01 aligned= mem128[input_0];input_0+=16 # asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 # asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 # asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 # asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] add r1,sp,#64 # qhasm: mem128[ptr] aligned= 5y12 # asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 # asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] add r1,sp,#80 # qhasm: mem128[ptr] aligned= 5y34 # asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 # asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] add r1,sp,#144 # qhasm: mem128[ptr] aligned= 5z12 # asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 # asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] add r1,sp,#160 # qhasm: mem128[ptr] aligned= 5z34 # asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 # asm 1: cmp bls ._below64bytes # qhasm: input_2 += 32 # asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 # asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] add r2,sp,#160 # qhasm: 5z34 aligned= mem128[ptr] # asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 # asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] add r2,sp,#144 # qhasm: 5z12 aligned= mem128[ptr] # asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 # asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 # asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 # asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 # asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 # asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] add r2,sp,#80 # qhasm: 5y34 aligned= mem128[ptr] # asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 # asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] add r2,sp,#64 # qhasm: 5y12 aligned= mem128[ptr] # asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 # asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 # asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 # asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 # asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 # asm 1: cmp bhi ._mainloop2 # qhasm: input_2 -= 32 # asm 1: sub >input_2=int32#3,input_2=r2,? len - 32 # asm 1: cmp bls ._end # qhasm: mainloop: ._mainloop: # qhasm: new r0 # qhasm: ptr = &two24 # asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 # asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 # asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 # asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 # asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 # asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 # asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 # asm 1: cmp bhi ._mainloop # qhasm: end: ._end: # qhasm: mem128[input_0] = x01;input_0+=16 # asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff # asm 2: vmov.i64 >mask=q0,#0xffffffff vmov.i64 q0,#0xffffffff # qhasm: y01 aligned= mem128[input_2];input_2+=16 # asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 # asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 # asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 # asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 # asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 # asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2,