#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in aarch64 assembly for SIDH/p434

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="sike";

$code.=<<___;
.section .rodata

# p434 x 2
.Lp434x2:
    .quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
    .quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
    .quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688

# p434 + 1
.Lp434p1:
    .quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3
    .quad 0x6CFC5FD681C52056, 0x0002341F27177344

.text
___

# Computes C0-C2 = A0 * (B0-B1)
# Inputs remain intact
sub mul64x128 {
    my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
    my $body=<<___;
        mul     $T1, $A0, $B0
        umulh   $B0, $A0, $B0
        adds    $C0, $C0, $C2
        adc     $C1, $C1, xzr

        mul     $T0, $A0, $B1
        umulh   $B1, $A0, $B1
        adds    $C0, $C0, $T1
        adcs    $C1, $C1, $B0
        adc     $C2, xzr, xzr
        adds    $C1, $C1, $T0
        adc     $C2, $C2, $B1
___
    return $body;
}

# Computes C0-C4 = A0 * (B0-B3)
# Inputs remain intact
sub mul64x256 {
    my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
    my $body=<<___;
        mul     $C0, $A0, $B0   // C0
        umulh   $T0, $A0, $B0

        mul     $C1, $A0, $B1
        umulh   $T1, $A0, $B1
        adds    $C1, $C1, $T0   // C1
        adc     $T0, xzr, xzr

        mul     $C2, $A0, $B2
        umulh   $T2, $A0, $B2
        adds    $T1, $T0, $T1
        adcs    $C2, $C2, $T1   // C2
        adc     $T0, xzr, xzr

        mul     $C3, $A0, $B3
        umulh   $C4, $A0, $B3
        adds    $T2, $T0, $T2
        adcs    $C3, $C3, $T2   // C3
        adc     $C4, $C4, xzr   // C4
___
    return $body;
}

# Computes C0-C5 = (A0-A1) * (B0-B3)
# Inputs remain intact
sub mul128x256 {
    my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
    my $body=<<___;
        mul     $C0, $A0, $B0   // C0
        umulh   $C3, $A0, $B0

        mul     $C1, $A0, $B1
        umulh   $C2, $A0, $B1

        mul     $T0, $A1, $B0
        umulh   $T1, $A1, $B0
        adds    $C1, $C1, $C3
        adc     $C2, $C2, xzr

        mul     $T2, $A0, $B2
        umulh   $T3, $A0, $B2
        adds    $C1, $C1, $T0   // C1
        adcs    $C2, $C2, $T1
        adc     $C3, xzr, xzr

        mul     $T0, $A1, $B1
        umulh   $T1, $A1, $B1
        adds    $C2, $C2, $T2
        adcs    $C3, $C3, $T3
        adc     $C4, xzr, xzr

        mul     $T2, $A0, $B3
        umulh   $T3, $A0, $B3
        adds    $C2, $C2, $T0   // C2
        adcs    $C3, $C3, $T1
        adc     $C4, $C4, xzr

        mul     $T0, $A1, $B2
        umulh   $T1, $A1, $B2
        adds    $C3, $C3, $T2
        adcs    $C4, $C4, $T3
        adc     $C5, xzr, xzr

        mul     $T2, $A1, $B3
        umulh   $T3, $A1, $B3
        adds    $C3, $C3, $T0   // C3
        adcs    $C4, $C4, $T1
        adc     $C5, $C5, xzr
        adds    $C4, $C4, $T2   // C4
        adc     $C5, $C5, $T3   // C5
___
    return $body;
}

# Computes C0-C5 = (A0-A2) * (B0-B2)
# Inputs remain intact
sub mul192 {
    my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
    my $body=<<___;
        // A0 * B0
        mul     $C0, $A0, $B0   // C0
        umulh   $C3, $A0, $B0

        // A0 * B1
        mul     $C1, $A0, $B1
        umulh   $C2, $A0, $B1

        // A1 * B0
        mul     $T0, $A1, $B0
        umulh   $T1, $A1, $B0
        adds    $C1, $C1, $C3
        adc     $C2, $C2, xzr

        // A0 * B2
        mul     $T2, $A0, $B2
        umulh   $T3, $A0, $B2
        adds    $C1, $C1, $T0   // C1
        adcs    $C2, $C2, $T1
        adc     $C3, xzr, xzr

        // A2 * B0
        mul     $T0, $A2, $B0
        umulh   $C4, $A2, $B0
        adds    $C2, $C2, $T2
        adcs    $C3, $C3, $C4
        adc     $C4, xzr, xzr

        // A1 * B1
        mul     $T2, $A1, $B1
        umulh   $T1, $A1, $B1
        adds    $C2, $C2, $T0
        adcs    $C3, $C3, $T3
        adc     $C4, $C4, xzr

        // A1 * B2
        mul     $T0, $A1, $B2
        umulh   $T3, $A1, $B2
        adds    $C2, $C2, $T2   // C2
        adcs    $C3, $C3, $T1
        adc     $C4, $C4, xzr

        // A2 * B1
        mul     $T2, $A2, $B1
        umulh   $T1, $A2, $B1
        adds    $C3, $C3, $T0
        adcs    $C4, $C4, $T3
        adc     $C5, xzr, xzr

        // A2 * B2
        mul     $T0, $A2, $B2
        umulh   $T3, $A2, $B2
        adds    $C3, $C3, $T2   // C3
        adcs    $C4, $C4, $T1
        adc     $C5, $C5, xzr

        adds    $C4, $C4, $T0   // C4
        adc     $C5, $C5, $T3   // C5
___
    return $body;
}
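# One level of Karatsuba on 256-bit halves relies on the identity
#   A*B = AH*BH*2^256 + ((AH+AL)*(BH+BL) - AH*BH - AL*BL)*2^128 + AL*BL
# which mul256_karatsuba below realises with masked additions so the carries
# of AH+AL and BH+BL are handled without branches.  A minimal Math::BigInt
# sketch of the identity (illustrative only, never executed by this
# generator; the operand values are arbitrary):
#
#   use Math::BigInt;
#   my ($a, $b) = map { Math::BigInt->new($_) } ("0x1234", "0xabcd");
#   my $mask    = (Math::BigInt->bone() << 128) - 1;
#   my ($ah, $al) = ($a >> 128, $a & $mask);
#   my ($bh, $bl) = ($b >> 128, $b & $mask);
#   my $mid = ($ah + $al) * ($bh + $bl) - $ah * $bh - $al * $bl;
#   die "mismatch" unless (($ah * $bh) << 256) + ($mid << 128) + $al * $bl == $a * $b;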
sub mul256_karatsuba {
    my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
    # (AH+AL) x (BH+BL), low part
    my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
    # AL x BL
    my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
    # AH x BH
    my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
    my $body=<<___;
        // A0-A1 <- AH + AL, T0 <- mask
        adds    $A0, $A0, $A2
        adcs    $A1, $A1, $A3
        adc     $T0, xzr, xzr

        // C6, T1 <- BH + BL, C7 <- mask
        adds    $C6, $B0, $B2
        adcs    $T1, $B1, $B3
        adc     $C7, xzr, xzr

        // C0-C1 <- masked (BH + BL)
        sub     $C2, xzr, $T0
        sub     $C3, xzr, $C7
        and     $C0, $C6, $C2
        and     $C1, $T1, $C2

        // C4-C5 <- masked (AH + AL), T0 <- combined carry
        and     $C4, $A0, $C3
        and     $C5, $A1, $C3
        mul     $C2, $A0, $C6
        mul     $C3, $A0, $T1
        and     $T0, $T0, $C7

        // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
        adds    $C0, $C4, $C0
        umulh   $C4, $A0, $T1
        adcs    $C1, $C5, $C1
        umulh   $C5, $A0, $C6
        adc     $T0, $T0, xzr

        // C2-C5 <- (AH+AL) x (BH+BL), low part
        $mul_low
        ldp     $A0, $A1, [$M,#0]

        // C2-C5, T0 <- (AH+AL) x (BH+BL), final part
        adds    $C4, $C0, $C4
        umulh   $C7, $A0, $B0
        umulh   $T1, $A0, $B1
        adcs    $C5, $C1, $C5
        mul     $C0, $A0, $B0
        mul     $C1, $A0, $B1
        adc     $T0, $T0, xzr

        // C0-C1, T1, C7 <- AL x BL
        $mul_albl

        // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
        mul     $A0, $A2, $B2
        umulh   $B0, $A2, $B2
        subs    $C2, $C2, $C0
        sbcs    $C3, $C3, $C1
        sbcs    $C4, $C4, $T1
        mul     $A1, $A2, $B3
        umulh   $C6, $A2, $B3
        sbcs    $C5, $C5, $C7
        sbc     $T0, $T0, xzr

        // A0, A1, C6, B0 <- AH x BH
        $mul_ahbh

        // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
        subs    $C2, $C2, $A0
        sbcs    $C3, $C3, $A1
        sbcs    $C4, $C4, $C6
        sbcs    $C5, $C5, $B0
        sbc     $T0, $T0, xzr

        adds    $C2, $C2, $T1
        adcs    $C3, $C3, $C7
        adcs    $C4, $C4, $A0
        adcs    $C5, $C5, $A1
        adcs    $C6, $T0, $C6
        adc     $C7, $B0, xzr
___
    return $body;
}

# 512-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
sub mul {
    # (AH+AL) x (BH+BL), low part
    my $mul_kc_low=&mul256_karatsuba(
        "x2",                                            # M0
        "x3","x4","x5","x6",                             # A0-A3
        "x10","x11","x12","x13",                         # B0-B3
        "x8","x9","x19","x20","x21","x22","x23","x24",   # C0-C7
        "x25","x26");                                    # TMP
    # AL x BL
    my $mul_albl=&mul256_karatsuba(
        "x0",                                            # M0
        "x3","x4","x5","x6",                             # A0-A3
        "x10","x11","x12","x13",                         # B0-B3
        "x21","x22","x23","x24","x25","x26","x27","x28", # C0-C7
        "x8","x9");                                      # TMP
    # AH x BH
    my $mul_ahbh=&mul192(
        "x3","x4","x5",                                  # A0-A2
        "x10","x11","x12",                               # B0-B2
        "x21","x22","x23","x24","x25","x26",             # C0-C5
        "x8","x9","x27","x28");                          # TMP

    my $body=<<___;
    .global ${PREFIX}_mpmul
    .align 4
    ${PREFIX}_mpmul:
        stp     x29, x30, [sp,#-96]!
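        // Operands a [x0] and b [x1] are 7 limbs (448 bits) each; one
        // Karatsuba level splits them as AL = a[0-3], AH = a[4-6] (and
        // likewise for b), with the Comba helpers doing the inner products.
        // x19-x28 are callee-saved and therefore spilled below.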
        add     x29, sp, #0
        stp     x19, x20, [sp,#16]
        stp     x21, x22, [sp,#32]
        stp     x23, x24, [sp,#48]
        stp     x25, x26, [sp,#64]
        stp     x27, x28, [sp,#80]

        ldp     x3, x4, [x0]
        ldp     x5, x6, [x0,#16]
        ldp     x7, x8, [x0,#32]
        ldr     x9, [x0,#48]
        ldp     x10, x11, [x1,#0]
        ldp     x12, x13, [x1,#16]
        ldp     x14, x15, [x1,#32]
        ldr     x16, [x1,#48]

        // x3-x7 <- AH + AL, x7 <- carry
        adds    x3, x3, x7
        adcs    x4, x4, x8
        adcs    x5, x5, x9
        adcs    x6, x6, xzr
        adc     x7, xzr, xzr

        // x10-x13 <- BH + BL, x8 <- carry
        adds    x10, x10, x14
        adcs    x11, x11, x15
        adcs    x12, x12, x16
        adcs    x13, x13, xzr
        adc     x8, xzr, xzr

        // x9 <- combined carry
        and     x9, x7, x8
        // x7-x8 <- mask
        sub     x7, xzr, x7
        sub     x8, xzr, x8

        // x14-x17 <- masked (BH + BL)
        and     x14, x10, x7
        and     x15, x11, x7
        and     x16, x12, x7
        and     x17, x13, x7

        // x20-x23 <- masked (AH + AL)
        and     x20, x3, x8
        and     x21, x4, x8
        and     x22, x5, x8
        and     x23, x6, x8

        // x14-x17, x7 <- masked (AH+AL) + masked (BH+BL), step 1
        adds    x14, x14, x20
        adcs    x15, x15, x21
        adcs    x16, x16, x22
        adcs    x17, x17, x23
        adc     x7, x9, xzr

        // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
        stp     x3, x4, [x2,#0]
        $mul_kc_low

        // x14-x17, x7 <- (AH+AL) x (BH+BL), final step
        adds    x14, x14, x21
        adcs    x15, x15, x22
        adcs    x16, x16, x23
        adcs    x17, x17, x24
        adc     x7, x7, xzr

        // Load AL
        ldp     x3, x4, [x0]
        ldp     x5, x6, [x0,#16]
        // Load BL
        ldp     x10, x11, [x1,#0]
        ldp     x12, x13, [x1,#16]

        // Temporarily store x8 in x2
        stp     x8, x9, [x2,#0]
        // x21-x28 <- AL x BL
        $mul_albl
        // Restore x8
        ldp     x8, x9, [x2,#0]

        // x8-x9,x19,x20,x14-x17 <- masked (AH+AL) x (BH+BL) - ALxBL
        subs    x8, x8, x21
        sbcs    x9, x9, x22
        sbcs    x19, x19, x23
        sbcs    x20, x20, x24
        sbcs    x14, x14, x25
        sbcs    x15, x15, x26
        sbcs    x16, x16, x27
        sbcs    x17, x17, x28
        sbc     x7, x7, xzr

        // Store ALxBL, low
        stp     x21, x22, [x2]
        stp     x23, x24, [x2,#16]

        // Load AH
        ldp     x3, x4, [x0,#32]
        ldr     x5, [x0,#48]
        // Load BH
        ldp     x10, x11, [x1,#32]
        ldr     x12, [x1,#48]

        adds    x8, x8, x25
        adcs    x9, x9, x26
        adcs    x19, x19, x27
        adcs    x20, x20, x28
        adc     x1, xzr, xzr

        add     x0, x0, #32
        // Temporarily store x8,x9 in x2
        stp     x8, x9, [x2,#32]
        // x21-x28 <- AH x BH
        $mul_ahbh
        // Restore x8,x9
        ldp     x8, x9, [x2,#32]

        neg     x1, x1

        // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
        subs    x8, x8, x21
        sbcs    x9, x9, x22
        sbcs    x19, x19, x23
        sbcs    x20, x20, x24
        sbcs    x14, x14, x25
        sbcs    x15, x15, x26
        sbcs    x16, x16, xzr
        sbcs    x17, x17, xzr
        sbc     x7, x7, xzr

        // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
        stp     x8, x9, [x2,#32]
        stp     x19, x20, [x2,#48]

        adds    x1, x1, #1
        adcs    x14, x14, x21
        adcs    x15, x15, x22
        adcs    x16, x16, x23
        adcs    x17, x17, x24
        adcs    x25, x7, x25
        adc     x26, x26, xzr

        stp     x14, x15, [x2,#64]
        stp     x16, x17, [x2,#80]
        stp     x25, x26, [x2,#96]

        ldp     x19, x20, [x29,#16]
        ldp     x21, x22, [x29,#32]
        ldp     x23, x24, [x29,#48]
        ldp     x25, x26, [x29,#64]
        ldp     x27, x28, [x29,#80]
        ldp     x29, x30, [sp],#96
        ret
___
    return $body;
}
$code.=&mul();
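# The reduction below exploits the Montgomery-friendly shape of the prime:
# p434 = 2^216*3^137 - 1, so p434 + 1 = 2^216*3^137 and its three lowest
# 64-bit limbs are zero; .Lp434p1 therefore stores only limbs 3-6.  Because
# the lowest limb of p434 is all-ones, each per-limb Montgomery quotient
# equals the limb itself, and one step amounts to adding a_i*(p434+1) into
# the higher limbs.  A rough per-limb sketch in bigint terms (illustrative
# only, not executed here):
#
#   my $q = $a & 0xFFFFFFFFFFFFFFFF;          # quotient digit = low limb
#   $a = ($a - $q + $q * ($p434 + 1)) >> 64;  # low limb cancels exactly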
# Montgomery reduction
# Based on method described in Faz-Hernandez et al.
# https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
sub rdc {
    my $mul01=&mul128x256(
        "x2","x3",                       # A0-A1
        "x23","x24","x25","x26",         # B0-B3
        "x4","x5","x6","x7","x8","x9",   # C0-C5
        "x10","x11","x27","x28");        # TMP
    my $mul23=&mul128x256(
        "x2","x10",                      # A0-A1
        "x23","x24","x25","x26",         # B0-B3
        "x4","x5","x6","x7","x8","x9",   # C0-C5
        "x0","x3","x27","x28");          # TMP
    my $mul45=&mul128x256(
        "x11","x12",                     # A0-A1
        "x23","x24","x25","x26",         # B0-B3
        "x4","x5","x6","x7","x8","x9",   # C0-C5
        "x10","x3","x27","x28");         # TMP
    my $mul67=&mul64x256(
        "x13",                           # A0
        "x23","x24","x25","x26",         # B0-B3
        "x4","x5","x6","x7","x8",        # C0-C4
        "x10","x27","x28");              # TMP

    my $body=<<___;
    .global ${PREFIX}_fprdc
    .align 4
    ${PREFIX}_fprdc:
        stp     x29, x30, [sp, #-96]!
        add     x29, sp, #0
        stp     x19, x20, [sp,#16]
        stp     x21, x22, [sp,#32]
        stp     x23, x24, [sp,#48]
        stp     x25, x26, [sp,#64]
        stp     x27, x28, [sp,#80]

        ldp     x2, x3, [x0,#0]       // a[0-1]

        // Load the prime constant
        adrp    x26, :pg_hi21:.Lp434p1
        add     x26, x26, :lo12:.Lp434p1
        ldp     x23, x24, [x26, #0x0]
        ldp     x25, x26, [x26,#0x10]

        // a[0-1] * p434+1
        $mul01

        ldp     x10, x11, [x0, #0x18]
        ldp     x12, x13, [x0, #0x28]
        ldp     x14, x15, [x0, #0x38]
        ldp     x16, x17, [x0, #0x48]
        ldp     x19, x20, [x0, #0x58]
        ldr     x21, [x0, #0x68]

        adds    x10, x10, x4
        adcs    x11, x11, x5
        adcs    x12, x12, x6
        adcs    x13, x13, x7
        adcs    x14, x14, x8
        adcs    x15, x15, x9
        adcs    x22, x16, xzr
        adcs    x17, x17, xzr
        adcs    x19, x19, xzr
        adcs    x20, x20, xzr
        adc     x21, x21, xzr

        ldr     x2, [x0,#0x10]        // a[2]
        // a[2-3] * p434+1
        $mul23

        adds    x12, x12, x4
        adcs    x13, x13, x5
        adcs    x14, x14, x6
        adcs    x15, x15, x7
        adcs    x16, x22, x8
        adcs    x17, x17, x9
        adcs    x22, x19, xzr
        adcs    x20, x20, xzr
        adc     x21, x21, xzr

        $mul45
        adds    x14, x14, x4
        adcs    x15, x15, x5
        adcs    x16, x16, x6
        adcs    x17, x17, x7
        adcs    x19, x22, x8
        adcs    x20, x20, x9
        adc     x22, x21, xzr

        stp     x14, x15, [x1, #0x0]  // C0, C1

        $mul67
        adds    x16, x16, x4
        adcs    x17, x17, x5
        adcs    x19, x19, x6
        adcs    x20, x20, x7
        adc     x21, x22, x8

        str     x16, [x1, #0x10]
        stp     x17, x19, [x1, #0x18]
        stp     x20, x21, [x1, #0x28]

        ldp     x19, x20, [x29,#16]
        ldp     x21, x22, [x29,#32]
        ldp     x23, x24, [x29,#48]
        ldp     x25, x26, [x29,#64]
        ldp     x27, x28, [x29,#80]
        ldp     x29, x30, [sp],#96
        ret
___
    return $body;
}
$code.=&rdc();

# Field addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
    .global ${PREFIX}_fpadd
    .align 4
    ${PREFIX}_fpadd:
        stp     x29, x30, [sp,#-16]!
        add     x29, sp, #0

        ldp     x3, x4, [x0,#0]
        ldp     x5, x6, [x0,#16]
        ldp     x7, x8, [x0,#32]
        ldr     x9, [x0,#48]
        ldp     x11, x12, [x1,#0]
        ldp     x13, x14, [x1,#16]
        ldp     x15, x16, [x1,#32]
        ldr     x17, [x1,#48]

        // Add a + b
        adds    x3, x3, x11
        adcs    x4, x4, x12
        adcs    x5, x5, x13
        adcs    x6, x6, x14
        adcs    x7, x7, x15
        adcs    x8, x8, x16
        adc     x9, x9, x17

        // Subtract 2xp434
        adrp    x17, :pg_hi21:.Lp434x2
        add     x17, x17, :lo12:.Lp434x2
        ldp     x11, x12, [x17, #0]
        ldp     x13, x14, [x17, #16]
        ldp     x15, x16, [x17, #32]
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x12
        sbcs    x6, x6, x13
        sbcs    x7, x7, x14
        sbcs    x8, x8, x15
        sbcs    x9, x9, x16
        sbc     x0, xzr, xzr    // x0 can be reused now

        // Add 2xp434 anded with the mask in x0
        and     x11, x11, x0
        and     x12, x12, x0
        and     x13, x13, x0
        and     x14, x14, x0
        and     x15, x15, x0
        and     x16, x16, x0
        adds    x3, x3, x11
        adcs    x4, x4, x12
        adcs    x5, x5, x12
        adcs    x6, x6, x13
        adcs    x7, x7, x14
        adcs    x8, x8, x15
        adc     x9, x9, x16

        stp     x3, x4, [x2,#0]
        stp     x5, x6, [x2,#16]
        stp     x7, x8, [x2,#32]
        str     x9, [x2,#48]

        ldp     x29, x30, [sp],#16
        ret
___

# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
$code.=<<___;
    .global ${PREFIX}_fpsub
    .align 4
    ${PREFIX}_fpsub:
        stp     x29, x30, [sp,#-16]!
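        // Compute a - b; a final borrow leaves an all-ones mask in x0,
        // which selects 2*p434 to add back, keeping the result in the
        // redundant range [0, 2*p434) that the callers maintain (as in
        // fpadd above).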
        add     x29, sp, #0

        ldp     x3, x4, [x0,#0]
        ldp     x5, x6, [x0,#16]
        ldp     x7, x8, [x0,#32]
        ldr     x9, [x0,#48]
        ldp     x11, x12, [x1,#0]
        ldp     x13, x14, [x1,#16]
        ldp     x15, x16, [x1,#32]
        ldr     x17, [x1,#48]

        // Subtract a - b
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        sbcs    x7, x7, x15
        sbcs    x8, x8, x16
        sbcs    x9, x9, x17
        sbc     x0, xzr, xzr

        // Add 2xp434 anded with the mask in x0
        adrp    x17, :pg_hi21:.Lp434x2
        add     x17, x17, :lo12:.Lp434x2

        // First half
        ldp     x11, x12, [x17, #0]
        ldp     x13, x14, [x17, #16]
        ldp     x15, x16, [x17, #32]

        // Add 2xp434 anded with the mask in x0
        and     x11, x11, x0
        and     x12, x12, x0
        and     x13, x13, x0
        and     x14, x14, x0
        and     x15, x15, x0
        and     x16, x16, x0

        adds    x3, x3, x11
        adcs    x4, x4, x12
        adcs    x5, x5, x12
        adcs    x6, x6, x13
        adcs    x7, x7, x14
        adcs    x8, x8, x15
        adc     x9, x9, x16

        stp     x3, x4, [x2,#0]
        stp     x5, x6, [x2,#16]
        stp     x7, x8, [x2,#32]
        str     x9, [x2,#48]

        ldp     x29, x30, [sp],#16
        ret
___

# 434-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
    .global ${PREFIX}_mpadd_asm
    .align 4
    ${PREFIX}_mpadd_asm:
        stp     x29, x30, [sp,#-16]!
        add     x29, sp, #0

        ldp     x3, x4, [x0,#0]
        ldp     x5, x6, [x0,#16]
        ldp     x7, x8, [x0,#32]
        ldr     x9, [x0,#48]
        ldp     x11, x12, [x1,#0]
        ldp     x13, x14, [x1,#16]
        ldp     x15, x16, [x1,#32]
        ldr     x17, [x1,#48]

        adds    x3, x3, x11
        adcs    x4, x4, x12
        adcs    x5, x5, x13
        adcs    x6, x6, x14
        adcs    x7, x7, x15
        adcs    x8, x8, x16
        adc     x9, x9, x17

        stp     x3, x4, [x2,#0]
        stp     x5, x6, [x2,#16]
        stp     x7, x8, [x2,#32]
        str     x9, [x2,#48]

        ldp     x29, x30, [sp],#16
        ret
___

# 2x434-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1].
# Returns borrow mask
$code.=<<___;
    .global ${PREFIX}_mpsubx2_asm
    .align 4
    ${PREFIX}_mpsubx2_asm:
        stp     x29, x30, [sp,#-16]!
        add     x29, sp, #0

        ldp     x3, x4, [x0,#0]
        ldp     x5, x6, [x0,#16]
        ldp     x11, x12, [x1,#0]
        ldp     x13, x14, [x1,#16]
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        ldp     x7, x8, [x0,#32]
        ldp     x9, x10, [x0,#48]
        ldp     x11, x12, [x1,#32]
        ldp     x13, x14, [x1,#48]
        sbcs    x7, x7, x11
        sbcs    x8, x8, x12
        sbcs    x9, x9, x13
        sbcs    x10, x10, x14

        stp     x3, x4, [x2,#0]
        stp     x5, x6, [x2,#16]
        stp     x7, x8, [x2,#32]
        stp     x9, x10, [x2,#48]

        ldp     x3, x4, [x0,#64]
        ldp     x5, x6, [x0,#80]
        ldp     x11, x12, [x1,#64]
        ldp     x13, x14, [x1,#80]
        sbcs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        ldp     x7, x8, [x0,#96]
        ldp     x11, x12, [x1,#96]
        sbcs    x7, x7, x11
        sbcs    x8, x8, x12
        sbc     x0, xzr, xzr

        stp     x3, x4, [x2,#64]
        stp     x5, x6, [x2,#80]
        stp     x7, x8, [x2,#96]

        ldp     x29, x30, [sp],#16
        ret
___

# Double 2x434-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
$code.=<<___;
    .global ${PREFIX}_mpdblsubx2_asm
    .align 4
    ${PREFIX}_mpdblsubx2_asm:
        stp     x29, x30, [sp, #-16]!
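        // The 14-limb value c is updated in three chunks.  Within each
        // chunk, x9 accumulates the carry-out (no-borrow flag) of the two
        // subtraction chains, so 2 - x9 is the number of borrows to
        // propagate into the first limb of the next chunk.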
        add     x29, sp, #0

        ldp     x3, x4, [x2,#0]
        ldp     x5, x6, [x2,#16]
        ldp     x7, x8, [x2,#32]
        ldp     x11, x12, [x0,#0]
        ldp     x13, x14, [x0,#16]
        ldp     x15, x16, [x0,#32]
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        sbcs    x7, x7, x15
        sbcs    x8, x8, x16
        // x9 stores carry
        adc     x9, xzr, xzr

        ldp     x11, x12, [x1,#0]
        ldp     x13, x14, [x1,#16]
        ldp     x15, x16, [x1,#32]
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        sbcs    x7, x7, x15
        sbcs    x8, x8, x16
        adc     x9, x9, xzr

        stp     x3, x4, [x2,#0]
        stp     x5, x6, [x2,#16]
        stp     x7, x8, [x2,#32]

        ldp     x3, x4, [x2,#48]
        ldp     x5, x6, [x2,#64]
        ldp     x7, x8, [x2,#80]
        ldp     x11, x12, [x0,#48]
        ldp     x13, x14, [x0,#64]
        ldp     x15, x16, [x0,#80]

        // x9 = 2 - x9
        neg     x9, x9
        add     x9, x9, #2

        subs    x3, x3, x9
        sbcs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        sbcs    x7, x7, x15
        sbcs    x8, x8, x16
        adc     x9, xzr, xzr

        ldp     x11, x12, [x1,#48]
        ldp     x13, x14, [x1,#64]
        ldp     x15, x16, [x1,#80]
        subs    x3, x3, x11
        sbcs    x4, x4, x12
        sbcs    x5, x5, x13
        sbcs    x6, x6, x14
        sbcs    x7, x7, x15
        sbcs    x8, x8, x16
        adc     x9, x9, xzr

        stp     x3, x4, [x2,#48]
        stp     x5, x6, [x2,#64]
        stp     x7, x8, [x2,#80]

        ldp     x3, x4, [x2,#96]
        ldp     x11, x12, [x0,#96]
        ldp     x13, x14, [x1,#96]

        // x9 = 2 - x9
        neg     x9, x9
        add     x9, x9, #2

        subs    x3, x3, x9
        sbcs    x3, x3, x11
        sbcs    x4, x4, x12
        subs    x3, x3, x13
        sbc     x4, x4, x14

        stp     x3, x4, [x2,#96]

        ldp     x29, x30, [sp],#16
        ret
___

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

close STDOUT;