#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in aarch64 assembly for SIDH/p434
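#
# The field prime is p434 = 2^216*3^137 - 1 (434 bits); field elements are
# stored as seven 64-bit words, least-significant word first.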
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$PREFIX="sike";
$code.=<<___;
.section .rodata
# p434 x 2
.Lp434x2:
.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
.quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
.quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688
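# (word 2 of 2*p434 equals word 1 - both are all ones - so only six of the
# seven words are stored; the field routines reuse the register holding
# word 1 for word 2)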
# p434 + 1
.Lp434p1:
.quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3
.quad 0x6CFC5FD681C52056, 0x0002341F27177344
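# (p434+1 = 2^216*3^137, so its three low words are zero; only words 3-6 are
# stored and the reduction skips the zero words)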
.text
___
# Accumulates A0 * (B0-B1) into C0-C2; on entry C2 is folded into C0-C1 as a
# pending carry word.
# A0 remains intact; B0-B1 are clobbered
sub mul64x128 {
my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
my $body=<<___;
mul $T1, $A0, $B0
umulh $B0, $A0, $B0
adds $C0, $C0, $C2
adc $C1, $C1, xzr
mul $T0, $A0, $B1
umulh $B1, $A0, $B1
adds $C0, $C0, $T1
adcs $C1, $C1, $B0
adc $C2, xzr, xzr
adds $C1, $C1, $T0
adc $C2, $C2, $B1
___
return $body;
}
# Computes C0-C4 = A0 * (B0-B3)
# Inputs remain intact
sub mul64x256 {
my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
my $body=<<___;
mul $C0, $A0, $B0 // C0
umulh $T0, $A0, $B0
mul $C1, $A0, $B1
umulh $T1, $A0, $B1
adds $C1, $C1, $T0 // C1
adc $T0, xzr, xzr
mul $C2, $A0, $B2
umulh $T2, $A0, $B2
adds $T1, $T0, $T1
adcs $C2, $C2, $T1 // C2
adc $T0, xzr, xzr
mul $C3, $A0, $B3
umulh $C4, $A0, $B3
adds $T2, $T0, $T2
adcs $C3, $C3, $T2 // C3
adc $C4, $C4, xzr // C4
___
return $body;
}
# Computes C0-C5 = (A0-A1) * (B0-B3)
# Inputs remain intact
sub mul128x256 {
my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
my $body=<<___;
mul $C0, $A0, $B0 // C0
umulh $C3, $A0, $B0
mul $C1, $A0, $B1
umulh $C2, $A0, $B1
mul $T0, $A1, $B0
umulh $T1, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T2, $A0, $B2
umulh $T3, $A0, $B2
adds $C1, $C1, $T0 // C1
adcs $C2, $C2, $T1
adc $C3, xzr, xzr
mul $T0, $A1, $B1
umulh $T1, $A1, $B1
adds $C2, $C2, $T2
adcs $C3, $C3, $T3
adc $C4, xzr, xzr
mul $T2, $A0, $B3
umulh $T3, $A0, $B3
adds $C2, $C2, $T0 // C2
adcs $C3, $C3, $T1
adc $C4, $C4, xzr
mul $T0, $A1, $B2
umulh $T1, $A1, $B2
adds $C3, $C3, $T2
adcs $C4, $C4, $T3
adc $C5, xzr, xzr
mul $T2, $A1, $B3
umulh $T3, $A1, $B3
adds $C3, $C3, $T0 // C3
adcs $C4, $C4, $T1
adc $C5, $C5, xzr
adds $C4, $C4, $T2 // C4
adc $C5, $C5, $T3 // C5
___
return $body;
}
# Computes C0-C5 = (A0-A2) * (B0-B2)
# Inputs remain intact
sub mul192 {
my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
my $body=<<___;
// A0 * B0
mul $C0, $A0, $B0 // C0
umulh $C3, $A0, $B0
// A0 * B1
mul $C1, $A0, $B1
umulh $C2, $A0, $B1
// A1 * B0
mul $T0, $A1, $B0
umulh $T1, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
// A0 * B2
mul $T2, $A0, $B2
umulh $T3, $A0, $B2
adds $C1, $C1, $T0 // C1
adcs $C2, $C2, $T1
adc $C3, xzr, xzr
// A2 * B0
mul $T0, $A2, $B0
umulh $C4, $A2, $B0
adds $C2, $C2, $T2
adcs $C3, $C3, $C4
adc $C4, xzr, xzr
// A1 * B1
mul $T2, $A1, $B1
umulh $T1, $A1, $B1
adds $C2, $C2, $T0
adcs $C3, $C3, $T3
adc $C4, $C4, xzr
// A1 * B2
mul $T0, $A1, $B2
umulh $T3, $A1, $B2
adds $C2, $C2, $T2 // C2
adcs $C3, $C3, $T1
adc $C4, $C4, xzr
// A2 * B1
mul $T2, $A2, $B1
umulh $T1, $A2, $B1
adds $C3, $C3, $T0
adcs $C4, $C4, $T3
adc $C5, xzr, xzr
// A2 * B2
mul $T0, $A2, $B2
umulh $T3, $A2, $B2
adds $C3, $C3, $T2 // C3
adcs $C4, $C4, $T1
adc $C5, $C5, xzr
adds $C4, $C4, $T0 // C4
adc $C5, $C5, $T3 // C5
___
return $body;
}
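# Computes C0-C7 = (A0-A3) * (B0-B3) (256 x 256 -> 512 bits) with one level of
# Karatsuba on the 128-bit halves. A0-A1 are reloaded from [M] half-way
# through, so the caller must have stored them there; the input registers are
# not preserved.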
sub mul256_karatsuba {
my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
# (AH+AL) x (BH+BL), low part
my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
# AL x BL
my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
# AH x BH
my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
my $body=<<___;
// A0-A1 <- AH + AL, T0 <- carry
adds $A0, $A0, $A2
adcs $A1, $A1, $A3
adc $T0, xzr, xzr
// C6, T1 <- BH + BL, C7 <- carry
adds $C6, $B0, $B2
adcs $T1, $B1, $B3
adc $C7, xzr, xzr
// C0-C1 <- masked (BH + BL)
sub $C2, xzr, $T0
sub $C3, xzr, $C7
and $C0, $C6, $C2
and $C1, $T1, $C2
// C4-C5 <- masked (AH + AL), T0 <- combined carry
and $C4, $A0, $C3
and $C5, $A1, $C3
mul $C2, $A0, $C6
mul $C3, $A0, $T1
and $T0, $T0, $C7
// C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
adds $C0, $C4, $C0
umulh $C4, $A0, $T1
adcs $C1, $C5, $C1
umulh $C5, $A0, $C6
adc $T0, $T0, xzr
// C2-C5 <- (AH+AL) x (BH+BL), low part
$mul_low
ldp $A0, $A1, [$M,#0]
// C2-C5, T0 <- (AH+AL) x (BH+BL), final part
adds $C4, $C0, $C4
umulh $C7, $A0, $B0
umulh $T1, $A0, $B1
adcs $C5, $C1, $C5
mul $C0, $A0, $B0
mul $C1, $A0, $B1
adc $T0, $T0, xzr
// C0-C1, T1, C7 <- AL x BL
$mul_albl
// C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
mul $A0, $A2, $B2
umulh $B0, $A2, $B2
subs $C2, $C2, $C0
sbcs $C3, $C3, $C1
sbcs $C4, $C4, $T1
mul $A1, $A2, $B3
umulh $C6, $A2, $B3
sbcs $C5, $C5, $C7
sbc $T0, $T0, xzr
// A0, A1, C6, B0 <- AH x BH
$mul_ahbh
// C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs $C2, $C2, $A0
sbcs $C3, $C3, $A1
sbcs $C4, $C4, $C6
sbcs $C5, $C5, $B0
sbc $T0, $T0, xzr
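// Fold ALxBL words 2-3, AHxBH and the pending carry T0 into C2-C7;
// C0-C7 now holds the full 512-bit product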
adds $C2, $C2, $T1
adcs $C3, $C3, $C7
adcs $C4, $C4, $A0
adcs $C5, $C5, $A1
adcs $C6, $T0, $C6
adc $C7, $B0, xzr
___
return $body;
}
# 512-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
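# With A = AH*2^256 + AL and B = BH*2^256 + BL (AL, BL are four words; AH, BH
# are three words, since the operands are 434-bit), the product is assembled as
#   A*B = AL*BL + ((AH+AL)*(BH+BL) - AL*BL - AH*BH)*2^256 + AH*BH*2^512
# The sums AH+AL and BH+BL can be 257 bits wide; their carry bits are kept in
# x7/x8 and the corresponding cross terms are added via the masked values below.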
sub mul {
# (AH+AL) x (BH+BL), low part
my $mul_kc_low=&mul256_karatsuba(
"x2", # M0
"x3","x4","x5","x6", # A0-A3
"x10","x11","x12","x13", # B0-B3
"x8","x9","x19","x20","x21","x22","x23","x24", # C0-C7
"x25","x26"); # TMP
# AL x BL
my $mul_albl=&mul256_karatsuba(
"x0", # M0f
"x3","x4","x5","x6", # A0-A3
"x10","x11","x12","x13", # B0-B3
"x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
"x8","x9"); # TMP
# AH x BH
my $mul_ahbh=&mul192(
"x3","x4","x5", # A0-A2
"x10","x11","x12", # B0-B2
"x21","x22","x23","x24","x25","x26", # C0-C5
"x8","x9","x27","x28"); # TMP
my $body=<<___;
.global ${PREFIX}_mpmul
.align 4
${PREFIX}_mpmul:
stp x29, x30, [sp,#-96]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldr x9, [x0,#48]
ldp x10, x11, [x1,#0]
ldp x12, x13, [x1,#16]
ldp x14, x15, [x1,#32]
ldr x16, [x1,#48]
// x3-x6 <- AH + AL, x7 <- carry
adds x3, x3, x7
adcs x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, xzr
adc x7, xzr, xzr
// x10-x13 <- BH + BL, x8 <- carry
adds x10, x10, x14
adcs x11, x11, x15
adcs x12, x12, x16
adcs x13, x13, xzr
adc x8, xzr, xzr
// x9 <- combined carry
and x9, x7, x8
// x7-x8 <- mask
sub x7, xzr, x7
sub x8, xzr, x8
// x14-x17 <- masked (BH + BL)
and x14, x10, x7
and x15, x11, x7
and x16, x12, x7
and x17, x13, x7
// x20-x23 <- masked (AH + AL)
and x20, x3, x8
and x21, x4, x8
and x22, x5, x8
and x23, x6, x8
// x14-x17, x7 <- masked (AH+AL) + masked (BH+BL), step 1
adds x14, x14, x20
adcs x15, x15, x21
adcs x16, x16, x22
adcs x17, x17, x23
adc x7, x9, xzr
// x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
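// Stash x3-x4 (the low words of AH+AL) at [x2] so mul256_karatsuba can
// reload them from memory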
stp x3, x4, [x2,#0]
$mul_kc_low
// x14-x17, x7 <- (AH+AL) x (BH+BL), final step
adds x14, x14, x21
adcs x15, x15, x22
adcs x16, x16, x23
adcs x17, x17, x24
adc x7, x7, xzr
// Load AL
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
// Load BL
ldp x10, x11, [x1,#0]
ldp x12, x13, [x1,#16]
// Temporarily store x8, x9 in x2
stp x8, x9, [x2,#0]
// x21-x28 <- AL x BL
$mul_albl
// Restore x8, x9
ldp x8, x9, [x2,#0]
// x8-x9, x19, x20, x14-x17 <- (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x21
sbcs x9, x9, x22
sbcs x19, x19, x23
sbcs x20, x20, x24
sbcs x14, x14, x25
sbcs x15, x15, x26
sbcs x16, x16, x27
sbcs x17, x17, x28
sbc x7, x7, xzr
// Store ALxBL, low
stp x21, x22, [x2]
stp x23, x24, [x2,#16]
// Load AH
ldp x3, x4, [x0,#32]
ldr x5, [x0,#48]
// Load BH
ldp x10, x11, [x1,#32]
ldr x12, [x1,#48]
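// Add the high half of ALxBL into x8-x9, x19-x20 (these become c[4-7]);
// the carry is parked in x1 and restored below via neg and adds #1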
adds x8, x8, x25
adcs x9, x9, x26
adcs x19, x19, x27
adcs x20, x20, x28
adc x1, xzr, xzr
add x0, x0, #32
// Temporarily store x8,x9 in x2
stp x8,x9, [x2,#32]
// x21-x28 <- AH x BH
$mul_ahbh
// Restore x8,x9
ldp x8,x9, [x2,#32]
neg x1, x1
// x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x21
sbcs x9, x9, x22
sbcs x19, x19, x23
sbcs x20, x20, x24
sbcs x14, x14, x25
sbcs x15, x15, x26
sbcs x16, x16, xzr
sbcs x17, x17, xzr
sbc x7, x7, xzr
// Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
stp x8, x9, [x2,#32]
stp x19, x20, [x2,#48]
adds x1, x1, #1
adcs x14, x14, x21
adcs x15, x15, x22
adcs x16, x16, x23
adcs x17, x17, x24
adcs x25, x7, x25
adc x26, x26, xzr
stp x14, x15, [x2,#64]
stp x16, x17, [x2,#80]
stp x25, x26, [x2,#96]
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldp x29, x30, [sp],#96
ret
___
return $body;
}
$code.=&mul();
# Montgomery reduction
# Based on the method described in Faz-Hernandez et al., https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
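# Since p434 = 2^216*3^137 - 1, p434 = -1 mod 2^64 and the Montgomery constant
# -p434^(-1) mod 2^64 is 1, so no per-word multiplication by that constant is
# needed: each step multiplies the next input words directly by p434+1 and
# accumulates, and only the four non-zero words of p434+1 (.Lp434p1) take part.
# The input is a 14-word double-length value; the output is 7 words.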
sub rdc {
my $mul01=&mul128x256(
"x2","x3", # A0-A1
"x23","x24","x25","x26", # B0-B3
"x4","x5","x6","x7","x8","x9", # C0-C5
"x10","x11","x27","x28"); # TMP
my $mul23=&mul128x256(
"x2","x10", # A0-A1
"x23","x24","x25","x26", # B0-B3
"x4","x5","x6","x7","x8","x9", # C0-C5
"x0","x3","x27","x28"); # TMP
my $mul45=&mul128x256(
"x11","x12", # A0-A1
"x23","x24","x25","x26", # B0-B3
"x4","x5","x6","x7","x8","x9", # C0-C5
"x10","x3","x27","x28"); # TMP
my $mul67=&mul64x256(
"x13", # A0
"x23","x24","x25","x26", # B0-B3
"x4","x5","x6","x7","x8", # C0-C4
"x10","x27","x28"); # TMP
my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
stp x29, x30, [sp, #-96]!
add x29, sp, xzr
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
ldp x2, x3, [x0,#0] // a[0-1]
// Load the four non-zero words of p434+1
adrp x26, :pg_hi21:.Lp434p1
add x26, x26, :lo12:.Lp434p1
ldp x23, x24, [x26, #0x0]
ldp x25, x26, [x26,#0x10]
// a[0-1] * p434+1
$mul01
ldp x10, x11, [x0, #0x18]
ldp x12, x13, [x0, #0x28]
ldp x14, x15, [x0, #0x38]
ldp x16, x17, [x0, #0x48]
ldp x19, x20, [x0, #0x58]
ldr x21, [x0, #0x68]
adds x10, x10, x4
adcs x11, x11, x5
adcs x12, x12, x6
adcs x13, x13, x7
adcs x14, x14, x8
adcs x15, x15, x9
adcs x22, x16, xzr
adcs x17, x17, xzr
adcs x19, x19, xzr
adcs x20, x20, xzr
adc x21, x21, xzr
ldr x2, [x0,#0x10] // a[2]
// a[2-3] * p434+1
$mul23
adds x12, x12, x4
adcs x13, x13, x5
adcs x14, x14, x6
adcs x15, x15, x7
adcs x16, x22, x8
adcs x17, x17, x9
adcs x22, x19, xzr
adcs x20, x20, xzr
adc x21, x21, xzr
$mul45
adds x14, x14, x4
adcs x15, x15, x5
adcs x16, x16, x6
adcs x17, x17, x7
adcs x19, x22, x8
adcs x20, x20, x9
adc x22, x21, xzr
stp x14, x15, [x1, #0x0] // C0, C1
$mul67
adds x16, x16, x4
adcs x17, x17, x5
adcs x19, x19, x6
adcs x20, x20, x7
adc x21, x22, x8
str x16, [x1, #0x10]
stp x17, x19, [x1, #0x18]
stp x20, x21, [x1, #0x28]
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldp x29, x30, [sp],#96
ret
___
return $body;
}
$code.=&rdc();
# Field addition
# Operation: c [x2] = a [x0] + b [x1]
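# Computes a + b, subtracts 2*p434 and adds it back masked by the borrow, so
# the result stays in [0, 2*p434) in constant time (operands are assumed to be
# below 2*p434). x12 is reused for words 1 and 2 of 2*p434, which are equal.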
$code.=<<___;
.global ${PREFIX}_fpadd
.align 4
${PREFIX}_fpadd:
stp x29,x30, [sp,#-16]!
add x29, sp, #0
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
ldr x17, [x1,#48]
// Add a + b
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
adcs x7, x7, x15
adcs x8, x8, x16
adc x9, x9, x17
// Subtract 2xp434
adrp x17, :pg_hi21:.Lp434x2
add x17, x17, :lo12:.Lp434x2
ldp x11, x12, [x17, #0]
ldp x13, x14, [x17, #16]
ldp x15, x16, [x17, #32]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x12
sbcs x6, x6, x13
sbcs x7, x7, x14
sbcs x8, x8, x15
sbcs x9, x9, x16
sbc x0, xzr, xzr // x0 can be reused now
// Add 2xp434 anded with the mask in x0
and x11, x11, x0
and x12, x12, x0
and x13, x13, x0
and x14, x14, x0
and x15, x15, x0
and x16, x16, x0
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adc x9, x9, x16
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
___
# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
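# Computes a - b and, if the subtraction borrows, adds back 2*p434 masked by
# the borrow mask in x0, keeping the result in [0, 2*p434) for reduced inputs.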
$code.=<<___;
.global ${PREFIX}_fpsub
.align 4
${PREFIX}_fpsub:
stp x29, x30, [sp,#-16]!
add x29, sp, #0
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
ldr x17, [x1,#48]
// Subtract a - b
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbc x0, xzr, xzr
// Load 2xp434
adrp x17, :pg_hi21:.Lp434x2
add x17, x17, :lo12:.Lp434x2
ldp x11, x12, [x17, #0]
ldp x13, x14, [x17, #16]
ldp x15, x16, [x17, #32]
// Add 2xp434 anded with the mask in x0
and x11, x11, x0
and x12, x12, x0
and x13, x13, x0
and x14, x14, x0
and x15, x15, x0
and x16, x16, x0
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adc x9, x9, x16
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
___
# 434-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
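# Plain 7-word addition; the final carry is discarded, which is safe as long
# as a + b < 2^448 (true for operands below 2*p434).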
$code.=<<___;
.global ${PREFIX}_mpadd_asm
.align 4
${PREFIX}_mpadd_asm:
stp x29, x30, [sp,#-16]!
add x29, sp, #0
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
ldr x17, [x1,#48]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
adcs x7, x7, x15
adcs x8, x8, x16
adc x9, x9, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
___
# 2x434-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1].
# Returns borrow mask
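# Operands are 14 words (2x434-bit values stored in 112 bytes); x0 returns 0
# when no borrow occurred and all ones otherwise.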
$code.=<<___;
.global ${PREFIX}_mpsubx2_asm
.align 4
${PREFIX}_mpsubx2_asm:
stp x29, x30, [sp,#-16]!
add x29, sp, #0
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x11, x12, [x1,#32]
ldp x13, x14, [x1,#48]
sbcs x7, x7, x11
sbcs x8, x8, x12
sbcs x9, x9, x13
sbcs x10, x10, x14
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ldp x3, x4, [x0,#64]
ldp x5, x6, [x0,#80]
ldp x11, x12, [x1,#64]
ldp x13, x14, [x1,#80]
sbcs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#96]
ldp x11, x12, [x1,#96]
sbcs x7, x7, x11
sbcs x8, x8, x12
sbc x0, xzr, xzr
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
ldp x29, x30, [sp],#16
ret
___
# Double 2x434-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
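# The 14-word result is processed in three chunks. Within a chunk, x9 counts
# the subtraction chains that did not borrow (0, 1 or 2); 2 - x9 is then the
# number of borrows to propagate into the next chunk.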
$code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
stp x29, x30, [sp, #-16]!
add x29, sp, #0
ldp x3, x4, [x2, #0]
ldp x5, x6, [x2,#16]
ldp x7, x8, [x2,#32]
ldp x11, x12, [x0, #0]
ldp x13, x14, [x0,#16]
ldp x15, x16, [x0,#32]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
sbcs x7, x7, x15
sbcs x8, x8, x16
// x9 <- 1 if the chain did not borrow, 0 otherwise
adc x9, xzr, xzr
ldp x11, x12, [x1, #0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
sbcs x7, x7, x15
sbcs x8, x8, x16
adc x9, x9, xzr
stp x3, x4, [x2, #0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
ldp x3, x4, [x2,#48]
ldp x5, x6, [x2,#64]
ldp x7, x8, [x2,#80]
ldp x11, x12, [x0,#48]
ldp x13, x14, [x0,#64]
ldp x15, x16, [x0,#80]
// x9 = 2 - x9, the number of borrows to propagate from the low chunk
neg x9, x9
add x9, x9, #2
subs x3, x3, x9
sbcs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
sbcs x7, x7, x15
sbcs x8, x8, x16
adc x9, xzr, xzr
ldp x11, x12, [x1,#48]
ldp x13, x14, [x1,#64]
ldp x15, x16, [x1,#80]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
sbcs x7, x7, x15
sbcs x8, x8, x16
adc x9, x9, xzr
stp x3, x4, [x2,#48]
stp x5, x6, [x2,#64]
stp x7, x8, [x2,#80]
ldp x3, x4, [x2,#96]
ldp x11, x12, [x0,#96]
ldp x13, x14, [x1,#96]
// x9 = 2 - x9, the number of borrows to propagate from the middle chunk
neg x9, x9
add x9, x9, #2
subs x3, x3, x9
sbcs x3, x3, x11
sbcs x4, x4, x12
subs x3, x3, x13
sbc x4, x4, x14
stp x3, x4, [x2,#96]
ldp x29, x30, [sp],#16
ret
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
print $_,"\n";
}
close STDOUT;