#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in aarch64 assembly for SIDH/p434

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="sike";

$code.=<<___;
.section .rodata

# p434 x 2
.Lp434x2:
    .quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
    .quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
    .quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688

# p434 + 1
.Lp434p1:
    .quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3
    .quad 0x6CFC5FD681C52056, 0x0002341F27177344
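
# Note: p434 = 2^216*3^137 - 1.
# .Lp434x2 holds 2*p434 in six quadwords: its second and third limbs are both
# all-ones, so one stored word serves for both when the constant is loaded.
# .Lp434p1 holds only the upper four limbs of p434 + 1; the lower three limbs
# are zero and are skipped by the Montgomery reduction (sike_fprdc) below.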

.text
___

# Computes C0-C2 = A0 * (B0-B1)
# A0 remains intact; B0-B1 are overwritten
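# Note: the incoming values of C0, C1 and C2 are folded into the result (C2 is
# added into the low limb first), so a caller can use this as one step of a
# larger Comba-style accumulation; see mul256_karatsuba below.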
sub mul64x128 {
  my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
  my $body=<<___;
    mul $T1, $A0, $B0
    umulh $B0, $A0, $B0
    adds $C0, $C0, $C2
    adc $C1, $C1, xzr

    mul $T0, $A0, $B1
    umulh $B1, $A0, $B1
    adds $C0, $C0, $T1
    adcs $C1, $C1, $B0
    adc $C2, xzr, xzr

    adds $C1, $C1, $T0
    adc $C2, $C2, $B1
___
  return $body;
}

# Computes C0-C4 = A0 * (B0-B3)
# Inputs remain intact
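# Note: unlike mul64x128 above, nothing is accumulated in; C0-C4 receive the
# plain 320-bit product. rdc() below uses this for the last reduction step.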
sub mul64x256 {
  my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
  my $body=<<___;
    mul $C0, $A0, $B0 // C0
    umulh $T0, $A0, $B0

    mul $C1, $A0, $B1
    umulh $T1, $A0, $B1
    adds $C1, $C1, $T0 // C1
    adc $T0, xzr, xzr

    mul $C2, $A0, $B2
    umulh $T2, $A0, $B2
    adds $T1, $T0, $T1
    adcs $C2, $C2, $T1 // C2
    adc $T0, xzr, xzr

    mul $C3, $A0, $B3
    umulh $C4, $A0, $B3
    adds $T2, $T0, $T2
    adcs $C3, $C3, $T2 // C3
    adc $C4, $C4, xzr // C4
___
  return $body;
}

# Computes C0-C5 = (A0-A1) * (B0-B3)
# Inputs remain intact
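# Note: the 384-bit product fills C0-C5. rdc() below calls this to multiply
# limb pairs of the operand by the four stored limbs of p434 + 1.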
sub mul128x256 {
  my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
  my $body=<<___;
    mul $C0, $A0, $B0 // C0
    umulh $C3, $A0, $B0

    mul $C1, $A0, $B1
    umulh $C2, $A0, $B1

    mul $T0, $A1, $B0
    umulh $T1, $A1, $B0
    adds $C1, $C1, $C3
    adc $C2, $C2, xzr

    mul $T2, $A0, $B2
    umulh $T3, $A0, $B2
    adds $C1, $C1, $T0 // C1
    adcs $C2, $C2, $T1
    adc $C3, xzr, xzr

    mul $T0, $A1, $B1
    umulh $T1, $A1, $B1
    adds $C2, $C2, $T2
    adcs $C3, $C3, $T3
    adc $C4, xzr, xzr

    mul $T2, $A0, $B3
    umulh $T3, $A0, $B3
    adds $C2, $C2, $T0 // C2
    adcs $C3, $C3, $T1
    adc $C4, $C4, xzr

    mul $T0, $A1, $B2
    umulh $T1, $A1, $B2
    adds $C3, $C3, $T2
    adcs $C4, $C4, $T3
    adc $C5, xzr, xzr

    mul $T2, $A1, $B3
    umulh $T3, $A1, $B3
    adds $C3, $C3, $T0 // C3
    adcs $C4, $C4, $T1
    adc $C5, $C5, xzr
    adds $C4, $C4, $T2 // C4
    adc $C5, $C5, $T3 // C5

___
  return $body;
}

# Computes C0-C5 = (A0-A2) * (B0-B2)
# Inputs remain intact
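# Note: 3x3-limb Comba product; mul() below uses it for the AH x BH half of
# the 7-limb multiplication.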
sub mul192 {
  my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
  my $body=<<___;

    // A0 * B0
    mul $C0, $A0, $B0 // C0
    umulh $C3, $A0, $B0

    // A0 * B1
    mul $C1, $A0, $B1
    umulh $C2, $A0, $B1

    // A1 * B0
    mul $T0, $A1, $B0
    umulh $T1, $A1, $B0
    adds $C1, $C1, $C3
    adc $C2, $C2, xzr

    // A0 * B2
    mul $T2, $A0, $B2
    umulh $T3, $A0, $B2
    adds $C1, $C1, $T0 // C1
    adcs $C2, $C2, $T1
    adc $C3, xzr, xzr

    // A2 * B0
    mul $T0, $A2, $B0
    umulh $C4, $A2, $B0
    adds $C2, $C2, $T2
    adcs $C3, $C3, $C4
    adc $C4, xzr, xzr

    // A1 * B1
    mul $T2, $A1, $B1
    umulh $T1, $A1, $B1
    adds $C2, $C2, $T0
    adcs $C3, $C3, $T3
    adc $C4, $C4, xzr

    // A1 * B2
    mul $T0, $A1, $B2
    umulh $T3, $A1, $B2
    adds $C2, $C2, $T2 // C2
    adcs $C3, $C3, $T1
    adc $C4, $C4, xzr

    // A2 * B1
    mul $T2, $A2, $B1
    umulh $T1, $A2, $B1
    adds $C3, $C3, $T0
    adcs $C4, $C4, $T3
    adc $C5, xzr, xzr

    // A2 * B2
    mul $T0, $A2, $B2
    umulh $T3, $A2, $B2
    adds $C3, $C3, $T2 // C3
    adcs $C4, $C4, $T1
    adc $C5, $C5, xzr

    adds $C4, $C4, $T0 // C4
    adc $C5, $C5, $T3 // C5
___
  return $body;
}
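
# Computes C0-C7 = (A0-A3) * (B0-B3) with one level of Karatsuba (two 128-bit
# halves), using mul64x128 for the sub-products. Unlike the helpers above, the
# input registers are clobbered; A0-A1 are reloaded from memory at [$M], which
# the caller must have populated with their current values.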
sub mul256_karatsuba {
  my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
  # (AH+AL) x (BH+BL), low part
  my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
  # AL x BL
  my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
  # AH x BH
  my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
  my $body=<<___;
    // A0-A1 <- AH + AL, T0 <- mask
    adds $A0, $A0, $A2
    adcs $A1, $A1, $A3
    adc $T0, xzr, xzr

    // C6, T1 <- BH + BL, C7 <- mask
    adds $C6, $B0, $B2
    adcs $T1, $B1, $B3
    adc $C7, xzr, xzr

    // C0-C1 <- masked (BH + BL)
    sub $C2, xzr, $T0
    sub $C3, xzr, $C7
    and $C0, $C6, $C2
    and $C1, $T1, $C2

    // C4-C5 <- masked (AH + AL), T0 <- combined carry
    and $C4, $A0, $C3
    and $C5, $A1, $C3
    mul $C2, $A0, $C6
    mul $C3, $A0, $T1
    and $T0, $T0, $C7

    // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
    adds $C0, $C4, $C0
    umulh $C4, $A0, $T1
    adcs $C1, $C5, $C1
    umulh $C5, $A0, $C6
    adc $T0, $T0, xzr

    // C2-C5 <- (AH+AL) x (BH+BL), low part
    $mul_low
    ldp $A0, $A1, [$M,#0]

    // C2-C5, T0 <- (AH+AL) x (BH+BL), final part
    adds $C4, $C0, $C4
    umulh $C7, $A0, $B0
    umulh $T1, $A0, $B1
    adcs $C5, $C1, $C5
    mul $C0, $A0, $B0
    mul $C1, $A0, $B1
    adc $T0, $T0, xzr

    // C0-C1, T1, C7 <- AL x BL
    $mul_albl

    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
    mul $A0, $A2, $B2
    umulh $B0, $A2, $B2
    subs $C2, $C2, $C0
    sbcs $C3, $C3, $C1
    sbcs $C4, $C4, $T1
    mul $A1, $A2, $B3
    umulh $C6, $A2, $B3
    sbcs $C5, $C5, $C7
    sbc $T0, $T0, xzr

    // A0, A1, C6, B0 <- AH x BH
    $mul_ahbh

    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    subs $C2, $C2, $A0
    sbcs $C3, $C3, $A1
    sbcs $C4, $C4, $C6
    sbcs $C5, $C5, $B0
    sbc $T0, $T0, xzr

    adds $C2, $C2, $T1
    adcs $C3, $C3, $C7
    adcs $C4, $C4, $A0
    adcs $C5, $C5, $A1
    adcs $C6, $T0, $C6
    adc $C7, $B0, xzr
___
  return $body;
}

# 448-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
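# The 7-limb operands are split as AL = a[0..3] (256 bits) and AH = a[4..6]
# (192 bits). AH+AL and BH+BL can each carry out of four limbs; rather than
# keeping a fifth limb, the carry bits are turned into all-ones masks and the
# opposite operand is conditionally added into the upper half of the middle
# product (the standard masked-Karatsuba trick).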
sub mul {
  # (AH+AL) x (BH+BL), low part
  my $mul_kc_low=&mul256_karatsuba(
    "x2",                                            # M0
    "x3","x4","x5","x6",                             # A0-A3
    "x10","x11","x12","x13",                         # B0-B3
    "x8","x9","x19","x20","x21","x22","x23","x24",   # C0-C7
    "x25","x26");                                    # TMP
  # AL x BL
  my $mul_albl=&mul256_karatsuba(
    "x0",                                            # M0
    "x3","x4","x5","x6",                             # A0-A3
    "x10","x11","x12","x13",                         # B0-B3
    "x21","x22","x23","x24","x25","x26","x27","x28", # C0-C7
    "x8","x9");                                      # TMP
  # AH x BH
  my $mul_ahbh=&mul192(
    "x3","x4","x5",                                  # A0-A2
    "x10","x11","x12",                               # B0-B2
    "x21","x22","x23","x24","x25","x26",             # C0-C5
    "x8","x9","x27","x28");                          # TMP

  my $body=<<___;
.global ${PREFIX}_mpmul
.align 4
${PREFIX}_mpmul:
    stp x29, x30, [sp,#-96]!
    add x29, sp, #0
    stp x19, x20, [sp,#16]
    stp x21, x22, [sp,#32]
    stp x23, x24, [sp,#48]
    stp x25, x26, [sp,#64]
    stp x27, x28, [sp,#80]

    ldp x3, x4, [x0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x10, x11, [x1,#0]
    ldp x12, x13, [x1,#16]
    ldp x14, x15, [x1,#32]
    ldr x16, [x1,#48]

    // x3-x7 <- AH + AL, x7 <- carry
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adcs x6, x6, xzr
    adc x7, xzr, xzr

    // x10-x13 <- BH + BL, x8 <- carry
    adds x10, x10, x14
    adcs x11, x11, x15
    adcs x12, x12, x16
    adcs x13, x13, xzr
    adc x8, xzr, xzr

    // x9 <- combined carry
    and x9, x7, x8
    // x7-x8 <- mask
    sub x7, xzr, x7
    sub x8, xzr, x8

    // x14-x17 <- masked (BH + BL)
    and x14, x10, x7
    and x15, x11, x7
    and x16, x12, x7
    and x17, x13, x7

    // x20-x23 <- masked (AH + AL)
    and x20, x3, x8
    and x21, x4, x8
    and x22, x5, x8
    and x23, x6, x8

    // x14-x17, x7 <- masked (AH+AL) + masked (BH+BL), step 1
    adds x14, x14, x20
    adcs x15, x15, x21
    adcs x16, x16, x22
    adcs x17, x17, x23
    adc x7, x9, xzr

    // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
    stp x3, x4, [x2,#0]
    $mul_kc_low

    // x14-x17, x7 <- (AH+AL) x (BH+BL), final step
    adds x14, x14, x21
    adcs x15, x15, x22
    adcs x16, x16, x23
    adcs x17, x17, x24
    adc x7, x7, xzr

    // Load AL
    ldp x3, x4, [x0]
    ldp x5, x6, [x0,#16]
    // Load BL
    ldp x10, x11, [x1,#0]
    ldp x12, x13, [x1,#16]

    // Temporarily store x8, x9 in x2
    stp x8, x9, [x2,#0]
    // x21-x28 <- AL x BL
    $mul_albl
    // Restore x8, x9
    ldp x8, x9, [x2,#0]

    // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL
    subs x8, x8, x21
    sbcs x9, x9, x22
    sbcs x19, x19, x23
    sbcs x20, x20, x24
    sbcs x14, x14, x25
    sbcs x15, x15, x26
    sbcs x16, x16, x27
    sbcs x17, x17, x28
    sbc x7, x7, xzr

    // Store ALxBL, low
    stp x21, x22, [x2]
    stp x23, x24, [x2,#16]

    // Load AH
    ldp x3, x4, [x0,#32]
    ldr x5, [x0,#48]
    // Load BH
    ldp x10, x11, [x1,#32]
    ldr x12, [x1,#48]

    adds x8, x8, x25
    adcs x9, x9, x26
    adcs x19, x19, x27
    adcs x20, x20, x28
    adc x1, xzr, xzr

    add x0, x0, #32
    // Temporarily store x8, x9 in x2
    stp x8, x9, [x2,#32]
    // x21-x28 <- AH x BH
    $mul_ahbh
    // Restore x8, x9
    ldp x8, x9, [x2,#32]

    neg x1, x1
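    // x1 holds the carry out of the accumulation above; negating it here lets
    // the "adds x1, x1, #1" below re-create that carry in the flags for the
    // final column additions.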

    // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    subs x8, x8, x21
    sbcs x9, x9, x22
    sbcs x19, x19, x23
    sbcs x20, x20, x24
    sbcs x14, x14, x25
    sbcs x15, x15, x26
    sbcs x16, x16, xzr
    sbcs x17, x17, xzr
    sbc x7, x7, xzr

    // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
    stp x8, x9, [x2,#32]
    stp x19, x20, [x2,#48]

    adds x1, x1, #1
    adcs x14, x14, x21
    adcs x15, x15, x22
    adcs x16, x16, x23
    adcs x17, x17, x24
    adcs x25, x7, x25
    adc x26, x26, xzr

    stp x14, x15, [x2,#64]
    stp x16, x17, [x2,#80]
    stp x25, x26, [x2,#96]

    ldp x19, x20, [x29,#16]
    ldp x21, x22, [x29,#32]
    ldp x23, x24, [x29,#48]
    ldp x25, x26, [x29,#64]
    ldp x27, x28, [x29,#80]
    ldp x29, x30, [sp],#96
    ret
___
  return $body;
}
$code.=&mul();

# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
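# Each step multiplies a pair of input limbs (a[0-1], a[2-3], a[4-5], then
# a[6] alone) by the four stored limbs of p434 + 1 (mul128x256 / mul64x256)
# and folds the product into the running value; the upper seven limbs of the
# result are written to mc.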
sub rdc {
  my $mul01=&mul128x256(
    "x2","x3",                       # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x10","x11","x27","x28");        # TMP
  my $mul23=&mul128x256(
    "x2","x10",                      # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x0","x3","x27","x28");          # TMP
  my $mul45=&mul128x256(
    "x11","x12",                     # A0-A1
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8","x9",   # C0-C5
    "x10","x3","x27","x28");         # TMP
  my $mul67=&mul64x256(
    "x13",                           # A0
    "x23","x24","x25","x26",         # B0-B3
    "x4","x5","x6","x7","x8",        # C0-C4
    "x10","x27","x28");              # TMP
  my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
    stp x29, x30, [sp, #-96]!
    add x29, sp, xzr
    stp x19, x20, [sp,#16]
    stp x21, x22, [sp,#32]
    stp x23, x24, [sp,#48]
    stp x25, x26, [sp,#64]
    stp x27, x28, [sp,#80]

    ldp x2, x3, [x0,#0] // a[0-1]

    // Load the prime constant
    adrp x26, :pg_hi21:.Lp434p1
    add x26, x26, :lo12:.Lp434p1
    ldp x23, x24, [x26, #0x0]
    ldp x25, x26, [x26,#0x10]

    // a[0-1] * p434+1
    $mul01

    ldp x10, x11, [x0, #0x18]
    ldp x12, x13, [x0, #0x28]
    ldp x14, x15, [x0, #0x38]
    ldp x16, x17, [x0, #0x48]
    ldp x19, x20, [x0, #0x58]
    ldr x21, [x0, #0x68]

    adds x10, x10, x4
    adcs x11, x11, x5
    adcs x12, x12, x6
    adcs x13, x13, x7
    adcs x14, x14, x8
    adcs x15, x15, x9
    adcs x22, x16, xzr
    adcs x17, x17, xzr
    adcs x19, x19, xzr
    adcs x20, x20, xzr
    adc x21, x21, xzr

    ldr x2, [x0,#0x10] // a[2]
    // a[2-3] * p434+1
    $mul23

    adds x12, x12, x4
    adcs x13, x13, x5
    adcs x14, x14, x6
    adcs x15, x15, x7
    adcs x16, x22, x8
    adcs x17, x17, x9
    adcs x22, x19, xzr
    adcs x20, x20, xzr
    adc x21, x21, xzr

    $mul45
    adds x14, x14, x4
    adcs x15, x15, x5
    adcs x16, x16, x6
    adcs x17, x17, x7
    adcs x19, x22, x8
    adcs x20, x20, x9
    adc x22, x21, xzr

    stp x14, x15, [x1, #0x0] // C0, C1

    $mul67
    adds x16, x16, x4
    adcs x17, x17, x5
    adcs x19, x19, x6
    adcs x20, x20, x7
    adc x21, x22, x8

    str x16, [x1, #0x10]
    stp x17, x19, [x1, #0x18]
    stp x20, x21, [x1, #0x28]

    ldp x19, x20, [x29,#16]
    ldp x21, x22, [x29,#32]
    ldp x23, x24, [x29,#48]
    ldp x25, x26, [x29,#64]
    ldp x27, x28, [x29,#80]
    ldp x29, x30, [sp],#96
    ret
___
  return $body;
}
$code.=&rdc();

# Field addition
# Operation: c [x2] = a [x0] + b [x1]
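# The sum is reduced by conditionally subtracting 2*p434: 2*p434 is subtracted
# and then added back under the borrow mask produced in x0, keeping the result
# in [0, 2*p434) without branching.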
$code.=<<___;
.global ${PREFIX}_fpadd
.align 4
${PREFIX}_fpadd:
    stp x29,x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    // Add a + b
    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x13
    adcs x6, x6, x14
    adcs x7, x7, x15
    adcs x8, x8, x16
    adc x9, x9, x17

    // Subtract 2xp434
    adrp x17, :pg_hi21:.Lp434x2
    add x17, x17, :lo12:.Lp434x2
    ldp x11, x12, [x17, #0]
    ldp x13, x14, [x17, #16]
    ldp x15, x16, [x17, #32]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x12
    sbcs x6, x6, x13
    sbcs x7, x7, x14
    sbcs x8, x8, x15
    sbcs x9, x9, x16
    sbc x0, xzr, xzr // x0 can be reused now

    // Add 2xp434 anded with the mask in x0
    and x11, x11, x0
    and x12, x12, x0
    and x13, x13, x0
    and x14, x14, x0
    and x15, x15, x0
    and x16, x16, x0

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x12
    adcs x6, x6, x13
    adcs x7, x7, x14
    adcs x8, x8, x15
    adc x9, x9, x16

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___

# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
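# Computes a - b; if the subtraction borrows, 2*p434 (selected by the borrow
# mask in x0) is added back, so the result stays in [0, 2*p434).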
$code.=<<___;
.global ${PREFIX}_fpsub
.align 4
${PREFIX}_fpsub:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    // Subtract a - b
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    sbcs x9, x9, x17
    sbc x0, xzr, xzr

    // Load 2xp434
    adrp x17, :pg_hi21:.Lp434x2
    add x17, x17, :lo12:.Lp434x2

    ldp x11, x12, [x17, #0]
    ldp x13, x14, [x17, #16]
    ldp x15, x16, [x17, #32]

    // Add 2xp434 anded with the mask in x0
    and x11, x11, x0
    and x12, x12, x0
    and x13, x13, x0
    and x14, x14, x0
    and x15, x15, x0
    and x16, x16, x0

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x12
    adcs x6, x6, x13
    adcs x7, x7, x14
    adcs x8, x8, x15
    adc x9, x9, x16

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___

# 434-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd_asm
.align 4
${PREFIX}_mpadd_asm:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x7, x8, [x0,#32]
    ldr x9, [x0,#48]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    ldr x17, [x1,#48]

    adds x3, x3, x11
    adcs x4, x4, x12
    adcs x5, x5, x13
    adcs x6, x6, x14
    adcs x7, x7, x15
    adcs x8, x8, x16
    adc x9, x9, x17

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    str x9, [x2,#48]

    ldp x29, x30, [sp],#16
    ret
___

# 2x434-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1].
# Returns borrow mask
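# The borrow of the 2x434-bit (14-limb) subtraction is returned in x0 as a
# mask (0 or all ones), so the caller can apply a conditional correction
# without branching.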
$code.=<<___;
.global ${PREFIX}_mpsubx2_asm
.align 4
${PREFIX}_mpsubx2_asm:
    stp x29, x30, [sp,#-16]!
    add x29, sp, #0

    ldp x3, x4, [x0,#0]
    ldp x5, x6, [x0,#16]
    ldp x11, x12, [x1,#0]
    ldp x13, x14, [x1,#16]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    ldp x7, x8, [x0,#32]
    ldp x9, x10, [x0,#48]
    ldp x11, x12, [x1,#32]
    ldp x13, x14, [x1,#48]
    sbcs x7, x7, x11
    sbcs x8, x8, x12
    sbcs x9, x9, x13
    sbcs x10, x10, x14

    stp x3, x4, [x2,#0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]
    stp x9, x10, [x2,#48]

    ldp x3, x4, [x0,#64]
    ldp x5, x6, [x0,#80]
    ldp x11, x12, [x1,#64]
    ldp x13, x14, [x1,#80]
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    ldp x7, x8, [x0,#96]
    ldp x11, x12, [x1,#96]
    sbcs x7, x7, x11
    sbcs x8, x8, x12
    sbc x0, xzr, xzr

    stp x3, x4, [x2,#64]
    stp x5, x6, [x2,#80]
    stp x7, x8, [x2,#96]

    ldp x29, x30, [sp],#16
    ret
___

# Double 2x434-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
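# The subtraction runs in 6-limb chunks. After each chunk, x9 accumulates the
# carry flags of the two sbcs chains (0..2); "2 - x9" is then the total borrow,
# which the leading "subs x3, x3, x9" of the next chunk propagates forward.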
$code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
    stp x29, x30, [sp, #-16]!
    add x29, sp, #0

    ldp x3, x4, [x2, #0]
    ldp x5, x6, [x2,#16]
    ldp x7, x8, [x2,#32]

    ldp x11, x12, [x0, #0]
    ldp x13, x14, [x0,#16]
    ldp x15, x16, [x0,#32]

    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16

    // x9 stores carry
    adc x9, xzr, xzr

    ldp x11, x12, [x1, #0]
    ldp x13, x14, [x1,#16]
    ldp x15, x16, [x1,#32]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, x9, xzr

    stp x3, x4, [x2, #0]
    stp x5, x6, [x2,#16]
    stp x7, x8, [x2,#32]

    ldp x3, x4, [x2,#48]
    ldp x5, x6, [x2,#64]
    ldp x7, x8, [x2,#80]

    ldp x11, x12, [x0,#48]
    ldp x13, x14, [x0,#64]
    ldp x15, x16, [x0,#80]

    // x9 = 2 - x9
    neg x9, x9
    add x9, x9, #2

    subs x3, x3, x9
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, xzr, xzr

    ldp x11, x12, [x1,#48]
    ldp x13, x14, [x1,#64]
    ldp x15, x16, [x1,#80]
    subs x3, x3, x11
    sbcs x4, x4, x12
    sbcs x5, x5, x13
    sbcs x6, x6, x14
    sbcs x7, x7, x15
    sbcs x8, x8, x16
    adc x9, x9, xzr

    stp x3, x4, [x2,#48]
    stp x5, x6, [x2,#64]
    stp x7, x8, [x2,#80]

    ldp x3, x4, [x2,#96]
    ldp x11, x12, [x0,#96]
    ldp x13, x14, [x1,#96]

    // x9 = 2 - x9
    neg x9, x9
    add x9, x9, #2

    subs x3, x3, x9
    sbcs x3, x3, x11
    sbcs x4, x4, x12
    subs x3, x3, x13
    sbc x4, x4, x14
    stp x3, x4, [x2,#96]

    ldp x29, x30, [sp],#16
    ret
___

foreach (split("\n",$code)) {
  s/\`([^\`]*)\`/eval($1)/ge;
  print $_,"\n";
}

close STDOUT;