Nagram/TMessagesProj/jni/boringssl/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl

#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
# NEON, the low and high halves of the 128-bit register q0 are accessible as
# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
# vN. Where the 32-bit version would use the upper half, this file must keep
# halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
#     vshl.i64 q0, q0, #1
#
# in 64-bit, it would be written:
#
#     shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
# polynomial and is conditioned on the PMULL extension. This file emulates the
# latter with the former.

use strict;

# Command-line handling, following the usual perlasm convention:
#
#     <this script> [flavour] <output file>
#
# If the first argument looks like a file name (word characters, a dot and an
# extension) it is taken as the output file and there is no flavour. Otherwise
# the first argument is the flavour and the remaining arguments are scanned
# until one that looks like a file name (or a false value) is found.
my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

# Redirect STDOUT to the final destination. With a real flavour the generated
# text is piped through arm-xlate.pl, which is looked up next to this script
# and then in ../../../perlasm/. With no flavour, or flavour "void", the text
# is written straight to $output.
#
# Both opens are checked: without the checks a failed open would leave STDOUT
# aliased to an unopened handle, every print would fail silently, and the only
# symptom would be a vague error from the final close.
if ($flavour && $flavour ne "void") {
    # Directory part of the path this script was invoked as (with trailing
    # separator); $1 is consumed immediately on the next line.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    my $dir = $1;
    my $xlate;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # $^X is the running perl binary, so the translator uses the same perl.
    open OUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    open OUT,">$output"
        or die "can't open output file '" . (defined $output ? $output : "") . "': $!";
    *STDOUT=*OUT;
}

# Integer registers x0..x3 form the argument block.
my ($Xi, $Htbl, $inp, $len) = map { "x$_" } 0 .. 3;

# SIMD working registers: v0..v4 for the product pieces and the split input,
# v5..v7 for the key halves and their Karatsuba sum.
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map { "v$_" } 0 .. 4;
my ($Hlo, $Hhi, $Hhl)            = map { "v$_" } 5 .. 7;

# Temporaries and masks start at v16: d8-d15 are callee-saved, so v8-v15 are
# skipped. AArch64 SIMD has plenty of registers to spare.
my ($t0, $t1, $t2, $t3)                     = map { "v$_" } 16 .. 19;
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map { "v$_" } 20 .. 23;
my ($k48_k32, $k16_k0)                      = map { "v$_" } 24 .. 25;

# Accumulates the generated assembly text; it is printed at the end.
my $code = q{};

# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d using only the
# 8-bit polynomial multiplier (pmull .8h, .8b, .8b) applied to byte-rotated
# copies of the operands.
#
# Arguments are register names (strings such as "v0"):
#   $r - destination register for the product.
#   $a - first operand, read through its low 64 bits ($a.8b).
#   $b - second operand, read through its low 64 bits ($b.8b).
#
# Requirements on the caller:
#   - $r, $a, and $b must be distinct from $t* and $k*. $t0-$t3 and the paired
#     temporaries $t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h are clobbered by the
#     emitted code.
#   - $r is written as scratch while $a and $b are still being read, so it
#     must not name the same register as either operand.
#   - $k48_k32 and $k16_k0 are only read here; they must already hold the
#     masks when the emitted code runs.
#
# The sub has no useful return value: its effect is to append text to $code.
sub clmul64x64 {
# These lexicals shadow Perl's sort() variables $a/$b inside this sub, which is
# harmless here because the sub never sorts.
my ($r, $a, $b) = @_;
# Interpolating here-doc: register names are substituted now, and \$ keeps a
# literal '$' in the assembly comments that quote the 32-bit version.
$code .= <<___;
	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
	ext	$r.8b, $b.8b, $b.8b, #1		// B1
	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
	ext	$r.8b, $b.8b, $b.8b, #3		// B3
	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
	//     vand	\$t0#hi, \$t0#hi, \$k48
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
	//
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
	//     vand	\$t1#hi, \$t1#hi, \$k32
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
	//
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
	//     vand	\$t2#hi, \$t2#hi, \$k16
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
	//
	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	\$t3#hi, #0
	//
	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d
	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d
	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d
	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d

	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
	eor	$t0.16b, $t0.16b, $t1.16b
	eor	$t2.16b, $t2.16b, $t3.16b
	eor	$r.16b, $r.16b, $t0.16b
	eor	$r.16b, $r.16b, $t2.16b
___
}

# First part of the generated module. It emits three global symbols:
#
#   gcm_init_neon  - loads H from [x1], computes the "twisted" H and stores it
#                    at [x0].
#   gcm_gmult_neon - loads Xi and twisted H plus the .Lmasks constants, sets
#                    the length to 16 and branches to .Lgmult_neon inside
#                    gcm_ghash_neon, so both entry points share one multiply
#                    and reduction tail.
#   gcm_ghash_neon - same setup, then loops over the input in 16-byte blocks
#                    starting at .Loop_neon.
#
# The here-doc deliberately stops in the middle of gcm_ghash_neon, right after
# the input has been split into $INlo/$INhi, so that the output of clmul64x64
# can be appended next.
$code .= <<___;
.text

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{$t1.2d}, [x1]			// load H
	movi	$t3.16b, #0xe1
	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
	ext	$INlo.16b, $t1.16b, $t1.16b, #8
	ushr	$t2.2d, $t3.2d, #63
	dup	$t1.4s, $t1.s[1]
	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
	ushr	$t2.2d, $INlo.2d, #63
	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
	and	$t2.16b, $t2.16b, $t0.16b
	shl	$INlo.2d, $INlo.2d, #1
	ext	$t2.16b, $t2.16b, $t2.16b, #8
	and	$t0.16b, $t0.16b, $t1.16b
	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
	st1	{$Hlo.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	ld1	{$INlo.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

	mov	$len, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	ld1	{$Xl.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{$INlo.16b}, [$inp], #16	// load inp
	rev64	$INlo.16b, $INlo.16b		// byteswap inp
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into $INlo and $INhi. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	$INhi.d[0], $INlo.d[1]
___
# Karatsuba: three 64x64 carry-less multiplies instead of four. Order matters:
# the low product must be emitted before $INlo is overwritten with the sum of
# the two input halves, and the middle product after.
clmul64x64($Xl, $Hlo, $INlo);	# Xl = H.lo·Xi.lo
$code .= "\teor\t$INlo.8b, $INlo.8b, $INhi.8b\t// Karatsuba pre-processing\n";
clmul64x64($Xm, $Hhl, $INlo);	# Xm = (H.lo+H.hi)·(Xi.lo+Xi.hi)
clmul64x64($Xh, $Hhi, $INhi);	# Xh = H.hi·Xi.hi
# Remainder of gcm_ghash_neon, emitted after the three clmul64x64 expansions:
# Karatsuba post-processing that combines Xl/Xm/Xh into the 256-bit product,
# the two-phase reduction, the loop back to .Loop_neon while $len is non-zero,
# and the final byte swap and store of Xi. The .rodata section that follows
# holds the k48/k32/k16/k0 masks loaded into $k48_k32 and $k16_k0 by the entry
# points. The \@ in the .asciz string stops Perl from treating "@openssl" as an
# array to interpolate.
$code .= <<___;
	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
	eor	$Xm.16b, $Xm.16b, $Xh.16b
	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	$Xh.d[0], $Xm.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	$t1.2d, $Xl.2d, #57		// 1st phase
	shl	$t2.2d, $Xl.2d, #62
	eor	$t2.16b, $t2.16b, $t1.16b	//
	shl	$t1.2d, $Xl.2d, #63
	eor	$t2.16b, $t2.16b, $t1.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	$t2.16b, $t2.16b, $Xm.16b
	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
	eor	$Xh.16b, $Xh.16b,$Xl.16b
	eor	$Xl.16b, $Xl.16b,$t2.16b	//
	ushr	$t2.2d, $t2.2d, #6
	ushr	$Xl.2d, $Xl.2d, #1		//
	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
	eor	$Xl.16b, $Xl.16b, $t2.16b	//

	subs	$len, $len, #16
	bne	.Loop_neon

	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	st1	{$Xl.16b}, [$Xi]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align  2
___

# Write the accumulated assembly to STDOUT one line at a time. Any `...`
# snippet found in a line is replaced by the result of evaluating it as Perl.
for (split /\n/, $code) {
	s{`([^`]*)`}{eval $1}ge;

	print "$_\n";
}
# Closing explicitly flushes the output and surfaces any buffered write error
# (or a failing translator when STDOUT is a pipe).
close STDOUT or die "error closing STDOUT"; # enforce flush
Update to 5.13.0 (1818) 2019-12-31 13:08:08 +00:00			`#! /usr/bin/env perl`
			`# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.`
			`#`
			`# Licensed under the OpenSSL license (the "License"). You may not use`
			`# this file except in compliance with the License. You can obtain a copy`
			`# in the file LICENSE in the source distribution or at`
			`# https://www.openssl.org/source/license.html`

			`# ====================================================================`
			`# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL`
			`# project. The module is, however, dual licensed under OpenSSL and`
			`# CRYPTOGAMS licenses depending on where you obtain it. For further`
			`# details see http://www.openssl.org/~appro/cryptogams/.`
			`# ====================================================================`

			`# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It`
			`# implements the multiplication algorithm described in:`
			`#`
			`# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software`
			`# Polynomial Multiplication on ARM Processors using the NEON Engine.`
			`#`
			`# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf`
			`#`
			`# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is`
			`# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit`
			`# NEON, the low and high halves of the 128-bit register q0 are accessible as`
			`# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of`
			`# vN. Where the 32-bit version would use the upper half, this file must keep`
			`# halves in separate registers.`
			`#`
			`# The other distinction is in syntax. 32-bit NEON embeds lane information in the`
			`# instruction name, while AArch64 uses suffixes on the registers. For instance,`
			`# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:`
			`#`
			`# vshl.i64 q0, q0, #1`
			`#`
			`# in 64-bit, it would be written:`
			`#`
			`# shl v0.2d, v0.2d, #1`
			`#`
			`# See Programmer's Guide for ARMv8-A, section 7 for details.`
			`# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf`
			`#`
			`# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ`
			`# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials`
			`# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit`
			`# polynomial and is conditioned on the PMULL extension. This file emulates the`
			`# latter with the former.`

			`use strict;`

			`my $flavour = shift;`
			`my $output;`
			`if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }`
			`else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }`

			`if ($flavour && $flavour ne "void") {`
			`$0 =~ m/(.*[\/\\])[^\/\\]+$/;`
			`my $dir = $1;`
			`my $xlate;`
			`( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or`
			`( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or`
			`die "can't locate arm-xlate.pl";`

			`open OUT,"| \"$^X\" $xlate $flavour $output";`
			`*STDOUT=*OUT;`
			`} else {`
			`open OUT,">$output";`
			`*STDOUT=*OUT;`
			`}`

			`my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block`
			`my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));`
			`my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));`
			`# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers`
			`# to spare.`
			`my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));`
			`my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));`
			`my ($k48_k32, $k16_k0) = map("v$_", (24..25));`

			`my $code = "";`

			`# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b`
			`# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.`
			`sub clmul64x64 {`
			`my ($r, $a, $b) = @_;`
			`$code .= <<___;`
			`ext $t0.8b, $a.8b, $a.8b, #1 // A1`
			`pmull $t0.8h, $t0.8b, $b.8b // F = A1*B`
			`ext $r.8b, $b.8b, $b.8b, #1 // B1`
			`pmull $r.8h, $a.8b, $r.8b // E = A*B1`
			`ext $t1.8b, $a.8b, $a.8b, #2 // A2`
			`pmull $t1.8h, $t1.8b, $b.8b // H = A2*B`
			`ext $t3.8b, $b.8b, $b.8b, #2 // B2`
			`pmull $t3.8h, $a.8b, $t3.8b // G = A*B2`
			`ext $t2.8b, $a.8b, $a.8b, #3 // A3`
			`eor $t0.16b, $t0.16b, $r.16b // L = E + F`
			`pmull $t2.8h, $t2.8b, $b.8b // J = A3*B`
			`ext $r.8b, $b.8b, $b.8b, #3 // B3`
			`eor $t1.16b, $t1.16b, $t3.16b // M = G + H`
			`pmull $r.8h, $a.8b, $r.8b // I = A*B3`

			`// Here we diverge from the 32-bit version. It computes the following`
			`// (instructions reordered for clarity):`
			`//`
			`// veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)`
			`// vand \$t0#hi, \$t0#hi, \$k48`
			`// veor \$t0#lo, \$t0#lo, \$t0#hi`
			`//`
			`// veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)`
			`// vand \$t1#hi, \$t1#hi, \$k32`
			`// veor \$t1#lo, \$t1#lo, \$t1#hi`
			`//`
			`// veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)`
			`// vand \$t2#hi, \$t2#hi, \$k16`
			`// veor \$t2#lo, \$t2#lo, \$t2#hi`
			`//`
			`// veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)`
			`// vmov.i64 \$t3#hi, #0`
			`//`
			`// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on`
			`// upper halves of SIMD registers, so we must split each half into`
			`// separate registers. To compensate, we pair computations up and`
			`// parallelize.`

			`ext $t3.8b, $b.8b, $b.8b, #4 // B4`
			`eor $t2.16b, $t2.16b, $r.16b // N = I + J`
			`pmull $t3.8h, $a.8b, $t3.8b // K = A*B4`

			`// This can probably be scheduled more efficiently. For now, we just`
			`// pair up independent instructions.`
			`zip1 $t0l_t1l.2d, $t0.2d, $t1.2d`
			`zip1 $t2l_t3l.2d, $t2.2d, $t3.2d`
			`zip2 $t0h_t1h.2d, $t0.2d, $t1.2d`
			`zip2 $t2h_t3h.2d, $t2.2d, $t3.2d`
			`eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b`
			`eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b`
			`and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b`
			`and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b`
			`eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b`
			`eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b`
			`zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d`
			`zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d`
			`zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d`
			`zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d`

			`ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8`
			`ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16`
			`pmull $r.8h, $a.8b, $b.8b // D = A*B`
			`ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32`
			`ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24`
			`eor $t0.16b, $t0.16b, $t1.16b`
			`eor $t2.16b, $t2.16b, $t3.16b`
			`eor $r.16b, $r.16b, $t0.16b`
			`eor $r.16b, $r.16b, $t2.16b`
			`___`
			`}`

			`$code .= <<___;`
			`.text`

			`.global gcm_init_neon`
			`.type gcm_init_neon,%function`
			`.align 4`
			`gcm_init_neon:`
			`// This function is adapted from gcm_init_v8. xC2 is t3.`
			`ld1 {$t1.2d}, [x1] // load H`
			`movi $t3.16b, #0xe1`
			`shl $t3.2d, $t3.2d, #57 // 0xc2.0`
			`ext $INlo.16b, $t1.16b, $t1.16b, #8`
			`ushr $t2.2d, $t3.2d, #63`
			`dup $t1.4s, $t1.s[1]`
			`ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01`
			`ushr $t2.2d, $INlo.2d, #63`
			`sshr $t1.4s, $t1.4s, #31 // broadcast carry bit`
			`and $t2.16b, $t2.16b, $t0.16b`
			`shl $INlo.2d, $INlo.2d, #1`
			`ext $t2.16b, $t2.16b, $t2.16b, #8`
			`and $t0.16b, $t0.16b, $t1.16b`
			`orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1`
			`eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H`
			`st1 {$Hlo.2d}, [x0] // store Htable[0]`
			`ret`
			`.size gcm_init_neon,.-gcm_init_neon`

			`.global gcm_gmult_neon`
			`.type gcm_gmult_neon,%function`
			`.align 4`
			`gcm_gmult_neon:`
			`ld1 {$INlo.16b}, [$Xi] // load Xi`
			`ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H`
			`ld1 {$Hhi.1d}, [$Htbl]`
			`adrp x9, :pg_hi21:.Lmasks // load constants`
			`add x9, x9, :lo12:.Lmasks`
			`ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]`
			`rev64 $INlo.16b, $INlo.16b // byteswap Xi`
			`ext $INlo.16b, $INlo.16b, $INlo.16b, #8`
			`eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing`

			`mov $len, #16`
			`b .Lgmult_neon`
			`.size gcm_gmult_neon,.-gcm_gmult_neon`

			`.global gcm_ghash_neon`
			`.type gcm_ghash_neon,%function`
			`.align 4`
			`gcm_ghash_neon:`
			`ld1 {$Xl.16b}, [$Xi] // load Xi`
			`ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H`
			`ld1 {$Hhi.1d}, [$Htbl]`
			`adrp x9, :pg_hi21:.Lmasks // load constants`
			`add x9, x9, :lo12:.Lmasks`
			`ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]`
			`rev64 $Xl.16b, $Xl.16b // byteswap Xi`
			`ext $Xl.16b, $Xl.16b, $Xl.16b, #8`
			`eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing`

			`.Loop_neon:`
			`ld1 {$INlo.16b}, [$inp], #16 // load inp`
			`rev64 $INlo.16b, $INlo.16b // byteswap inp`
			`ext $INlo.16b, $INlo.16b, $INlo.16b, #8`
			`eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi`

			`.Lgmult_neon:`
			`// Split the input into $INlo and $INhi. (The upper halves are unused,`
			`// so it is okay to leave them alone.)`
			`ins $INhi.d[0], $INlo.d[1]`
			`___`
			`&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo`
			`$code .= <<___;`
			`eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing`
			`___`
			`&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)`
			`&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi`
			`$code .= <<___;`
			`ext $t0.16b, $Xl.16b, $Xh.16b, #8`
			`eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing`
			`eor $Xm.16b, $Xm.16b, $Xh.16b`
			`eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi`
			`ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result`
			`// This is a no-op due to the ins instruction below.`
			`// ins $Xh.d[0], $Xm.d[1]`

			`// equivalent of reduction_avx from ghash-x86_64.pl`
			`shl $t1.2d, $Xl.2d, #57 // 1st phase`
			`shl $t2.2d, $Xl.2d, #62`
			`eor $t2.16b, $t2.16b, $t1.16b //`
			`shl $t1.2d, $Xl.2d, #63`
			`eor $t2.16b, $t2.16b, $t1.16b //`
			`// Note Xm contains {Xl.d[1], Xh.d[0]}.`
			`eor $t2.16b, $t2.16b, $Xm.16b`
			`ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]`
			`ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]`

			`ushr $t2.2d, $Xl.2d, #1 // 2nd phase`
			`eor $Xh.16b, $Xh.16b,$Xl.16b`
			`eor $Xl.16b, $Xl.16b,$t2.16b //`
			`ushr $t2.2d, $t2.2d, #6`
			`ushr $Xl.2d, $Xl.2d, #1 //`
			`eor $Xl.16b, $Xl.16b, $Xh.16b //`
			`eor $Xl.16b, $Xl.16b, $t2.16b //`

			`subs $len, $len, #16`
			`bne .Loop_neon`

			`rev64 $Xl.16b, $Xl.16b // byteswap Xi and write`
			`ext $Xl.16b, $Xl.16b, $Xl.16b, #8`
			`st1 {$Xl.16b}, [$Xi]`

			`ret`
			`.size gcm_ghash_neon,.-gcm_ghash_neon`

			`.section .rodata`
			`.align 4`
			`.Lmasks:`
			`.quad 0x0000ffffffffffff // k48`
			`.quad 0x00000000ffffffff // k32`
			`.quad 0x000000000000ffff // k16`
			`.quad 0x0000000000000000 // k0`
			`.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"`
			`.align 2`
			`___`

			`foreach (split("\n",$code)) {`
			s/\`([^\`]*)\`/eval $1/geo;

			`print $_,"\n";`
			`}`
			`close STDOUT or die "error closing STDOUT"; # enforce flush`