guix/gnu/packages/patches/rust-ring-0.16-missing-files.patch

These four files exist in the rust-ring git repository, at the same commit
from which version 0.16.20 was taken.  They are not named in the include
list in Cargo.toml, so they were left out of the released tarball.

---
 crypto/curve25519/make_curve25519_tables.py   | 222 +++++
 crypto/fipsmodule/aes/asm/vpaes-armv7.pl      | 896 ++++++++++++++++++
 crypto/fipsmodule/aes/asm/vpaes-armv8.pl      | 837 ++++++++++++++++
 .../fipsmodule/modes/asm/ghash-neon-armv8.pl  | 294 ++++++
 4 files changed, 2249 insertions(+)
 create mode 100755 crypto/curve25519/make_curve25519_tables.py
 create mode 100644 crypto/fipsmodule/aes/asm/vpaes-armv7.pl
 create mode 100755 crypto/fipsmodule/aes/asm/vpaes-armv8.pl
 create mode 100644 crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl

diff --git a/crypto/curve25519/make_curve25519_tables.py b/crypto/curve25519/make_curve25519_tables.py
new file mode 100755
index 0000000..50dee2a
--- /dev/null
+++ b/crypto/curve25519/make_curve25519_tables.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright (c) 2020, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+import StringIO
+import subprocess
+
+# Base field Z_p
+p = 2**255 - 19
+
+def modp_inv(x):
+    return pow(x, p-2, p)
+
+# Square root of -1
+modp_sqrt_m1 = pow(2, (p-1) // 4, p)
+
+# Compute corresponding x-coordinate, with low bit corresponding to
+# sign, or return None on failure
+def recover_x(y, sign):
+    if y >= p:
+        return None
+    x2 = (y*y-1) * modp_inv(d*y*y+1)
+    if x2 == 0:
+        if sign:
+            return None
+        else:
+            return 0
+
+    # Compute square root of x2
+    x = pow(x2, (p+3) // 8, p)
+    if (x*x - x2) % p != 0:
+        x = x * modp_sqrt_m1 % p
+    if (x*x - x2) % p != 0:
+        return None
+
+    if (x & 1) != sign:
+        x = p - x
+    return x
+
+# Curve constant
+d = -121665 * modp_inv(121666) % p
+
+# Base point
+g_y = 4 * modp_inv(5) % p
+g_x = recover_x(g_y, 0)
+
+# Points are represented as affine tuples (x, y).
+
+def point_add(P, Q):
+    x1, y1 = P
+    x2, y2 = Q
+    x3 = ((x1*y2 + y1*x2) * modp_inv(1 + d*x1*x2*y1*y2)) % p
+    y3 = ((y1*y2 + x1*x2) * modp_inv(1 - d*x1*x2*y1*y2)) % p
+    return (x3, y3)
+
+# Computes Q = s * P
+def point_mul(s, P):
+    Q = (0, 1)  # Neutral element
+    while s > 0:
+        if s & 1:
+            Q = point_add(Q, P)
+        P = point_add(P, P)
+        s >>= 1
+    return Q
+
+def to_bytes(x):
+    ret = bytearray(32)
+    for i in range(len(ret)):
+        ret[i] = x % 256
+        x >>= 8
+    assert x == 0
+    return ret
+
+def to_ge_precomp(P):
+    # typedef struct {
+    #   fe_loose yplusx;
+    #   fe_loose yminusx;
+    #   fe_loose xy2d;
+    # } ge_precomp;
+    x, y = P
+    return ((y + x) % p, (y - x) % p, (x * y * 2 * d) % p)
+
+def to_base_25_5(x):
+    limbs = (26, 25, 26, 25, 26, 25, 26, 25, 26, 25)
+    ret = []
+    for l in limbs:
+        ret.append(x & ((1<<l) - 1))
+        x >>= l
+    assert x == 0
+    return ret
+
+def to_base_51(x):
+    ret = []
+    for _ in range(5):
+        ret.append(x & ((1<<51) - 1))
+        x >>= 51
+    assert x == 0
+    return ret
+
+def to_literal(x):
+    ret = "{{\n#if defined(BORINGSSL_CURVE25519_64BIT)\n"
+    ret += ", ".join(map(str, to_base_51(x)))
+    ret += "\n#else\n"
+    ret += ", ".join(map(str, to_base_25_5(x)))
+    ret += "\n#endif\n}}"
+    return ret
+
+def main():
+    d2 = (2 * d) % p
+
+    small_precomp = bytearray()
+    for i in range(1, 16):
+        s = (i&1) | ((i&2) << (64-1)) | ((i&4) << (128-2)) | ((i&8) << (192-3))
+        P = point_mul(s, (g_x, g_y))
+        small_precomp += to_bytes(P[0])
+        small_precomp += to_bytes(P[1])
+
+    large_precomp = []
+    for i in range(32):
+        large_precomp.append([])
+        for j in range(8):
+            P = point_mul((j + 1) << (i * 8), (g_x, g_y))
+            large_precomp[-1].append(to_ge_precomp(P))
+
+    bi_precomp = []
+    for i in range(8):
+        P = point_mul(2*i + 1, (g_x, g_y))
+        bi_precomp.append(to_ge_precomp(P))
+
+
+    buf = StringIO.StringIO()
+    buf.write("""/* Copyright (c) 2020, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// This file is generated from
+//    ./make_curve25519_tables.py > curve25519_tables.h
+
+
+static const fe d = """)
+    buf.write(to_literal(d))
+    buf.write(""";
+
+static const fe sqrtm1 = """)
+    buf.write(to_literal(modp_sqrt_m1))
+    buf.write(""";
+
+static const fe d2 = """)
+    buf.write(to_literal(d2))
+    buf.write(""";
+
+#if defined(OPENSSL_SMALL)
+
+// This block of code replaces the standard base-point table with a much smaller
+// one. The standard table is 30,720 bytes while this one is just 960.
+//
+// This table contains 15 pairs of group elements, (x, y), where each field
+// element is serialised with |fe_tobytes|. If |i| is the index of the group
+// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀
+// is the most significant bit). The value of the group element is then:
+// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator.
+static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""")
+    for i, b in enumerate(small_precomp):
+        buf.write("0x%02x, " % b)
+    buf.write("""
+};
+
+#else
+
+// k25519Precomp[i][j] = (j+1)*256^i*B
+static const ge_precomp k25519Precomp[32][8] = {
+""")
+    for child in large_precomp:
+        buf.write("{\n")
+        for val in child:
+            buf.write("{\n")
+            for term in val:
+                buf.write(to_literal(term) + ",\n")
+            buf.write("},\n")
+        buf.write("},\n")
+    buf.write("""};
+
+#endif  // OPENSSL_SMALL
+
+// Bi[i] = (2*i+1)*B
+static const ge_precomp Bi[8] = {
+""")
+    for val in bi_precomp:
+        buf.write("{\n")
+        for term in val:
+                buf.write(to_literal(term) + ",\n")
+        buf.write("},\n")
+    buf.write("""};
+""")
+
+    proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE)
+    proc.communicate(buf.getvalue())
+
+if __name__ == "__main__":
+    main()
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
new file mode 100644
index 0000000..d36a97a
--- /dev/null
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -0,0 +1,896 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+##
+######################################################################
+# Adapted from the original x86_64 version and <appro@openssl.org>'s ARMv8
+# version.
+#
+# armv7, aarch64, and x86_64 differ in several ways:
+#
+# * x86_64 SSSE3 instructions are two-address (destination operand is also a
+#   source), while NEON is three-address (destination operand is separate from
+#   two sources).
+#
+# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16.
+#
+# * x86_64 instructions can take memory references, while ARM is a load/store
+#   architecture. This means we sometimes need a spare register.
+#
+# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb),
+#   while armv7 only has a 64-bit byte shuffle (vtbl).
+#
+# This means this armv7 version must be a mix of both aarch64 and x86_64
+# implementations. armv7 and aarch64 have analogous SIMD instructions, so we
+# base the instructions on aarch64. However, we cannot use aarch64's register
+# allocation. x86_64's register count matches, but x86_64 is two-address.
+# vpaes-armv8.pl already accounts for this in the comments, which use
+# three-address AVX instructions instead of the original SSSE3 ones. We base
+# register usage on these comments, which are preserved in this file.
+#
+# This means we do not use separate input and output registers as in aarch64 and
+# cannot pin as many constants in the preheat functions. However, the load/store
+# architecture means we must still deviate from x86_64 in places.
+#
+# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source
+# and destination and 128-bit table. Fortunately, armv7 also allows addressing
+# upper and lower halves of each 128-bit register. The lower half of q{N} is
+# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent
+# instruction,
+#
+#     vtbl.8 q0, q1, q2   @ Index each of q2's 16 bytes into q1. Store in q0.
+#
+# we write:
+#
+#     vtbl.8 d0, q1, d4   @ Index each of d4's 8 bytes into q1. Store in d0.
+#     vtbl.8 d1, q1, d5   @ Index each of d5's 8 bytes into q1. Store in d1.
+#
+# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and
+# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note,
+# however, that destination (q0) and table (q1) registers may no longer match.
+# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
+# two-address pshufb always matched these operands, so this is common.)
+#
+# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
+# expands to an ADD or SUB of the pc register to find an address. That immediate
+# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
+# This means larger values must be more aligned.
+#
+# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
+# use either encoding (do we actually need to support this?). In ARM mode, the
+# distances get large enough to require 16-byte alignment. Moving constants
+# closer to their use resolves most of this, but common constants in
+# _vpaes_consts are used by the whole file. Affected ADR instructions must be
+# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
+# constraint have been commented.
+#
+# For details on ARM's immediate value encoding scheme, see
+# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
+#
+# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
+#
+# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
+#
+# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones).
+#   aarch64 names registers like v0, and denotes half-width operations in an
+#   instruction suffix (see below).
+#
+# * aarch64 embeds size and lane information in register suffixes. v0.16b is
+#   16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s.
+#   armv7 embeds the total size in the register name (see above) and the size of
+#   each element in an instruction suffix, which may look like vmov.i8,
+#   vshr.u8, or vtbl.8, depending on instruction.
+
+use strict;
+
+my $flavour = shift;
+my $output;
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir=$1;
+my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my $code = "";
+
+$code.=<<___;
+.syntax	unified
+
+.arch	armv7-a
+.fpu	neon
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+.type	_vpaes_consts,%object
+.align	7	@ totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	@ mc_forward
+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:@ mc_backward
+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:		@ sr
+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+	.quad	0x0F060D040B020900, 0x070E050C030A0108
+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+.Lk_inv:	@ inv, inva
+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	@ input transform (lo, hi)
+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	@ sbou, sbot
+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	@ sb1u, sb1t
+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	@ sb2u, sb2t
+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.asciz  "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+___
+
+{
+my ($inp,$out,$key) = map("r$_", (0..2));
+
+my ($invlo,$invhi) = map("q$_", (10..11));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15));
+
+$code.=<<___;
+@@
+@@  _aes_preheat
+@@
+@@  Fills q9-q15 as specified below.
+@@
+.type	_vpaes_preheat,%function
+.align	4
+_vpaes_preheat:
+	adr	r10, .Lk_inv
+	vmov.i8	q9, #0x0f		@ .Lk_s0F
+	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv
+	add	r10, r10, #64		@ Skip .Lk_ipt, .Lk_sbo
+	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
+	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
+	bx	lr
+
+@@
+@@  _aes_encrypt_core
+@@
+@@  AES-encrypt q0.
+@@
+@@  Inputs:
+@@     q0 = input
+@@     q9-q15 as in _vpaes_preheat
+@@    [$key] = scheduled keys
+@@
+@@  Output in q0
+@@  Clobbers  q1-q5, r8-r11
+@@  Preserves q6-q8 so you get some local vectors
+@@
+@@
+.type	_vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+	mov	r9, $key
+	ldr	r8, [$key,#240]		@ pull rounds
+	adr	r11, .Lk_ipt
+	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	vld1.64	{q2, q3}, [r11]
+	adr	r11, .Lk_mc_forward+16
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0
+	vtbl.8	q1#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm2,	%xmm1
+	vtbl.8	q1#hi, {q2}, q1#hi
+	vtbl.8	q2#lo, {q3}, q0#lo	@ vpshufb	%xmm0,	%xmm3,	%xmm2
+	vtbl.8	q2#hi, {q3}, q0#hi
+	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+
+	@ .Lenc_entry ends with a bnz instruction which is normally paired with
+	@ subs in .Lenc_loop.
+	tst	r8, r8
+	b	.Lenc_entry
+
+.align 4
+.Lenc_loop:
+	@ middle of middle round
+	add	r10, r11, #0x40
+	vtbl.8	q4#lo, {$sb1t}, q2#lo	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	vtbl.8	q4#hi, {$sb1t}, q2#hi
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	vtbl.8	q0#lo, {$sb1u}, q3#lo	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	vtbl.8	q0#hi, {$sb1u}, q3#hi
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	vtbl.8	q5#lo, {$sb2t}, q2#lo	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	vtbl.8	q5#hi, {$sb2t}, q2#hi
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
+	vtbl.8	q2#lo, {$sb2u}, q3#lo	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	vtbl.8	q2#hi, {$sb2u}, q3#hi
+	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	vtbl.8	q3#lo, {q0}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	vtbl.8	q3#hi, {q0}, q1#hi
+	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	@ Write to q5 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	q5#lo, {q0}, q4#lo	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	vtbl.8	q5#hi, {q0}, q4#hi
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	vtbl.8	q4#lo, {q3}, q1#lo	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	vtbl.8	q4#hi, {q3}, q1#hi
+	@ Here we restore the original q0/q5 usage.
+	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	r11, r11, #~(1<<6)	@ and		\$0x30,	%r11		# ... mod 4
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	subs	r8, r8, #1		@ nr--
+
+.Lenc_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	q5#lo, {$invhi}, q1#lo	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	vtbl.8	q5#hi, {$invhi}, q1#hi
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	q3#lo, {$invlo}, q0#lo	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	vtbl.8	q3#hi, {$invlo}, q0#hi
+	vtbl.8	q4#lo, {$invlo}, q1#lo	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	vtbl.8	q4#hi, {$invlo}, q1#hi
+	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	vtbl.8	q2#lo, {$invlo}, q3#lo	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	vtbl.8	q2#hi, {$invlo}, q3#hi
+	vtbl.8	q3#lo, {$invlo}, q4#lo	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	q3#hi, {$invlo}, q4#hi
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
+	bne	.Lenc_loop
+
+	@ middle of last round
+	add	r10, r11, #0x80
+
+	adr	r11, .Lk_sbo
+	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+	@ overlap table and destination registers.
+	vld1.64 {q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
+	vld1.64 {q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	vtbl.8	q4#lo, {q1}, q2#lo	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	q4#hi, {q1}, q2#hi
+	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	@ Write to q2 instead of q0 below, to avoid overlapping table and
+	@ destination registers.
+	vtbl.8	q2#lo, {q0}, q3#lo	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	vtbl.8	q2#hi, {q0}, q3#hi
+	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	@ Here we restore the original q0/q2 usage.
+	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm0
+	vtbl.8	q0#hi, {q2}, q1#hi
+	bx	lr
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	GFp_vpaes_encrypt
+.type	GFp_vpaes_encrypt,%function
+.align	4
+GFp_vpaes_encrypt:
+	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+	@ alignment.
+	stmdb	sp!, {r7-r11,lr}
+	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8-d11}
+
+	vld1.64	{q0}, [$inp]
+	bl	_vpaes_preheat
+	bl	_vpaes_encrypt_core
+	vst1.64	{q0}, [$out]
+
+	vldmia	sp!, {d8-d11}
+	ldmia	sp!, {r7-r11, pc}	@ return
+.size	GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
+___
+}
+{
+my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3");
+my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12));
+
+$code.=<<___;
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@                                                    @@
+@@                  AES key schedule                  @@
+@@                                                    @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and armv7 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@  Key schedule constants
+@
+.type	_vpaes_key_consts,%object
+.align	4
+_vpaes_key_consts:
+.Lk_rcon:	@ rcon
+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	@ output transform
+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	@ deskew tables: inverts the sbox's "skew"
+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size	_vpaes_key_consts,.-_vpaes_key_consts
+
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adr	r11, .Lk_rcon
+	vmov.i8	$s63, #0x5b			@ .Lk_s63
+	adr	r10, .Lk_inv			@ Must be aligned to 8 mod 16.
+	vmov.i8	$s0F, #0x0f			@ .Lk_s0F
+	vld1.64	{$invlo,$invhi}, [r10]		@ .Lk_inv
+	vld1.64	{$rcon}, [r11]			@ .Lk_rcon
+	bx	lr
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
+	@ so save an extra register.
+	stmdb	sp!, {r3,lr}
+
+	bl	_vpaes_key_preheat	@ load the tables
+
+	adr	r11, .Lk_ipt		@ Must be aligned to 8 mod 16.
+	vld1.64	{q0}, [$inp]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	@ input transform
+	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+	@ overlap table and destination.
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	adr	r10, .Lk_sr		@ Must be aligned to 8 mod 16.
+	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
+
+	add	r8, r8, r10
+
+	@ encrypting, output zeroth round key after transform
+	vst1.64	{q0}, [$out]		@ vmovdqu	%xmm0,	(%rdx)
+
+	@ *ring*: Decryption removed.
+
+.Lschedule_go:
+	cmp	$bits, #192		@ cmp	\$192,	%esi
+	bhi	.Lschedule_256
+	@ 128: fall though
+
+@@
+@@  .schedule_128
+@@
+@@  128-bit specific part of key schedule.
+@@
+@@  This schedule is really simple, because all its parts
+@@  are accomplished by the subroutines.
+@@
+.Lschedule_128:
+	mov	$inp, #10		@ mov	\$10, %esi
+
+.Loop_schedule_128:
+	bl 	_vpaes_schedule_round
+	subs	$inp, $inp, #1		@ dec	%esi
+	beq 	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle	@ write output
+	b 	.Loop_schedule_128
+
+@@
+@@  .aes_schedule_256
+@@
+@@  256-bit specific part of key schedule.
+@@
+@@  The structure here is very similar to the 128-bit
+@@  schedule, but with an additional "low side" in
+@@  q6.  The low side's rounds are the same as the
+@@  high side's, except no rcon and no rotation.
+@@
+.align	4
+.Lschedule_256:
+	vld1.64	{q0}, [$inp]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	mov	$inp, #7			@ mov	\$7, %esi
+
+.Loop_schedule_256:
+	bl	_vpaes_schedule_mangle		@ output low result
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	@ high round
+	bl	_vpaes_schedule_round
+	subs	$inp, $inp, #1			@ dec	%esi
+	beq 	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	@ low round. swap xmm7 and xmm6
+	vdup.32	q0, q0#hi[1]		@ vpshufd	\$0xFF,	%xmm0,	%xmm0
+	vmov.i8	q4, #0
+	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
+	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+@@
+@@  .aes_schedule_mangle_last
+@@
+@@  Mangler for last round of key schedule
+@@  Mangles q0
+@@    when encrypting, outputs out(q0) ^ 63
+@@    when decrypting, outputs unskew(q0)
+@@
+@@  Always called right before return... jumps to cleanup and exits
+@@
+.align	4
+.Lschedule_mangle_last:
+	@ schedule last round key from xmm0
+	adr	r11, .Lk_deskew			@ lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+
+	@ encrypting
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
+	adr	r11, .Lk_opt		@ lea		.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	$out, $out, #32		@ add		\$32,	%rdx
+	vmov	q2, q0
+	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+	vtbl.8	q0#hi, {q2}, q1#hi
+
+.Lschedule_mangle_last_dec:
+	sub	$out, $out, #16			@ add	\$-16,	%rdx
+	veor	q0, q0, $s63			@ vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	@ output transform
+	vst1.64	{q0}, [$out]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	@ cleanup
+	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
+	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
+	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
+	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
+	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
+	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
+	ldmia	sp!, {r3,pc}		@ return
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@
+@@  .aes_schedule_round
+@@
+@@  Runs one main round of the key schedule on q0, q7
+@@
+@@  Specifically, runs subbytes on the high dword of q0
+@@  then rotates it by one byte and xors into the low dword of
+@@  q7.
+@@
+@@  Adds rcon from low byte of q8, then rotates q8 for
+@@  next rcon.
+@@
+@@  Smears the dwords of q7 by xoring the low into the
+@@  second low, result into third, result into highest.
+@@
+@@  Returns results in q7 = q0.
+@@  Clobbers q1-q4, r11.
+@@
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	@ extract rcon from xmm8
+	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
+	vext.8	q1, $rcon, q4, #15		@ vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
+	vext.8	$rcon, $rcon, $rcon, #15	@ vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
+	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
+
+	@ rotate
+	vdup.32	q0, q0#hi[1]			@ vpshufd	\$0xFF,	%xmm0,	%xmm0
+	vext.8	q0, q0, q0, #1			@ vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
+
+	@ fall through...
+
+	@ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+	@ We pin other values in _vpaes_key_preheat, so load them now.
+	adr	r11, .Lk_sb1
+	vld1.64	{q14,q15}, [r11]
+
+	@ smear xmm7
+	vext.8	q1, q4, q7, #12			@ vpslldq	\$4,	%xmm7,	%xmm1
+	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
+	vext.8	q4, q4, q7, #8			@ vpslldq	\$8,	%xmm7,	%xmm4
+
+	@ subbytes
+	vand	q1, q0, $s0F			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
+	vshr.u8	q0, q0, #4			@ vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
+	 veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
+	vtbl.8	q2#lo, {$invhi}, q1#lo		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	vtbl.8	q2#hi, {$invhi}, q1#hi
+	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
+	vtbl.8	q3#lo, {$invlo}, q0#lo		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	vtbl.8	q3#hi, {$invlo}, q0#hi
+	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	vtbl.8	q4#lo, {$invlo}, q1#lo		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	vtbl.8	q4#hi, {$invlo}, q1#hi
+	 veor	q7, q7, $s63			@ vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
+	vtbl.8	q3#lo, {$invlo}, q3#lo		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	vtbl.8	q3#hi, {$invlo}, q3#hi
+	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	vtbl.8	q2#lo, {$invlo}, q4#lo		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	vtbl.8	q2#hi, {$invlo}, q4#hi
+	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
+	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	vtbl.8	q4#lo, {q15}, q3#lo		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	vtbl.8	q4#hi, {q15}, q3#hi
+	vtbl.8	q1#lo, {q14}, q2#lo		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	vtbl.8	q1#hi, {q14}, q2#hi
+	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	@ add in smeared stuff
+	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
+	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
+	bx	lr
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+@@
+@@  .aes_schedule_transform
+@@
+@@  Linear-transform q0 according to tables at [r11]
+@@
+@@  Requires that q9 = 0x0F0F... as in preheat
+@@  Output in q0
+@@  Clobbers q1, q2, q14, q15
+@@
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
+					@ vmovdqa	16(%r11),	%xmm1 # hi
+	vand	q1, q0, $s0F		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0
+	vtbl.8	q2#lo, {q14}, q1#lo	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	q2#hi, {q14}, q1#hi
+	vtbl.8	q0#lo, {q15}, q0#lo	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	q0#hi, {q15}, q0#hi
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+	bx	lr
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+@@
+@@  .aes_schedule_mangle
+@@
+@@  Mangles q0 from (basis-transformed) standard version
+@@  to our version.
+@@
+@@  On encrypt,
+@@    xor with 0x63
+@@    multiply by circulant 0,1,1,1
+@@    apply shiftrows transform
+@@
+@@  On decrypt,
+@@    xor with 0x63
+@@    multiply by "inverse mixcolumns" circulant E,B,D,9
+@@    deskew
+@@    apply shiftrows transform
+@@
+@@
+@@  Writes out to [r2], and increments or decrements it
+@@  Keeps track of round number mod 4 in r8
+@@  Preserves q0
+@@  Clobbers q1-q5
+@@
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	tst	$dir, $dir
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	vld1.64	{q5}, [r11]		@ vmovdqa	.Lk_mc_forward(%rip),%xmm5
+
+	@ encrypting
+	@ Write to q2 so we do not overlap table and destination below.
+	veor	q2, q0, $s63		@ vpxor		.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	$out, $out, #16		@ add		\$16,	%rdx
+	vtbl.8	q4#lo, {q2}, q5#lo	@ vpshufb	%xmm5,	%xmm4,	%xmm4
+	vtbl.8	q4#hi, {q2}, q5#hi
+	vtbl.8	q1#lo, {q4}, q5#lo	@ vpshufb	%xmm5,	%xmm4,	%xmm1
+	vtbl.8	q1#hi, {q4}, q5#hi
+	vtbl.8	q3#lo, {q1}, q5#lo	@ vpshufb	%xmm5,	%xmm1,	%xmm3
+	vtbl.8	q3#hi, {q1}, q5#hi
+	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
+
+.Lschedule_mangle_both:
+	@ Write to q2 so table and destination do not overlap.
+	vtbl.8	q2#lo, {q3}, q1#lo	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	q2#hi, {q3}, q1#hi
+	add	r8, r8, #64-16		@ add	\$-16,	%r8
+	and	r8, r8, #~(1<<6)	@ and	\$0x30,	%r8
+	vst1.64	{q2}, [$out]		@ vmovdqu	%xmm3,	(%rdx)
+	bx	lr
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	GFp_vpaes_set_encrypt_key
+.type	GFp_vpaes_set_encrypt_key,%function
+.align	4
+GFp_vpaes_set_encrypt_key:
+	stmdb	sp!, {r7-r11, lr}
+	vstmdb	sp!, {d8-d15}
+
+	lsr	r9, $bits, #5		@ shr	\$5,%eax
+	add	r9, r9, #5		@ \$5,%eax
+	str	r9, [$out,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	$dir, #0		@ mov	\$0,%ecx
+	mov	r8, #0x30		@ mov	\$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	r0, r0, r0
+
+	vldmia	sp!, {d8-d15}
+	ldmia	sp!, {r7-r11, pc}	@ return
+.size	GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
+___
+}
+
+{
+my ($out, $inp) = map("r$_", (0..1));
+my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
+
+$code .= <<___;
+
+@ Additional constants for converting to bsaes.
+.type	_vpaes_convert_consts,%object
+.align	4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@   def u64s_to_u128(x, y):
+@       return x | (y << 64)
+@   def u128_to_u64s(w):
+@       return w & ((1<<64)-1), w >> 64
+@   def get_byte(w, i):
+@       return (w >> (i*8)) & 0xff
+@   def apply_table(table, b):
+@       lo = b & 0xf
+@       hi = b >> 4
+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@   def opt(b):
+@       table = [
+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@       ]
+@       return apply_table(table, b)
+@   def rot_byte(b, n):
+@       return 0xff & ((b << n) | (b >> (8-n)))
+@   def skew(x):
+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@               rot_byte(x, 4))
+@   table = [0, 0]
+@   for i in range(16):
+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@       table[1] |= skew(opt(i<<4)) << (i*8)
+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+	.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+	.quad	0x1f30062936192f00, 0xb49bad829db284ab
+
+@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	GFp_vpaes_encrypt_key_to_bsaes
+.type	GFp_vpaes_encrypt_key_to_bsaes,%function
+.align	4
+GFp_vpaes_encrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}		@ r11 is used below as the table pointer
+
+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+	@ contain the transformations not in the bsaes representation. This
+	@ function inverts those transforms.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	vmov.i8	$s0F, #0x0f		@ Required by _vpaes_schedule_transform
+	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	add	r3, r2, 0x90		@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+	vld1.64	{$mc_forward}, [r2]
+	vmov.i8	$s63, #0x5b		@ .Lk_s63 from vpaes-x86_64
+	adr	r11, .Lk_opt		@ Must be aligned to 8 mod 16.
+	vmov.i8	$s63_raw, #0x63		@ .Lk_s63 without .Lk_ipt applied
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [$inp,#240]		@ r2 = vpaes round count
+	add	r2, r2, #1
+	str	r2, [$out,#240]		@ bsaes round count = vpaes round count + 1
+
+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+	@ Invert this with .Lk_opt.
+	vld1.64	{q0}, [$inp]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+
+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+.Loop_enc_key_to_bsaes:
+	vld1.64	{q0}, [$inp]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+	@ We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8  q2#lo, {q0}, q1#lo
+	vtbl.8  q2#hi, {q0}, q1#hi
+	add	r3, r3, #16		@ step to the next 16-byte table entry
+	and	r3, r3, #~(1<<6)	@ presumably wraps within the .Lk_sr entries (relies on table alignment)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1		@ r2 counts the remaining keys
+	beq	.Loop_enc_key_to_bsaes_last
+
+	@ Multiply by the circulant. This is its own inverse.
+	vtbl.8	q1#lo, {q0}, $mc_forward#lo
+	vtbl.8	q1#hi, {q0}, $mc_forward#hi
+	vmov	q0, q1
+	vtbl.8	q2#lo, {q1}, $mc_forward#lo
+	vtbl.8	q2#hi, {q1}, $mc_forward#hi
+	veor	q0, q0, q2
+	vtbl.8	q1#lo, {q2}, $mc_forward#lo
+	vtbl.8	q1#hi, {q2}, $mc_forward#hi
+	veor	q0, q0, q1
+
+	@ XOR and finish.
+	veor	q0, q0, $s63
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+	b	.Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+	@ The final key does not have a basis transform (note
+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
+	@ loop. Note that, because we act on the original representation, we use
+	@ $s63_raw, not $s63.
+	veor	q0, q0, $s63_raw
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes
+___
+}
+
+{
+# Register-passed parameters.
+my ($inp, $out, $len, $key) = map("r$_", 0..3);
+# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
+# $tmp. $ctr is r7 because it must be preserved across calls.
+my ($ctr, $ivec, $tmp) = map("r$_", 7..9);
+
+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
+#     size_t len, const AES_KEY *key, const uint8_t ivec[16]);
+$code .= <<___;
+.globl	GFp_vpaes_ctr32_encrypt_blocks
+.type	GFp_vpaes_ctr32_encrypt_blocks,%function
+.align	4
+GFp_vpaes_ctr32_encrypt_blocks:
+	mov	ip, sp			@ keep the entry sp; the stack argument is read through ip below
+	stmdb	sp!, {r7-r11, lr}
+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
+	vstmdb	sp!, {d8-d15}
+
+	cmp	$len, #0
+	@ $ivec is passed on the stack.
+	ldr	$ivec, [ip]
+	beq	.Lctr32_done		@ zero blocks requested: nothing to do
+
+	@ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
+	mov	$tmp, $key
+	mov	$key, $len
+	mov	$len, $tmp
+___
+my ($len, $key) = ($key, $len);	# from here on the Perl names follow the swapped registers
+$code .= <<___;
+
+	@ Load the IV and counter portion.
+	ldr	$ctr, [$ivec, #12]
+	vld1.8	{q7}, [$ivec]
+
+	bl	_vpaes_preheat
+	rev	$ctr, $ctr		@ The counter is big-endian.
+
+.Lctr32_loop:
+	vmov	q0, q7
+	vld1.8	{q6}, [$inp]!		@ Load input ahead of time
+	bl	_vpaes_encrypt_core
+	veor	q0, q0, q6		@ XOR input and result
+	vst1.8	{q0}, [$out]!
+	subs	$len, $len, #1
+	@ Update the counter.
+	add	$ctr, $ctr, #1
+	rev	$tmp, $ctr		@ back to big-endian byte order
+	vmov.32	q7#hi[1], $tmp		@ store it in the last 32-bit lane of q7 (bytes 12-15)
+	bne	.Lctr32_loop		@ flags come from the subs above
+
+.Lctr32_done:
+	vldmia	sp!, {d8-d15}
+	ldmia	sp!, {r7-r11, pc}	@ return
+.size	GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
+___
+}
+
+foreach (split("\n",$code)) {
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;	# qN#lo -> d(2N), qN#hi -> d(2N+1)
+	print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";	# a failed flush would otherwise leave a silently truncated output file
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
new file mode 100755
index 0000000..b31bbb8
--- /dev/null
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
@@ -0,0 +1,837 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+##
+######################################################################
+# ARMv8 NEON adaptation by <appro@openssl.org>
+#
+# Reason for undertaken effort is that there is at least one popular
+# SoC based on Cortex-A53 that doesn't have crypto extensions.
+#
+#                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
+# Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
+# Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
+# X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
+# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
+# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
+# Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
+#
+# (*)	ECB denotes approximate result for parallelizable modes
+#	such as CBC decrypt, CTR, etc.;
+# (**)	these results are worse than scalar compiler-generated
+#	code, but it's constant-time and therefore preferred;
+# (***)	presented for reference/comparison purposes;
+
+$flavour = shift;	# first argument: flavour string handed on to arm-xlate.pl
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip arguments until one looks like a file name
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+# Send everything printed to STDOUT through arm-xlate.pl; stop if the pipe cannot be opened.
+open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$code.=<<___;
+#include <GFp/arm_arch.h>
+
+.section	.rodata
+
+.type	_vpaes_consts,%object
+.align	7	// totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	// mc_forward
+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:// mc_backward
+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:		// sr
+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+	.quad	0x0F060D040B020900, 0x070E050C030A0108
+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv:	// inv, inva
+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	// input transform (lo, hi)
+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	// sbou, sbot
+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	// sb1u, sb1t
+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	// sb2u, sb2t
+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Key schedule constants
+//
+.Lk_dksd:	// decryption key schedule: invskew x*D
+	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	// decryption key schedule: invskew x*B
+	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
+	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	// decryption key schedule: invskew x*9
+	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:	// rcon
+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	// output transform
+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+
+.text
+___
+
+{
+my ($inp,$out,$key) = map("x$_",(0..2));
+
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
+
+$code.=<<___;
+##
+##  _vpaes_encrypt_preheat (x86 original: _aes_preheat)
+##
+##  Loads v17 with 0x0f in every byte and v18-v27 with the tables from
+##  .Lk_inv through .Lk_sb2. Uses x10 as the table pointer.
+##
+.type	_vpaes_encrypt_preheat,%function
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, :pg_hi21:.Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f			// low-nibble mask used by the and/ushr pairs
+	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
+	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
+	ret
+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+##  _vpaes_encrypt_core (x86 original: _aes_encrypt_core)
+##
+##  AES-encrypt the block in v7.
+##
+##  Inputs:
+##     v7      = input block
+##     v17-v27 as loaded by _vpaes_encrypt_preheat
+##     x2      = scheduled keys; the round count is read from offset 240
+##
+##  Output in v0
+##  Clobbers  v1-v5, v16, x8-x11
+##  The x86 register names in the per-line comments refer to the original.
+##
+##
+.type	_vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+	mov	x9, $key			// x9 walks the round keys
+	ldr	w8, [$key,#240]			// pull rounds
+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Lenc_entry
+
+.align 4
+.Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40			// x10 = x11 + 0x40 (see the .Lk_mc_backward load below)
+	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+.Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80			// x10 = x11 + 0x80 (see the .Lk_sr load below)
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	GFp_vpaes_encrypt
+.type	GFp_vpaes_encrypt,%function
+.align	4
+GFp_vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!	// save frame pointer and link register
+	add	x29,sp,#0		// x29 = sp
+
+	ld1	{v7.16b}, [$inp]	// _vpaes_encrypt_core reads its input block from v7
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out]	// _vpaes_encrypt_core leaves its result in v0
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
+
+.type	_vpaes_encrypt_2x,%function
+.align 4
+_vpaes_encrypt_2x:
+	mov	x9, $key			// x9 walks the round keys
+	ldr	w8, [$key,#240]			// pull rounds
+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	 and	v9.16b,  v15.16b,  v17.16b	// extra-indented lines: same step for the second block (input v15)
+	 ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	 tbl	v9.16b,  {$iptlo}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	 tbl	v10.16b, {$ipthi}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	 eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	 eor	v8.16b,  v8.16b,   v10.16b
+	b	.Lenc_2x_entry
+
+.align 4
+.Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	 tbl	v12.16b, {$sb1t}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	 tbl	v8.16b,  {$sb1u}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	 eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	 tbl	v13.16b, {$sb2t}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	 eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	 tbl	v10.16b, {$sb2u}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	 tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	 eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	 tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	 eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	 tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	 eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	 eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+.Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	 and	v9.16b,  v8.16b, v17.16b
+	 ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	 tbl	v13.16b, {$invhi},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	 eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	 tbl	v11.16b, {$invlo},v8.16b
+	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	 tbl	v12.16b, {$invlo},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	 eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	 eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	 tbl	v10.16b, {$invlo},v11.16b
+	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	 tbl	v11.16b, {$invlo},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	 eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	 eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	 tbl	v12.16b, {$sbou}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	 tbl	v8.16b,  {$sbot}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	 eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	 eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	 tbl	v1.16b,  {v8.16b},v1.16b	// second block result ends up in v1 (first block in v0)
+	ret
+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+___
+}
+{
+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
+
+$code.=<<___;
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, :pg_hi21:.Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v16.16b, #0x5b			// .Lk_s63
+	adrp	x11, :pg_hi21:.Lk_sb1
+	add	x11, x11, :lo12:.Lk_sb1
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adrp	x10, :pg_hi21:.Lk_dksd
+	add	x10, x10, :lo12:.Lk_dksd
+	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
+	adrp	x11, :pg_hi21:.Lk_mc_forward
+	add	x11, x11, :lo12:.Lk_mc_forward
+	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret					// clobbers x10 and x11; tables stay in v8, v9 and v16-v31
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, :pg_hi21:.Lk_sr		// lea	.Lk_sr(%rip),%r10
+	add	x10, x10, :lo12:.Lk_sr
+
+	add	x8, x8, x10			// x8 (set by the caller) becomes an absolute pointer into .Lk_sr
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
+
+	cmp	$bits, #192			// cmp	\$192,	%esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	$inp, #10			// mov	\$10, %esi
+
+.Loop_schedule_128:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl 	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b 	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+.Lschedule_192:
+	sub	$inp, $inp, #8			// the first load advanced by 16; step back to byte 8 of the key
+	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	$inp, #4			// mov	\$4,	%esi
+
+.Loop_schedule_192:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	$inp, #7			// mov	\$7, %esi
+
+.Loop_schedule_256:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0			// zero v4, as _vpaes_schedule_round does before its low-round part
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, :pg_hi21:.Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, :lo12:.Lk_deskew
+
+	cbnz	$dir, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, :pg_hi21:.Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, :lo12:.Lk_opt
+	add	$out, $out, #32			// add	\$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
+	sub	$out, $out, #16			// add	\$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  _vpaes_schedule_192_smear (x86 original: .aes_schedule_192_smear)
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    v7 (%xmm7): high side, b  a  x  y
+##    v6 (%xmm6):  low side, d  c  0  0
+##    (the x86 zero register %xmm13 is not needed: v1 is zeroed locally)
+##
+##  Outputs:
+##    v6 (%xmm6): b+c+d  b+c  0  0
+##    v0 (%xmm0): b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0		// start from an all-zero v1
+	dup	v0.4s, v7.s[3]		// broadcast the top 32-bit lane of v7
+	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  _vpaes_schedule_round (x86 original: .aes_schedule_round)
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in v7 = v0 (%xmm7 = %xmm0); %xmmN above corresponds to vN.
+##  Clobbers v1-v4 and updates the rcon register v8.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
+	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
+	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7	(recomputes the same XOR instead of copying v0)
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  _vpaes_schedule_transform (x86 original: .aes_schedule_transform)
+##
+##  Linear-transform v0 using the lo/hi tables held in v20/v21
+##  (the x86 original reads the tables from (%r11) instead).
+##  Requires that v17 = 0x0F0F... as in preheat
+##  Output in v0
+##  Clobbers v1, v2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	$out, $out, #16			// add	\$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #64-16			// add	\$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
+	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	GFp_vpaes_set_encrypt_key
+.type	GFp_vpaes_set_encrypt_key,%function
+.align	4
+GFp_vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, $bits, #5		// shr	\$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	$dir, #0		// mov	\$0,%ecx
+	mov	x8, #0x30		// mov	\$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4));	# x0-x4, same order as the prototype below
+my ($ctr, $ctr_tmp) = ("w6", "w7");	# $ctr: counter word after rev; $ctr_tmp: rev'd copy inserted into lane 3
+
+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+#                                     const AES_KEY *key, const uint8_t ivec[16]);
+$code.=<<___;
+.globl	GFp_vpaes_ctr32_encrypt_blocks
+.type	GFp_vpaes_ctr32_encrypt_blocks,%function
+.align	4
+GFp_vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	$len, .Lctr32_done
+
+	// Note, unlike the other functions, $len here is measured in blocks,
+	// not bytes.
+	mov	x17, $len
+	mov	x2,  $key
+
+	// Load the IV and counter portion.
+	ldr	$ctr, [$ivec, #12]
+	ld1	{v7.16b}, [$ivec]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	$ctr, $ctr		// The counter is big-endian.
+	b.eq	.Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [$inp], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v7.s[3], $ctr_tmp
+	b.ls	.Lctr32_done
+
+.Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+
+.Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [$inp], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	$ctr_tmp, $ctr, #1
+	add	$ctr, $ctr, #2
+	rev	$ctr_tmp, $ctr_tmp
+	mov	v14.s[3], $ctr_tmp
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+	b.hi	.Lctr32_loop
+
+.Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
+___
+}
+
+print $code;
+
+close STDOUT or die "error closing STDOUT";
diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
new file mode 100644
index 0000000..7e52ad6
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
@@ -0,0 +1,294 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
+# implements the multiplication algorithm described in:
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+#
+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
+# NEON, the low and high halves of the 128-bit register q0 are accessible as
+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
+# vN. Where the 32-bit version would use the upper half, this file must keep
+# halves in separate registers.
+#
+# The other distinction is in syntax. 32-bit NEON embeds lane information in the
+# instruction name, while AArch64 uses suffixes on the registers. For instance,
+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
+#
+#     vshl.i64 q0, q0, #1
+#
+# in 64-bit, it would be written:
+#
+#     shl v0.2d, v0.2d, #1
+#
+# See Programmer's Guide for ARMv8-A, section 7 for details.
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
+#
+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
+# polynomial and is conditioned on the PMULL extension. This file emulates the
+# latter with the former.
+
+use strict;
+
+my $flavour = shift;	# first argument: perlasm flavour, or directly the output file name
+my $output;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+# With a real flavour, pipe the generated code through arm-xlate.pl; otherwise write $output directly.
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
+    my $dir = $1;
+    my $xlate;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+    # Fail here, with the OS error, rather than only at the final close of STDOUT.
+    open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
+    *STDOUT=*OUT;
+} else {
+    open OUT,">$output" or die "can't open $output: $!";
+    *STDOUT=*OUT;
+}
+
+my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block
+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
+my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
+# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
+# to spare.
+my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
+my ($k48_k32, $k16_k0) = map("v$_", (24..25));
+
+my $code = "";
+
+# clmul64x64 appends to $code an emulation of pmull $r.1q, $a.1d, $b.1d. $r must
+# not alias $a or $b (it is scratch first); none may alias $t* (clobbered) or $k* (masks, read only).
+sub clmul64x64 {
+my ($r, $a, $b) = @_;
+$code .= <<___;
+	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
+	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
+	ext	$r.8b, $b.8b, $b.8b, #1		// B1
+	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
+	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
+	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
+	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
+	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
+	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
+	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
+	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
+	ext	$r.8b, $b.8b, $b.8b, #3		// B3
+	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
+	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	\$t0#hi, \$t0#hi, \$k48
+	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
+	//
+	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	\$t1#hi, \$t1#hi, \$k32
+	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
+	//
+	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	\$t2#hi, \$t2#hi, \$k16
+	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
+	//
+	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	\$t3#hi, #0
+	//
+	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
+	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
+	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d
+	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d
+	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d
+	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d
+	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
+	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
+	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
+	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
+	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
+	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
+
+	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
+	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
+	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
+	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
+	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
+	eor	$t0.16b, $t0.16b, $t1.16b
+	eor	$t2.16b, $t2.16b, $t3.16b
+	eor	$r.16b, $r.16b, $t0.16b
+	eor	$r.16b, $r.16b, $t2.16b
+___
+}
+
+$code .= <<___;
+#include <GFp/arm_arch.h>
+
+.text
+
+.global	GFp_gcm_init_neon
+.type	GFp_gcm_init_neon,%function
+.align	4
+GFp_gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{$t1.2d}, [x1]			// load H
+	movi	$t3.16b, #0xe1
+	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
+	ext	$INlo.16b, $t1.16b, $t1.16b, #8
+	ushr	$t2.2d, $t3.2d, #63
+	dup	$t1.4s, $t1.s[1]
+	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
+	ushr	$t2.2d, $INlo.2d, #63
+	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
+	and	$t2.16b, $t2.16b, $t0.16b
+	shl	$INlo.2d, $INlo.2d, #1
+	ext	$t2.16b, $t2.16b, $t2.16b, #8
+	and	$t0.16b, $t0.16b, $t1.16b
+	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
+	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
+	st1	{$Hlo.2d}, [x0]			// store Htable[0]
+	ret
+.size	GFp_gcm_init_neon,.-GFp_gcm_init_neon
+
+.global	GFp_gcm_gmult_neon
+.type	GFp_gcm_gmult_neon,%function
+.align	4
+GFp_gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{$INlo.16b}, [$Xi]		// load Xi
+	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
+	ld1	{$Hhi.1d}, [$Htbl]
+	adrp	x9, :pg_hi21:.Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
+	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
+	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
+	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing
+
+	mov	$len, #16
+	b	.Lgmult_neon
+.size	GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon
+
+.global	GFp_gcm_ghash_neon
+.type	GFp_gcm_ghash_neon,%function
+.align	4
+GFp_gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{$Xl.16b}, [$Xi]		// load Xi
+	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
+	ld1	{$Hhi.1d}, [$Htbl]
+	adrp	x9, :pg_hi21:.Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
+	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
+	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
+	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing
+
+.Loop_neon:
+	ld1	{$INlo.16b}, [$inp], #16	// load inp
+	rev64	$INlo.16b, $INlo.16b		// byteswap inp
+	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
+	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi
+
+.Lgmult_neon:
+	// Split the input into $INlo and $INhi. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	$INhi.d[0], $INlo.d[1]
+___
+&clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo
+$code .= <<___;
+	eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing
+___
+&clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
+&clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi
+$code .= <<___;
+	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
+	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
+	eor	$Xm.16b, $Xm.16b, $Xh.16b
+	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	$Xh.d[0], $Xm.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	$t1.2d, $Xl.2d, #57		// 1st phase
+	shl	$t2.2d, $Xl.2d, #62
+	eor	$t2.16b, $t2.16b, $t1.16b	//
+	shl	$t1.2d, $Xl.2d, #63
+	eor	$t2.16b, $t2.16b, $t1.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	$t2.16b, $t2.16b, $Xm.16b
+	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
+	eor	$Xh.16b, $Xh.16b,$Xl.16b
+	eor	$Xl.16b, $Xl.16b,$t2.16b	//
+	ushr	$t2.2d, $t2.2d, #6
+	ushr	$Xl.2d, $Xl.2d, #1		//
+	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
+	eor	$Xl.16b, $Xl.16b, $t2.16b	//
+
+	subs	$len, $len, #16
+	bne	.Loop_neon
+
+	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
+	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
+	st1	{$Xl.16b}, [$Xi]
+
+	ret
+.size	GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
+
+.section	.rodata
+.align	4
+.Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
+.align  2
+___
+
+# Write out $code one line at a time, replacing each `expr` span with the result of eval(expr).
+for my $text (split(/\n/,$code)) {
+	$text =~ s/\`([^\`]*)\`/eval $1/geo;
+	print "$text\n";
+}
+close STDOUT or die "error closing STDOUT"; # enforce flush
-- 
Efraim Flashner   <efraim@flashner.co.il>   רנשלפ םירפא
GPG key = A28B F40C 3E55 1372 662D  14F7 41AA E7DC CA3D 8351
Confidentiality cannot be guaranteed on emails sent or received unencrypted
-												gnu: rust-ring-0.16: Add missing files.

* gnu/packages/crates-io.scm (rust-ring-0.16)[source]: Add patch with
missing files. Adjust snippet to delete a pre-generated file.
[arguments]: Add a custom phase to rebuild the pre-generated-file.
[native-inputs]: Add clang, python-2.
* gnu/packages/patches/rust-ring-0.16-missing-files.patch: New file.
* gnu/local.mk (dist_patch_DATA): Register it.

											
										
										
											2023-05-22 19:23:20 +00:00
+								These 4 files exist in the git repository for rust-ring, and are from
 								the same commit where 0.16.20 is taken from. They were not added to the
 								include list in Cargo.toml, so they were not added to the tarball.
 								---
 								 crypto/curve25519/make_curve25519_tables.py   | 222 +++++
 								 crypto/fipsmodule/aes/asm/vpaes-armv7.pl      | 896 ++++++++++++++++++
 								 crypto/fipsmodule/aes/asm/vpaes-armv8.pl      | 837 ++++++++++++++++
 								 .../fipsmodule/modes/asm/ghash-neon-armv8.pl  | 294 ++++++
 								 4 files changed, 2249 insertions(+)
 								 create mode 100755 crypto/curve25519/make_curve25519_tables.py
 								 create mode 100644 crypto/fipsmodule/aes/asm/vpaes-armv7.pl
 								 create mode 100755 crypto/fipsmodule/aes/asm/vpaes-armv8.pl
 								 create mode 100644 crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
 								diff --git a/crypto/curve25519/make_curve25519_tables.py b/crypto/curve25519/make_curve25519_tables.py
 								new file mode 100755
 								index 0000000..50dee2a
 								--- /dev/null
 								+++ b/crypto/curve25519/make_curve25519_tables.py
@@ -0,0 +1,222 @@
 								+#!/usr/bin/env python
 								+# coding=utf-8
 								+# Copyright (c) 2020, Google Inc.
 								+#
 								+# Permission to use, copy, modify, and/or distribute this software for any
 								+# purpose with or without fee is hereby granted, provided that the above
 								+# copyright notice and this permission notice appear in all copies.
 								+#
 								+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 								+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 								+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 								+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 								+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 								+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 								+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 								+
 								+import StringIO
 								+import subprocess
 								+
 								+# Base field Z_p
 								+p = 2**255 - 19
 								+
 								+def modp_inv(x):
 								+    return pow(x, p-2, p)
 								+
 								+# Square root of -1
 								+modp_sqrt_m1 = pow(2, (p-1) // 4, p)
 								+
 								+# Compute corresponding x-coordinate, with low bit corresponding to
 								+# sign, or return None on failure
 								+def recover_x(y, sign):
 								+    if y >= p:
 								+        return None
 								+    x2 = (y*y-1) * modp_inv(d*y*y+1)
 								+    if x2 == 0:
 								+        if sign:
 								+            return None
 								+        else:
 								+            return 0
 								+
 								+    # Compute square root of x2
 								+    x = pow(x2, (p+3) // 8, p)
 								+    if (x*x - x2) % p != 0:
 								+        x = x * modp_sqrt_m1 % p
 								+    if (x*x - x2) % p != 0:
 								+        return None
 								+
 								+    if (x & 1) != sign:
 								+        x = p - x
 								+    return x
 								+
 								+# Curve constant
 								+d = -121665 * modp_inv(121666) % p
 								+
 								+# Base point
 								+g_y = 4 * modp_inv(5) % p
 								+g_x = recover_x(g_y, 0)
 								+
 								+# Points are represented as affine tuples (x, y).
 								+
 								+def point_add(P, Q):
 								+    x1, y1 = P
 								+    x2, y2 = Q
 								+    x3 = ((x1*y2 + y1*x2) * modp_inv(1 + d*x1*x2*y1*y2)) % p
 								+    y3 = ((y1*y2 + x1*x2) * modp_inv(1 - d*x1*x2*y1*y2)) % p
 								+    return (x3, y3)
 								+
 								+# Computes Q = s * P
 								+def point_mul(s, P):
 								+    Q = (0, 1)  # Neutral element
 								+    while s > 0:
 								+        if s & 1:
 								+            Q = point_add(Q, P)
 								+        P = point_add(P, P)
 								+        s >>= 1
 								+    return Q
 								+
 								+def to_bytes(x):
 								+    ret = bytearray(32)
 								+    for i in range(len(ret)):
 								+        ret[i] = x % 256
 								+        x >>= 8
 								+    assert x == 0
 								+    return ret
 								+
 								+def to_ge_precomp(P):
 								+    # typedef struct {
 								+    #   fe_loose yplusx;
 								+    #   fe_loose yminusx;
 								+    #   fe_loose xy2d;
 								+    # } ge_precomp;
 								+    x, y = P
 								+    return ((y + x) % p, (y - x) % p, (x * y * 2 * d) % p)
 								+
 								+def to_base_25_5(x):
 								+    limbs = (26, 25, 26, 25, 26, 25, 26, 25, 26, 25)
 								+    ret = []
 								+    for l in limbs:
 								+        ret.append(x & ((1<<l) - 1))
 								+        x >>= l
 								+    assert x == 0
 								+    return ret
 								+
 								+def to_base_51(x):
 								+    ret = []
 								+    for _ in range(5):
 								+        ret.append(x & ((1<<51) - 1))
 								+        x >>= 51
 								+    assert x == 0
 								+    return ret
 								+
 								+def to_literal(x):
 								+    ret = "{{\n#if defined(BORINGSSL_CURVE25519_64BIT)\n"
 								+    ret += ", ".join(map(str, to_base_51(x)))
 								+    ret += "\n#else\n"
 								+    ret += ", ".join(map(str, to_base_25_5(x)))
 								+    ret += "\n#endif\n}}"
 								+    return ret
 								+
 								+def main():
 								+    d2 = (2 * d) % p
 								+
 								+    small_precomp = bytearray()
 								+    for i in range(1, 16):
 								+        s = (i&1) | ((i&2) << (64-1)) | ((i&4) << (128-2)) | ((i&8) << (192-3))
 								+        P = point_mul(s, (g_x, g_y))
 								+        small_precomp += to_bytes(P[0])
 								+        small_precomp += to_bytes(P[1])
 								+
 								+    large_precomp = []
 								+    for i in range(32):
 								+        large_precomp.append([])
 								+        for j in range(8):
 								+            P = point_mul((j + 1) << (i * 8), (g_x, g_y))
 								+            large_precomp[-1].append(to_ge_precomp(P))
 								+
 								+    bi_precomp = []
 								+    for i in range(8):
 								+        P = point_mul(2*i + 1, (g_x, g_y))
 								+        bi_precomp.append(to_ge_precomp(P))
 								+
 								+
 								+    buf = StringIO.StringIO()
 								+    buf.write("""/* Copyright (c) 2020, Google Inc.
 								+ *
 								+ * Permission to use, copy, modify, and/or distribute this software for any
 								+ * purpose with or without fee is hereby granted, provided that the above
 								+ * copyright notice and this permission notice appear in all copies.
 								+ *
 								+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 								+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 								+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 								+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 								+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 								+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 								+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 								+
 								+// This file is generated from
 								+//    ./make_curve25519_tables.py > curve25519_tables.h
 								+
 								+
 								+static const fe d = """)
 								+    buf.write(to_literal(d))
 								+    buf.write(""";
 								+
 								+static const fe sqrtm1 = """)
 								+    buf.write(to_literal(modp_sqrt_m1))
 								+    buf.write(""";
 								+
 								+static const fe d2 = """)
 								+    buf.write(to_literal(d2))
 								+    buf.write(""";
 								+
 								+#if defined(OPENSSL_SMALL)
 								+
 								+// This block of code replaces the standard base-point table with a much smaller
 								+// one. The standard table is 30,720 bytes while this one is just 960.
 								+//
 								+// This table contains 15 pairs of group elements, (x, y), where each field
 								+// element is serialised with |fe_tobytes|. If |i| is the index of the group
 								+// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀
 								+// is the most significant bit). The value of the group element is then:
 								+// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator.
 								+static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""")
 								+    for i, b in enumerate(small_precomp):
 								+        buf.write("0x%02x, " % b)
 								+    buf.write("""
 								+};
 								+
 								+#else
 								+
 								+// k25519Precomp[i][j] = (j+1)*256^i*B
 								+static const ge_precomp k25519Precomp[32][8] = {
 								+""")
 								+    for child in large_precomp:
 								+        buf.write("{\n")
 								+        for val in child:
 								+            buf.write("{\n")
 								+            for term in val:
 								+                buf.write(to_literal(term) + ",\n")
 								+            buf.write("},\n")
 								+        buf.write("},\n")
 								+    buf.write("""};
 								+
 								+#endif  // OPENSSL_SMALL
 								+
 								+// Bi[i] = (2*i+1)*B
 								+static const ge_precomp Bi[8] = {
 								+""")
 								+    for val in bi_precomp:
 								+        buf.write("{\n")
 								+        for term in val:
 								+                buf.write(to_literal(term) + ",\n")
 								+        buf.write("},\n")
 								+    buf.write("""};
 								+""")
 								+
 								+    proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE)
 								+    proc.communicate(buf.getvalue())
 								+
 								+if __name__ == "__main__":
 								+    main()
 								diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
 								new file mode 100644
 								index 0000000..d36a97a
 								--- /dev/null
 								+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -0,0 +1,896 @@
 								+#! /usr/bin/env perl
 								+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
 								+#
 								+# Licensed under the OpenSSL license (the "License").  You may not use
 								+# this file except in compliance with the License.  You can obtain a copy
 								+# in the file LICENSE in the source distribution or at
 								+# https://www.openssl.org/source/license.html
 								+
 								+
 								+######################################################################
 								+## Constant-time SSSE3 AES core implementation.
 								+## version 0.1
 								+##
 								+## By Mike Hamburg (Stanford University), 2009
 								+## Public domain.
 								+##
 								+## For details see http://shiftleft.org/papers/vector_aes/ and
 								+## http://crypto.stanford.edu/vpaes/.
 								+##
 								+######################################################################
 								+# Adapted from the original x86_64 version and <appro@openssl.org>'s ARMv8
 								+# version.
 								+#
 								+# armv7, aarch64, and x86_64 differ in several ways:
 								+#
 								+# * x86_64 SSSE3 instructions are two-address (destination operand is also a
 								+#   source), while NEON is three-address (destination operand is separate from
 								+#   two sources).
 								+#
 								+# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16.
 								+#
 								+# * x86_64 instructions can take memory references, while ARM is a load/store
 								+#   architecture. This means we sometimes need a spare register.
 								+#
 								+# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb),
 								+#   while armv7 only has a 64-bit byte shuffle (vtbl).
 								+#
 								+# This means this armv7 version must be a mix of both aarch64 and x86_64
 								+# implementations. armv7 and aarch64 have analogous SIMD instructions, so we
 								+# base the instructions on aarch64. However, we cannot use aarch64's register
 								+# allocation. x86_64's register count matches, but x86_64 is two-address.
 								+# vpaes-armv8.pl already accounts for this in the comments, which use
 								+# three-address AVX instructions instead of the original SSSE3 ones. We base
 								+# register usage on these comments, which are preserved in this file.
 								+#
 								+# This means we do not use separate input and output registers as in aarch64 and
 								+# cannot pin as many constants in the preheat functions. However, the load/store
 								+# architecture means we must still deviate from x86_64 in places.
 								+#
 								+# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source
 								+# and destination and 128-bit table. Fortunately, armv7 also allows addressing
 								+# upper and lower halves of each 128-bit register. The lower half of q{N} is
 								+# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent
 								+# instruction,
 								+#
 								+#     vtbl.8 q0, q1, q2   @ Index each of q2's 16 bytes into q1. Store in q0.
 								+#
 								+# we write:
 								+#
 								+#     vtbl.8 d0, q1, d4   @ Index each of d4's 8 bytes into q1. Store in d0.
 								+#     vtbl.8 d1, q1, d5   @ Index each of d5's 8 bytes into q1. Store in d1.
 								+#
 								+# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and
 								+# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note,
 								+# however, that destination (q0) and table (q1) registers may no longer match.
 								+# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
 								+# two-address pshufb always matched these operands, so this is common.)
 								+#
 								+# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
 								+# expands to an ADD or SUB of the pc register to find an address. That immediate
 								+# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
 								+# This means larger values must be more aligned.
 								+#
 								+# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
 								+# use either encoding (do we actually need to support this?). In ARM mode, the
 								+# distances get large enough to require 16-byte alignment. Moving constants
 								+# closer to their use resolves most of this, but common constants in
 								+# _vpaes_consts are used by the whole file. Affected ADR instructions must be
 								+# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
 								+# constraint have been commented.
 								+#
 								+# For details on ARM's immediate value encoding scheme, see
 								+# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
 								+#
 								+# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
 								+#
 								+# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
 								+#
 								+# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones).
 								+#   aarch64 names registers like v0, and denotes half-width operations in an
 								+#   instruction suffix (see below).
 								+#
 								+# * aarch64 embeds size and lane information in register suffixes. v0.16b is
 								+#   16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s.
 								+#   armv7 embeds the total size in the register name (see above) and the size of
 								+#   each element in an instruction suffix, which may look like vmov.i8,
 								+#   vshr.u8, or vtbl.8, depending on instruction.
 								+
 								+use strict;
 								+
 								+my $flavour = shift;
 								+my $output;
 								+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 								+
 								+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
 								+my $dir=$1;
 								+my $xlate;
 								+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
 								+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
 								+die "can't locate arm-xlate.pl";
 								+
 								+open OUT,"| \"$^X\" $xlate $flavour $output";
 								+*STDOUT=*OUT;
 								+
 								+my $code = "";
 								+
 								+$code.=<<___;
 								+.syntax	unified
 								+
 								+.arch	armv7-a
 								+.fpu	neon
 								+
 								+#if defined(__thumb2__)
 								+.thumb
 								+#else
 								+.code	32
 								+#endif
 								+
 								+.text
 								+
 								+.type	_vpaes_consts,%object
 								+.align	7	@ totally strategic alignment
 								+_vpaes_consts:
 								+.Lk_mc_forward:	@ mc_forward
 								+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
 								+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
 								+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
 								+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
 								+.Lk_mc_backward:@ mc_backward
 								+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
 								+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
 								+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
 								+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
 								+.Lk_sr:		@ sr
 								+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
 								+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
 								+	.quad	0x0F060D040B020900, 0x070E050C030A0108
 								+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
 								+
 								+@
 								+@ "Hot" constants
 								+@
 								+.Lk_inv:	@ inv, inva
 								+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
 								+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
 								+.Lk_ipt:	@ input transform (lo, hi)
 								+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
 								+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
 								+.Lk_sbo:	@ sbou, sbot
 								+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
 								+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
 								+.Lk_sb1:	@ sb1u, sb1t
 								+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
 								+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
 								+.Lk_sb2:	@ sb2u, sb2t
 								+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
 								+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
 								+
 								+.asciz  "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
 								+.size	_vpaes_consts,.-_vpaes_consts
 								+.align	6
 								+___
 								+
 								+{
 								+my ($inp,$out,$key) = map("r$_", (0..2));
 								+
 								+my ($invlo,$invhi) = map("q$_", (10..11));
 								+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15));
 								+
 								+$code.=<<___;
 								+@@
 								+@@  _vpaes_preheat
 								+@@
 								+@@  Fills q9-q15 as specified below.
 								+@@
 								+.type	_vpaes_preheat,%function
 								+.align	4
 								+_vpaes_preheat:
 								+	adr	r10, .Lk_inv
 								+	vmov.i8	q9, #0x0f		@ .Lk_s0F
 								+	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv
 								+	add	r10, r10, #64		@ Skip .Lk_ipt, .Lk_sbo
 								+	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
 								+	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
 								+	bx	lr
 								+
 								+@@
 								+@@  _vpaes_encrypt_core
 								+@@
 								+@@  AES-encrypt q0.
 								+@@
 								+@@  Inputs:
 								+@@     q0 = input
 								+@@     q9-q15 as in _vpaes_preheat
 								+@@    [$key] = scheduled keys
 								+@@
 								+@@  Output in q0
 								+@@  Clobbers  q1-q5, r8-r11
 								+@@  Preserves q6-q8 so you get some local vectors
 								+@@
 								+@@
 								+.type	_vpaes_encrypt_core,%function
 								+.align 4
 								+_vpaes_encrypt_core:
 								+	mov	r9, $key
 								+	ldr	r8, [$key,#240]		@ pull rounds
 								+	adr	r11, .Lk_ipt
 								+	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
 								+	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
 								+	vld1.64	{q2, q3}, [r11]
 								+	adr	r11, .Lk_mc_forward+16
 								+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
 								+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
 								+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0
 								+	vtbl.8	q1#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm2,	%xmm1
 								+	vtbl.8	q1#hi, {q2}, q1#hi
 								+	vtbl.8	q2#lo, {q3}, q0#lo	@ vpshufb	%xmm0,	%xmm3,	%xmm2
 								+	vtbl.8	q2#hi, {q3}, q0#hi
 								+	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
 								+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
 								+
 								+	@ .Lenc_entry ends with a bne instruction which is normally paired with
 								+	@ subs in .Lenc_loop.
 								+	tst	r8, r8
 								+	b	.Lenc_entry
 								+
 								+.align 4
 								+.Lenc_loop:
 								+	@ middle of middle round
 								+	add	r10, r11, #0x40
 								+	vtbl.8	q4#lo, {$sb1t}, q2#lo	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
 								+	vtbl.8	q4#hi, {$sb1t}, q2#hi
 								+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
 								+	vtbl.8	q0#lo, {$sb1u}, q3#lo	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
 								+	vtbl.8	q0#hi, {$sb1u}, q3#hi
 								+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	vtbl.8	q5#lo, {$sb2t}, q2#lo	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
 								+	vtbl.8	q5#hi, {$sb2t}, q2#hi
 								+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	vtbl.8	q2#lo, {$sb2u}, q3#lo	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
 								+	vtbl.8	q2#hi, {$sb2u}, q3#hi
 								+	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
 								+	vtbl.8	q3#lo, {q0}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
 								+	vtbl.8	q3#hi, {q0}, q1#hi
 								+	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
 								+	@ Write to q5 instead of q0, so the table and destination registers do
 								+	@ not overlap.
 								+	vtbl.8	q5#lo, {q0}, q4#lo	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
 								+	vtbl.8	q5#hi, {q0}, q4#hi
 								+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
 								+	vtbl.8	q4#lo, {q3}, q1#lo	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
 								+	vtbl.8	q4#hi, {q3}, q1#hi
 								+	@ Here we restore the original q0/q5 usage.
 								+	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
 								+	and	r11, r11, #~(1<<6)	@ and		\$0x30,	%r11		# ... mod 4
 								+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
 								+	subs	r8, r8, #1		@ nr--
 								+
 								+.Lenc_entry:
 								+	@ top of round
 								+	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
 								+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
 								+	vtbl.8	q5#lo, {$invhi}, q1#lo	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
 								+	vtbl.8	q5#hi, {$invhi}, q1#hi
 								+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
 								+	vtbl.8	q3#lo, {$invlo}, q0#lo	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
 								+	vtbl.8	q3#hi, {$invlo}, q0#hi
 								+	vtbl.8	q4#lo, {$invlo}, q1#lo	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
 								+	vtbl.8	q4#hi, {$invlo}, q1#hi
 								+	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
 								+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
 								+	vtbl.8	q2#lo, {$invlo}, q3#lo	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
 								+	vtbl.8	q2#hi, {$invlo}, q3#hi
 								+	vtbl.8	q3#lo, {$invlo}, q4#lo	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
 								+	vtbl.8	q3#hi, {$invlo}, q4#hi
 								+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
 								+	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
 								+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
 								+	bne	.Lenc_loop
 								+
 								+	@ middle of last round
 								+	add	r10, r11, #0x80
 								+
 								+	adr	r11, .Lk_sbo
 								+	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
 								+	@ overlap table and destination registers.
 								+	vld1.64 {q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
 								+	vld1.64 {q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
 								+	vtbl.8	q4#lo, {q1}, q2#lo	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
 								+	vtbl.8	q4#hi, {q1}, q2#hi
 								+	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
 								+	@ Write to q2 instead of q0 below, to avoid overlapping table and
 								+	@ destination registers.
 								+	vtbl.8	q2#lo, {q0}, q3#lo	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
 								+	vtbl.8	q2#hi, {q0}, q3#hi
 								+	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	@ Here we restore the original q0/q2 usage.
 								+	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm0
 								+	vtbl.8	q0#hi, {q2}, q1#hi
 								+	bx	lr
 								+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
 								+
 								+.globl	GFp_vpaes_encrypt
 								+.type	GFp_vpaes_encrypt,%function
 								+.align	4
 								+GFp_vpaes_encrypt:
 								+	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
 								+	@ alignment.
 								+	stmdb	sp!, {r7-r11,lr}
 								+	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
 								+	vstmdb	sp!, {d8-d11}
 								+
 								+	vld1.64	{q0}, [$inp]
 								+	bl	_vpaes_preheat
 								+	bl	_vpaes_encrypt_core
 								+	vst1.64	{q0}, [$out]
 								+
 								+	vldmia	sp!, {d8-d11}
 								+	ldmia	sp!, {r7-r11, pc}	@ return
 								+.size	GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
 								+___
 								+}
 								+{
 								+my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3");
 								+my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12));
 								+
 								+$code.=<<___;
 								+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 								+@@                                                    @@
 								+@@                  AES key schedule                  @@
 								+@@                                                    @@
 								+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 								+
 								+@ This function diverges from both x86_64 and aarch64 in which constants are
 								+@ pinned. x86_64 has a common preheat function for all operations. aarch64
 								+@ separates them because it has enough registers to pin nearly all constants.
 								+@ armv7 does not have enough registers, but needing explicit loads and stores
 								+@ also complicates using x86_64's register allocation directly.
 								+@
 								+@ We pin some constants for convenience and leave q14 and q15 free to load
 								+@ others on demand.
 								+
 								+@
 								+@  Key schedule constants
 								+@
 								+.type	_vpaes_key_consts,%object
 								+.align	4
 								+_vpaes_key_consts:
 								+.Lk_rcon:	@ rcon
 								+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
 								+
 								+.Lk_opt:	@ output transform
 								+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
 								+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
 								+.Lk_deskew:	@ deskew tables: inverts the sbox's "skew"
 								+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
 								+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
 								+.size	_vpaes_key_consts,.-_vpaes_key_consts
 								+
 								+.type	_vpaes_key_preheat,%function
 								+.align	4
 								+_vpaes_key_preheat:
 								+	adr	r11, .Lk_rcon
 								+	vmov.i8	$s63, #0x5b			@ .Lk_s63
 								+	adr	r10, .Lk_inv			@ Must be aligned to 8 mod 16.
 								+	vmov.i8	$s0F, #0x0f			@ .Lk_s0F
 								+	vld1.64	{$invlo,$invhi}, [r10]		@ .Lk_inv
 								+	vld1.64	{$rcon}, [r11]			@ .Lk_rcon
 								+	bx	lr
 								+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
 								+
 								+.type	_vpaes_schedule_core,%function
 								+.align	4
 								+_vpaes_schedule_core:
 								+	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
 								+	@ so save an extra register.
 								+	stmdb	sp!, {r3,lr}
 								+
 								+	bl	_vpaes_key_preheat	@ load the tables
 								+
 								+	adr	r11, .Lk_ipt		@ Must be aligned to 8 mod 16.
 								+	vld1.64	{q0}, [$inp]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
 								+
 								+	@ input transform
 								+	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
 								+	@ overlap table and destination.
 								+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
 								+	bl	_vpaes_schedule_transform
 								+	adr	r10, .Lk_sr		@ Must be aligned to 8 mod 16.
 								+	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
 								+
 								+	add	r8, r8, r10
 								+
 								+	@ encrypting, output zeroth round key after transform
 								+	vst1.64	{q0}, [$out]		@ vmovdqu	%xmm0,	(%rdx)
 								+
 								+	@ *ring*: Decryption removed.
 								+
 								+.Lschedule_go:
 								+	cmp	$bits, #192		@ cmp	\$192,	%esi
 								+	bhi	.Lschedule_256
 								+	@ 128: fall through
 								+
 								+@@
 								+@@  .schedule_128
 								+@@
 								+@@  128-bit specific part of key schedule.
 								+@@
 								+@@  This schedule is really simple, because all its parts
 								+@@  are accomplished by the subroutines.
 								+@@
 								+.Lschedule_128:
 								+	mov	$inp, #10		@ mov	\$10, %esi
 								+
 								+.Loop_schedule_128:
 								+	bl 	_vpaes_schedule_round
 								+	subs	$inp, $inp, #1		@ dec	%esi
 								+	beq 	.Lschedule_mangle_last
 								+	bl	_vpaes_schedule_mangle	@ write output
 								+	b 	.Loop_schedule_128
 								+
 								+@@
 								+@@  .aes_schedule_256
 								+@@
 								+@@  256-bit specific part of key schedule.
 								+@@
 								+@@  The structure here is very similar to the 128-bit
 								+@@  schedule, but with an additional "low side" in
 								+@@  q6.  The low side's rounds are the same as the
 								+@@  high side's, except no rcon and no rotation.
 								+@@
 								+.align	4
 								+.Lschedule_256:
 								+	vld1.64	{q0}, [$inp]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
 								+	bl	_vpaes_schedule_transform	@ input transform
 								+	mov	$inp, #7			@ mov	\$7, %esi
 								+
 								+.Loop_schedule_256:
 								+	bl	_vpaes_schedule_mangle		@ output low result
 								+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
 								+
 								+	@ high round
 								+	bl	_vpaes_schedule_round
 								+	subs	$inp, $inp, #1			@ dec	%esi
 								+	beq 	.Lschedule_mangle_last
 								+	bl	_vpaes_schedule_mangle
 								+
 								+	@ low round. swap xmm7 and xmm6
 								+	vdup.32	q0, q0#hi[1]		@ vpshufd	\$0xFF,	%xmm0,	%xmm0
 								+	vmov.i8	q4, #0
 								+	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
 								+	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
 								+	bl	_vpaes_schedule_low_round
 								+	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
 								+
 								+	b	.Loop_schedule_256
 								+
 								+@@
 								+@@  .aes_schedule_mangle_last
 								+@@
 								+@@  Mangler for last round of key schedule
 								+@@  Mangles q0
 								+@@    when encrypting, outputs out(q0) ^ 63
 								+@@    when decrypting, outputs unskew(q0)
 								+@@
 								+@@  Always called right before return... jumps to cleanup and exits
 								+@@
 								+.align	4
 								+.Lschedule_mangle_last:
 								+	@ schedule last round key from xmm0
 								+	adr	r11, .Lk_deskew			@ lea	.Lk_deskew(%rip),%r11	# prepare to deskew
 								+
 								+	@ encrypting
 								+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
 								+	adr	r11, .Lk_opt		@ lea		.Lk_opt(%rip),	%r11		# prepare to output transform
 								+	add	$out, $out, #32		@ add		\$32,	%rdx
 								+	vmov	q2, q0
 								+	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
 								+	vtbl.8	q0#hi, {q2}, q1#hi
 								+
 								+.Lschedule_mangle_last_dec:
 								+	sub	$out, $out, #16			@ add	\$-16,	%rdx
 								+	veor	q0, q0, $s63			@ vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
 								+	bl	_vpaes_schedule_transform	@ output transform
 								+	vst1.64	{q0}, [$out]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
 								+
 								+	@ cleanup
 								+	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
 								+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
 								+	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
 								+	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
 								+	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
 								+	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
 								+	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
 								+	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
 								+	ldmia	sp!, {r3,pc}		@ return
 								+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
 								+
 								+@@
 								+@@  .aes_schedule_round
 								+@@
 								+@@  Runs one main round of the key schedule on q0, q7
 								+@@
 								+@@  Specifically, runs subbytes on the high dword of q0
 								+@@  then rotates it by one byte and xors into the low dword of
 								+@@  q7.
 								+@@
 								+@@  Adds rcon from low byte of q8, then rotates q8 for
 								+@@  next rcon.
 								+@@
 								+@@  Smears the dwords of q7 by xoring the low into the
 								+@@  second low, result into third, result into highest.
 								+@@
 								+@@  Returns results in q7 = q0.
 								+@@  Clobbers q1-q4, q14-q15, r11.
 								+@@
 								+.type	_vpaes_schedule_round,%function
 								+.align	4
 								+_vpaes_schedule_round:
 								+	@ extract rcon from xmm8
 								+	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
 								+	vext.8	q1, $rcon, q4, #15		@ vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
 								+	vext.8	$rcon, $rcon, $rcon, #15	@ vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
 								+	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
 								+
 								+	@ rotate
 								+	vdup.32	q0, q0#hi[1]			@ vpshufd	\$0xFF,	%xmm0,	%xmm0
 								+	vext.8	q0, q0, q0, #1			@ vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
 								+
 								+	@ fall through...
 								+
 								+	@ low round: same as high round, but no rotation and no rcon.
 								+_vpaes_schedule_low_round:
 								+	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
 								+	@ We pin other values in _vpaes_key_preheat, so load them now.
 								+	adr	r11, .Lk_sb1
 								+	vld1.64	{q14,q15}, [r11]
 								+
 								+	@ smear xmm7
 								+	vext.8	q1, q4, q7, #12			@ vpslldq	\$4,	%xmm7,	%xmm1
 								+	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
 								+	vext.8	q4, q4, q7, #8			@ vpslldq	\$8,	%xmm7,	%xmm4
 								+
 								+	@ subbytes
 								+	vand	q1, q0, $s0F			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
 								+	vshr.u8	q0, q0, #4			@ vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
 								+	 veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
 								+	vtbl.8	q2#lo, {$invhi}, q1#lo		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
 								+	vtbl.8	q2#hi, {$invhi}, q1#hi
 								+	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
 								+	vtbl.8	q3#lo, {$invlo}, q0#lo		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
 								+	vtbl.8	q3#hi, {$invlo}, q0#hi
 								+	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
 								+	vtbl.8	q4#lo, {$invlo}, q1#lo		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
 								+	vtbl.8	q4#hi, {$invlo}, q1#hi
 								+	 veor	q7, q7, $s63			@ vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
 								+	vtbl.8	q3#lo, {$invlo}, q3#lo		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
 								+	vtbl.8	q3#hi, {$invlo}, q3#hi
 								+	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
 								+	vtbl.8	q2#lo, {$invlo}, q4#lo		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
 								+	vtbl.8	q2#hi, {$invlo}, q4#hi
 								+	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
 								+	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
 								+	vtbl.8	q4#lo, {q15}, q3#lo		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
 								+	vtbl.8	q4#hi, {q15}, q3#hi
 								+	vtbl.8	q1#lo, {q14}, q2#lo		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
 								+	vtbl.8	q1#hi, {q14}, q2#hi
 								+	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
 								+
 								+	@ add in smeared stuff
 								+	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
 								+	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
 								+	bx	lr
 								+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
 								+
 								+@@
 								+@@  .aes_schedule_transform
 								+@@
 								+@@  Linear-transform q0 according to tables at [r11]
 								+@@
 								+@@  Requires that q9 = 0x0F0F... as in preheat
 								+@@  Output in q0
 								+@@  Clobbers q1, q2, q14, q15
 								+@@
 								+.type	_vpaes_schedule_transform,%function
 								+.align	4
 								+_vpaes_schedule_transform:
 								+	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
 								+					@ vmovdqa	16(%r11),	%xmm1 # hi
 								+	vand	q1, q0, $s0F		@ vpand	%xmm9,	%xmm0,	%xmm1
 								+	vshr.u8	q0, q0, #4		@ vpsrlb	\$4,	%xmm0,	%xmm0
 								+	vtbl.8	q2#lo, {q14}, q1#lo	@ vpshufb	%xmm1,	%xmm2,	%xmm2
 								+	vtbl.8	q2#hi, {q14}, q1#hi
 								+	vtbl.8	q0#lo, {q15}, q0#lo	@ vpshufb	%xmm0,	%xmm1,	%xmm0
 								+	vtbl.8	q0#hi, {q15}, q0#hi
 								+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
 								+	bx	lr
 								+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
 								+
 								+@@
 								+@@  .aes_schedule_mangle
 								+@@
 								+@@  Mangles q0 from (basis-transformed) standard version
 								+@@  to our version.
 								+@@
 								+@@  On encrypt,
 								+@@    xor with 0x63
 								+@@    multiply by circulant 0,1,1,1
 								+@@    apply shiftrows transform
 								+@@
 								+@@  On decrypt,
 								+@@    xor with 0x63
 								+@@    multiply by "inverse mixcolumns" circulant E,B,D,9
 								+@@    deskew
 								+@@    apply shiftrows transform
 								+@@
 								+@@
 								+@@  Writes out to [r2], and increments or decrements it
 								+@@  Keeps track of round number mod 4 in r8
 								+@@  Preserves q0
 								+@@  Clobbers q1-q5
 								+@@
 								+.type	_vpaes_schedule_mangle,%function
 								+.align	4
 								+_vpaes_schedule_mangle:
 								+	tst	$dir, $dir
 								+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
 								+	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
 								+	vld1.64	{q5}, [r11]		@ vmovdqa	.Lk_mc_forward(%rip),%xmm5
 								+
 								+	@ encrypting
 								+	@ Write to q2 so we do not overlap table and destination below.
 								+	veor	q2, q0, $s63		@ vpxor		.Lk_s63(%rip),	%xmm0,	%xmm4
 								+	add	$out, $out, #16		@ add		\$16,	%rdx
 								+	vtbl.8	q4#lo, {q2}, q5#lo	@ vpshufb	%xmm5,	%xmm4,	%xmm4
 								+	vtbl.8	q4#hi, {q2}, q5#hi
 								+	vtbl.8	q1#lo, {q4}, q5#lo	@ vpshufb	%xmm5,	%xmm4,	%xmm1
 								+	vtbl.8	q1#hi, {q4}, q5#hi
 								+	vtbl.8	q3#lo, {q1}, q5#lo	@ vpshufb	%xmm5,	%xmm1,	%xmm3
 								+	vtbl.8	q3#hi, {q1}, q5#hi
 								+	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
 								+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
 								+	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
 								+
 								+.Lschedule_mangle_both:
 								+	@ Write to q2 so table and destination do not overlap.
 								+	vtbl.8	q2#lo, {q3}, q1#lo	@ vpshufb	%xmm1,	%xmm3,	%xmm3
 								+	vtbl.8	q2#hi, {q3}, q1#hi
 								+	add	r8, r8, #64-16		@ add	\$-16,	%r8
 								+	and	r8, r8, #~(1<<6)	@ and	\$0x30,	%r8
 								+	vst1.64	{q2}, [$out]		@ vmovdqu	%xmm3,	(%rdx)
 								+	bx	lr
 								+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
 								+
 								+.globl	GFp_vpaes_set_encrypt_key
 								+.type	GFp_vpaes_set_encrypt_key,%function
 								+.align	4
 								+GFp_vpaes_set_encrypt_key:
 								+	stmdb	sp!, {r7-r11, lr}
 								+	vstmdb	sp!, {d8-d15}
 								+
 								+	lsr	r9, $bits, #5		@ shr	\$5,%eax
 								+	add	r9, r9, #5		@ \$5,%eax
 								+	str	r9, [$out,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
 								+
 								+	mov	$dir, #0		@ mov	\$0,%ecx
 								+	mov	r8, #0x30		@ mov	\$0x30,%r8d
 								+	bl	_vpaes_schedule_core
 								+	eor	r0, r0, r0
 								+
 								+	vldmia	sp!, {d8-d15}
 								+	ldmia	sp!, {r7-r11, pc}	@ return
 								+.size	GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
 								+___
 								+}
 								+
 								+{
 								+my ($out, $inp) = map("r$_", (0..1));
 								+my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
 								+
 								+$code .= <<___;
 								+
 								+@ Additional constants for converting to bsaes.
 								+.type	_vpaes_convert_consts,%object
 								+.align	4
 								+_vpaes_convert_consts:
 								+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
 								+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
 								+@ table. This was computed with the following script:
 								+@
 								+@   def u64s_to_u128(x, y):
 								+@       return x | (y << 64)
 								+@   def u128_to_u64s(w):
 								+@       return w & ((1<<64)-1), w >> 64
 								+@   def get_byte(w, i):
 								+@       return (w >> (i*8)) & 0xff
 								+@   def apply_table(table, b):
 								+@       lo = b & 0xf
 								+@       hi = b >> 4
 								+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
 								+@   def opt(b):
 								+@       table = [
 								+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
 								+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
 								+@       ]
 								+@       return apply_table(table, b)
 								+@   def rot_byte(b, n):
 								+@       return 0xff & ((b << n) | (b >> (8-n)))
 								+@   def skew(x):
 								+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
 								+@               rot_byte(x, 4))
 								+@   table = [0, 0]
 								+@   for i in range(16):
 								+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
 								+@       table[1] |= skew(opt(i<<4)) << (i*8)
 								+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
 								+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
 								+.Lk_opt_then_skew:
 								+	.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
 								+	.quad	0x1f30062936192f00, 0xb49bad829db284ab
 								+
 								+@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
 								+.globl	GFp_vpaes_encrypt_key_to_bsaes
 								+.type	GFp_vpaes_encrypt_key_to_bsaes,%function
 								+.align	4
 								+GFp_vpaes_encrypt_key_to_bsaes:
 								+	stmdb	sp!, {r11, lr}
 								+
 								+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
 								+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
 								+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
 								+	@ contain the transformations not in the bsaes representation. This
 								+	@ function inverts those transforms.
 								+	@
 								+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
 								+	@ representation, which does not match the other aes_nohw_*
 								+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
 								+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
 								+	@ cost of extra REV and VREV32 operations in little-endian ARM.
 								+
 								+	vmov.i8	$s0F, #0x0f		@ Required by _vpaes_schedule_transform
 								+	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
 								+	add	r3, r2, 0x90		@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
 								+
 								+	vld1.64	{$mc_forward}, [r2]
 								+	vmov.i8	$s63, #0x5b		@ .Lk_s63 from vpaes-x86_64
 								+	adr	r11, .Lk_opt		@ Must be aligned to 8 mod 16.
 								+	vmov.i8	$s63_raw, #0x63		@ .Lk_s63 without .Lk_ipt applied
 								+
 								+	@ vpaes stores one fewer round count than bsaes, but the number of keys
 								+	@ is the same.
 								+	ldr	r2, [$inp,#240]
 								+	add	r2, r2, #1
 								+	str	r2, [$out,#240]
 								+
 								+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
 								+	@ Invert this with .Lk_opt.
 								+	vld1.64	{q0}, [$inp]!
 								+	bl	_vpaes_schedule_transform
 								+	vrev32.8	q0, q0
 								+	vst1.64	{q0}, [$out]!
 								+
 								+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
 								+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
 								+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
 								+.Loop_enc_key_to_bsaes:
 								+	vld1.64	{q0}, [$inp]!
 								+
 								+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
 								+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
 								+	@ We use r3 rather than r8 to avoid a callee-saved register.
 								+	vld1.64	{q1}, [r3]
 								+	vtbl.8  q2#lo, {q0}, q1#lo
 								+	vtbl.8  q2#hi, {q0}, q1#hi
 								+	add	r3, r3, #16
 								+	and	r3, r3, #~(1<<6)
 								+	vmov	q0, q2
 								+
 								+	@ Handle the last key differently.
 								+	subs	r2, r2, #1
 								+	beq	.Loop_enc_key_to_bsaes_last
 								+
 								+	@ Multiply by the circulant. This is its own inverse.
 								+	vtbl.8	q1#lo, {q0}, $mc_forward#lo
 								+	vtbl.8	q1#hi, {q0}, $mc_forward#hi
 								+	vmov	q0, q1
 								+	vtbl.8	q2#lo, {q1}, $mc_forward#lo
 								+	vtbl.8	q2#hi, {q1}, $mc_forward#hi
 								+	veor	q0, q0, q2
 								+	vtbl.8	q1#lo, {q2}, $mc_forward#lo
 								+	vtbl.8	q1#hi, {q2}, $mc_forward#hi
 								+	veor	q0, q0, q1
 								+
 								+	@ XOR and finish.
 								+	veor	q0, q0, $s63
 								+	bl	_vpaes_schedule_transform
 								+	vrev32.8	q0, q0
 								+	vst1.64	{q0}, [$out]!
 								+	b	.Loop_enc_key_to_bsaes
 								+
 								+.Loop_enc_key_to_bsaes_last:
 								+	@ The final key does not have a basis transform (note
 								+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
 								+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
 								+	@ loop. Note that, because we act on the original representation, we use
 								+	@ $s63_raw, not $s63.
 								+	veor	q0, q0, $s63_raw
 								+	vrev32.8	q0, q0
 								+	vst1.64	{q0}, [$out]
 								+
 								+	@ Wipe registers which contained key material.
 								+	veor	q0, q0, q0
 								+	veor	q1, q1, q1
 								+	veor	q2, q2, q2
 								+
 								+	ldmia	sp!, {r11, pc}	@ return
 								+.size	GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes
 								+___
 								+}
 								+
 								+{
 								+# Register-passed parameters.
 								+my ($inp, $out, $len, $key) = map("r$_", 0..3);
 								+# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
 								+# $tmp. $ctr is r7 because it must be preserved across calls.
 								+my ($ctr, $ivec, $tmp) = map("r$_", 7..9);
 								+
 								+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
 								+#                                     const AES_KEY *key, const uint8_t ivec[16]);
 								+$code .= <<___;
 								+.globl	GFp_vpaes_ctr32_encrypt_blocks
 								+.type	GFp_vpaes_ctr32_encrypt_blocks,%function
 								+.align	4
 								+GFp_vpaes_ctr32_encrypt_blocks:
 								+	mov	ip, sp
 								+	stmdb	sp!, {r7-r11, lr}
 								+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
 								+	vstmdb	sp!, {d8-d15}
 								+
 								+	cmp	$len, #0
 								+	@ $ivec is passed on the stack.
 								+	ldr	$ivec, [ip]
 								+	beq	.Lctr32_done
 								+
 								+	@ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
 								+	mov	$tmp, $key
 								+	mov	$key, $len
 								+	mov	$len, $tmp
 								+___
 								+my ($len, $key) = ($key, $len);
 								+$code .= <<___;
 								+
 								+	@ Load the IV and counter portion.
 								+	ldr	$ctr, [$ivec, #12]
 								+	vld1.8	{q7}, [$ivec]
 								+
 								+	bl	_vpaes_preheat
 								+	rev	$ctr, $ctr		@ The counter is big-endian.
 								+
 								+.Lctr32_loop:
 								+	vmov	q0, q7
 								+	vld1.8	{q6}, [$inp]!		@ Load input ahead of time
 								+	bl	_vpaes_encrypt_core
 								+	veor	q0, q0, q6		@ XOR input and result
 								+	vst1.8	{q0}, [$out]!
 								+	subs	$len, $len, #1
 								+	@ Update the counter.
 								+	add	$ctr, $ctr, #1
 								+	rev	$tmp, $ctr
 								+	vmov.32	q7#hi[1], $tmp
 								+	bne	.Lctr32_loop
 								+
 								+.Lctr32_done:
 								+	vldmia	sp!, {d8-d15}
 								+	ldmia	sp!, {r7-r11, pc}	@ return
 								+.size	GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
 								+___
 								+}
 								+
 								+foreach (split("\n",$code)) {	# emit the accumulated $code text one line at a time
 								+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;	# rewrite qN#lo -> d(2N), qN#hi -> d(2N+1)
 								+	print $_,"\n";
 								+}
 								+
 								+close STDOUT;
 								diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
 								new file mode 100755
 								index 0000000..b31bbb8
 								--- /dev/null
 								+++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
@@ -0,0 +1,837 @@
 								+#! /usr/bin/env perl
 								+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
 								+#
 								+# Licensed under the OpenSSL license (the "License").  You may not use
 								+# this file except in compliance with the License.  You can obtain a copy
 								+# in the file LICENSE in the source distribution or at
 								+# https://www.openssl.org/source/license.html
 								+
 								+
 								+######################################################################
 								+## Constant-time SSSE3 AES core implementation.
 								+## version 0.1
 								+##
 								+## By Mike Hamburg (Stanford University), 2009
 								+## Public domain.
 								+##
 								+## For details see http://shiftleft.org/papers/vector_aes/ and
 								+## http://crypto.stanford.edu/vpaes/.
 								+##
 								+######################################################################
 								+# ARMv8 NEON adaptation by <appro@openssl.org>
 								+#
 								+# Reason for undertaken effort is that there is at least one popular
 								+# SoC based on Cortex-A53 that doesn't have crypto extensions.
 								+#
 								+#                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
 								+# Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
 								+# Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
 								+# X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
 								+# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
 								+# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
 								+# Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
 								+#
 								+# (*)	ECB denotes approximate result for parallelizable modes
 								+#	such as CBC decrypt, CTR, etc.;
 								+# (**)	these results are worse than scalar compiler-generated
 								+#	code, but it's constant-time and therefore preferred;
 								+# (***)	presented for reference/comparison purposes;
 								+
 								+$flavour = shift;
 								+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 								+
 								+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 								+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
 								+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
 								+die "can't locate arm-xlate.pl";
 								+
 								+open OUT,"| \"$^X\" $xlate $flavour $output";
 								+*STDOUT=*OUT;
 								+
 								+$code.=<<___;
 								+#include <GFp/arm_arch.h>
 								+
 								+.section	.rodata
 								+
 								+.type	_vpaes_consts,%object
 								+.align	7	// totally strategic alignment
 								+_vpaes_consts:
 								+.Lk_mc_forward:	// mc_forward
 								+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
 								+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
 								+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
 								+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
 								+.Lk_mc_backward:// mc_backward
 								+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
 								+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
 								+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
 								+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
 								+.Lk_sr:		// sr
 								+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
 								+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
 								+	.quad	0x0F060D040B020900, 0x070E050C030A0108
 								+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
 								+
 								+//
 								+// "Hot" constants
 								+//
 								+.Lk_inv:	// inv, inva
 								+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
 								+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
 								+.Lk_ipt:	// input transform (lo, hi)
 								+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
 								+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
 								+.Lk_sbo:	// sbou, sbot
 								+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
 								+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
 								+.Lk_sb1:	// sb1u, sb1t
 								+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
 								+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
 								+.Lk_sb2:	// sb2u, sb2t
 								+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
 								+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
 								+
 								+//
 								+//  Key schedule constants
 								+//
 								+.Lk_dksd:	// decryption key schedule: invskew x*D
 								+	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
 								+	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
 								+.Lk_dksb:	// decryption key schedule: invskew x*B
 								+	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
 								+	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
 								+.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
 								+	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
 								+	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
 								+.Lk_dks9:	// decryption key schedule: invskew x*9
 								+	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
 								+	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
 								+
 								+.Lk_rcon:	// rcon
 								+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
 								+
 								+.Lk_opt:	// output transform
 								+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
 								+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
 								+.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
 								+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
 								+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
 								+
 								+.asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
 								+.size	_vpaes_consts,.-_vpaes_consts
 								+.align	6
 								+
 								+.text
 								+___
 								+
 								+{
 								+my ($inp,$out,$key) = map("x$_",(0..2));
 								+
 								+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
 								+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
 								+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
 								+
 								+$code.=<<___;
 								+##
 								+##  _aes_preheat
 								+##
 								+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
 								+##  and %xmm9-%xmm15 as specified below.
 								+##
 								+.type	_vpaes_encrypt_preheat,%function
 								+.align	4
 								+_vpaes_encrypt_preheat:
 								+	adrp	x10, :pg_hi21:.Lk_inv
 								+	add	x10, x10, :lo12:.Lk_inv
 								+	movi	v17.16b, #0x0f
 								+	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
 								+	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
 								+	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
 								+	ret
 								+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
 								+
 								+##
 								+##  _aes_encrypt_core
 								+##
 								+##  AES-encrypt %xmm0.
 								+##
 								+##  Inputs:
 								+##     %xmm0 = input
 								+##     %xmm9-%xmm15 as in _vpaes_preheat
 								+##    (%rdx) = scheduled keys
 								+##
 								+##  Output in %xmm0
 								+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
 								+##  Preserves %xmm6 - %xmm8 so you get some local vectors
 								+##
 								+##
 								+.type	_vpaes_encrypt_core,%function
 								+.align 4
 								+_vpaes_encrypt_core:
 								+	mov	x9, $key
 								+	ldr	w8, [$key,#240]			// pull rounds
 								+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
 								+	add	x11, x11, :lo12:.Lk_mc_forward+16
 								+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
 								+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
 								+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
 								+	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
 								+	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
 								+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
 								+	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
 								+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
 								+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
 								+	b	.Lenc_entry
 								+
 								+.align 4
 								+.Lenc_loop:
 								+	// middle of middle round
 								+	add	x10, x11, #0x40
 								+	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
 								+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
 								+	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
 								+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
 								+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
 								+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
 								+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
 								+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
 								+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
 								+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
 								+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
 								+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
 								+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
 								+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
 								+	sub	w8, w8, #1			// nr--
 								+
 								+.Lenc_entry:
 								+	// top of round
 								+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
 								+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
 								+	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
 								+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
 								+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
 								+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
 								+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
 								+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
 								+	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
 								+	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
 								+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
 								+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
 								+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
 								+	cbnz	w8, .Lenc_loop
 								+
 								+	// middle of last round
 								+	add	x10, x11, #0x80
 								+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
 								+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
 								+	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
 								+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
 								+	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
 								+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
 								+	ret
 								+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
 								+
 								+.globl	GFp_vpaes_encrypt
 								+.type	GFp_vpaes_encrypt,%function
 								+.align	4
 								+GFp_vpaes_encrypt:
 								+	AARCH64_SIGN_LINK_REGISTER
 								+	stp	x29,x30,[sp,#-16]!
 								+	add	x29,sp,#0
 								+
 								+	ld1	{v7.16b}, [$inp]
 								+	bl	_vpaes_encrypt_preheat
 								+	bl	_vpaes_encrypt_core
 								+	st1	{v0.16b}, [$out]
 								+
 								+	ldp	x29,x30,[sp],#16
 								+	AARCH64_VALIDATE_LINK_REGISTER
 								+	ret
 								+.size	GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
 								+
 								+.type	_vpaes_encrypt_2x,%function
 								+.align 4
 								+_vpaes_encrypt_2x:
 								+	mov	x9, $key
 								+	ldr	w8, [$key,#240]			// pull rounds
 								+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
 								+	add	x11, x11, :lo12:.Lk_mc_forward+16
 								+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
 								+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
 								+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
 								+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
 								+	 and	v9.16b,  v15.16b,  v17.16b
 								+	 ushr	v8.16b,  v15.16b,  #4
 								+	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
 								+	 tbl	v9.16b,  {$iptlo}, v9.16b
 								+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
 								+	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
 								+	 tbl	v10.16b, {$ipthi}, v8.16b
 								+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
 								+	 eor	v8.16b,  v9.16b,   v16.16b
 								+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
 								+	 eor	v8.16b,  v8.16b,   v10.16b
 								+	b	.Lenc_2x_entry
 								+
 								+.align 4
 								+.Lenc_2x_loop:
 								+	// middle of middle round
 								+	add	x10, x11, #0x40
 								+	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
 								+	 tbl	v12.16b, {$sb1t}, v10.16b
 								+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
 								+	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
 								+	 tbl	v8.16b,  {$sb1u}, v11.16b
 								+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	 eor	v12.16b, v12.16b, v16.16b
 								+	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
 								+	 tbl	v13.16b, {$sb2t}, v10.16b
 								+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	 eor	v8.16b,  v8.16b,  v12.16b
 								+	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
 								+	 tbl	v10.16b, {$sb2u}, v11.16b
 								+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
 								+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
 								+	 tbl	v11.16b, {v8.16b}, v1.16b
 								+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
 								+	 eor	v10.16b, v10.16b, v13.16b
 								+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
 								+	 tbl	v8.16b,  {v8.16b}, v4.16b
 								+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
 								+	 eor	v11.16b, v11.16b, v10.16b
 								+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
 								+	 tbl	v12.16b, {v11.16b},v1.16b
 								+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
 								+	 eor	v8.16b,  v8.16b,  v11.16b
 								+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
 								+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
 								+	 eor	v8.16b,  v8.16b,  v12.16b
 								+	sub	w8, w8, #1			// nr--
 								+
 								+.Lenc_2x_entry:
 								+	// top of round
 								+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
 								+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
 								+	 and	v9.16b,  v8.16b, v17.16b
 								+	 ushr	v8.16b,  v8.16b, #4
 								+	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
 								+	 tbl	v13.16b, {$invhi},v9.16b
 								+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
 								+	 eor	v9.16b,  v9.16b,  v8.16b
 								+	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
 								+	 tbl	v11.16b, {$invlo},v8.16b
 								+	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
 								+	 tbl	v12.16b, {$invlo},v9.16b
 								+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
 								+	 eor	v11.16b, v11.16b, v13.16b
 								+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
 								+	 eor	v12.16b, v12.16b, v13.16b
 								+	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
 								+	 tbl	v10.16b, {$invlo},v11.16b
 								+	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
 								+	 tbl	v11.16b, {$invlo},v12.16b
 								+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
 								+	 eor	v10.16b, v10.16b, v9.16b
 								+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
 								+	 eor	v11.16b, v11.16b, v8.16b
 								+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
 								+	cbnz	w8, .Lenc_2x_loop
 								+
 								+	// middle of last round
 								+	add	x10, x11, #0x80
 								+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
 								+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
 								+	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
 								+	 tbl	v12.16b, {$sbou}, v10.16b
 								+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
 								+	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
 								+	 tbl	v8.16b,  {$sbot}, v11.16b
 								+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
 								+	 eor	v12.16b, v12.16b, v16.16b
 								+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
 								+	 eor	v8.16b,  v8.16b,  v12.16b
 								+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
 								+	 tbl	v1.16b,  {v8.16b},v1.16b
 								+	ret
 								+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
 								+___
 								+}
 								+{
 								+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
 								+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
 								+
 								+$code.=<<___;
 								+########################################################
 								+##                                                    ##
 								+##                  AES key schedule                  ##
 								+##                                                    ##
 								+########################################################
 								+.type	_vpaes_key_preheat,%function
 								+.align	4
 								+_vpaes_key_preheat:
 								+	adrp	x10, :pg_hi21:.Lk_inv
 								+	add	x10, x10, :lo12:.Lk_inv
 								+	movi	v16.16b, #0x5b			// .Lk_s63
 								+	adrp	x11, :pg_hi21:.Lk_sb1
 								+	add	x11, x11, :lo12:.Lk_sb1
 								+	movi	v17.16b, #0x0f			// .Lk_s0F
 								+	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
 								+	adrp	x10, :pg_hi21:.Lk_dksd
 								+	add	x10, x10, :lo12:.Lk_dksd
 								+	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
 								+	adrp	x11, :pg_hi21:.Lk_mc_forward
 								+	add	x11, x11, :lo12:.Lk_mc_forward
 								+	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
 								+	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
 								+	ld1	{v8.2d}, [x10]			// .Lk_rcon
 								+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
 								+	ret
 								+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
 								+
 								+.type	_vpaes_schedule_core,%function
 								+.align	4
 								+_vpaes_schedule_core:
 								+	AARCH64_SIGN_LINK_REGISTER
 								+	stp	x29, x30, [sp,#-16]!
 								+	add	x29,sp,#0
 								+
 								+	bl	_vpaes_key_preheat		// load the tables
 								+
 								+	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
 								+
 								+	// input transform
 								+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
 								+	bl	_vpaes_schedule_transform
 								+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
 								+
 								+	adrp	x10, :pg_hi21:.Lk_sr		// lea	.Lk_sr(%rip),%r10
 								+	add	x10, x10, :lo12:.Lk_sr
 								+
 								+	add	x8, x8, x10
 								+
 								+	// encrypting, output zeroth round key after transform
 								+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
 								+
 								+	cmp	$bits, #192			// cmp	\$192,	%esi
 								+	b.hi	.Lschedule_256
 								+	b.eq	.Lschedule_192
 								+	// 128: fall through
 								+
 								+##
 								+##  .schedule_128
 								+##
 								+##  128-bit specific part of key schedule.
 								+##
 								+##  This schedule is really simple, because all its parts
 								+##  are accomplished by the subroutines.
 								+##
 								+.Lschedule_128:
 								+	mov	$inp, #10			// mov	\$10, %esi
 								+
 								+.Loop_schedule_128:
 								+	sub	$inp, $inp, #1			// dec	%esi
 								+	bl 	_vpaes_schedule_round
 								+	cbz 	$inp, .Lschedule_mangle_last
 								+	bl	_vpaes_schedule_mangle		// write output
 								+	b 	.Loop_schedule_128
 								+
 								+##
 								+##  .aes_schedule_192
 								+##
 								+##  192-bit specific part of key schedule.
 								+##
 								+##  The main body of this schedule is the same as the 128-bit
 								+##  schedule, but with more smearing.  The long, high side is
 								+##  stored in %xmm7 as before, and the short, low side is in
 								+##  the high bits of %xmm6.
 								+##
 								+##  This schedule is somewhat nastier, however, because each
 								+##  round produces 192 bits of key material, or 1.5 round keys.
 								+##  Therefore, on each cycle we do 2 rounds and produce 3 round
 								+##  keys.
 								+##
 								+.align	4
 								+.Lschedule_192:
 								+	sub	$inp, $inp, #8
 								+	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
 								+	bl	_vpaes_schedule_transform	// input transform
 								+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
 								+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
 								+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
 								+	mov	$inp, #4			// mov	\$4,	%esi
 								+
 								+.Loop_schedule_192:
 								+	sub	$inp, $inp, #1			// dec	%esi
 								+	bl	_vpaes_schedule_round
 								+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
 								+	bl	_vpaes_schedule_mangle		// save key n
 								+	bl	_vpaes_schedule_192_smear
 								+	bl	_vpaes_schedule_mangle		// save key n+1
 								+	bl	_vpaes_schedule_round
 								+	cbz 	$inp, .Lschedule_mangle_last
 								+	bl	_vpaes_schedule_mangle		// save key n+2
 								+	bl	_vpaes_schedule_192_smear
 								+	b	.Loop_schedule_192
 								+
 								+##
 								+##  .aes_schedule_256
 								+##
 								+##  256-bit specific part of key schedule.
 								+##
 								+##  The structure here is very similar to the 128-bit
 								+##  schedule, but with an additional "low side" in
 								+##  %xmm6.  The low side's rounds are the same as the
 								+##  high side's, except no rcon and no rotation.
 								+##
 								+.align	4
 								+.Lschedule_256:
 								+	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
 								+	bl	_vpaes_schedule_transform	// input transform
 								+	mov	$inp, #7			// mov	\$7, %esi
 								+
 								+.Loop_schedule_256:
 								+	sub	$inp, $inp, #1			// dec	%esi
 								+	bl	_vpaes_schedule_mangle		// output low result
 								+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
 								+
 								+	// high round
 								+	bl	_vpaes_schedule_round
 								+	cbz 	$inp, .Lschedule_mangle_last
 								+	bl	_vpaes_schedule_mangle
 								+
 								+	// low round. swap xmm7 and xmm6
 								+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
 								+	movi	v4.16b, #0
 								+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
 								+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
 								+	bl	_vpaes_schedule_low_round
 								+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
 								+
 								+	b	.Loop_schedule_256
 								+
 								+##
 								+##  .aes_schedule_mangle_last
 								+##
 								+##  Mangler for last round of key schedule
 								+##  Mangles %xmm0
 								+##    when encrypting, outputs out(%xmm0) ^ 63
 								+##    when decrypting, outputs unskew(%xmm0)
 								+##
 								+##  Always called right before return... jumps to cleanup and exits
 								+##
 								+.align	4
 								+.Lschedule_mangle_last:
 								+	// schedule last round key from xmm0
 								+	adrp	x11, :pg_hi21:.Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
 								+	add	x11, x11, :lo12:.Lk_deskew
 								+
 								+	cbnz	$dir, .Lschedule_mangle_last_dec
 								+
 								+	// encrypting
 								+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
 								+	adrp	x11, :pg_hi21:.Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
 								+	add	x11, x11, :lo12:.Lk_opt
 								+	add	$out, $out, #32			// add	\$32,	%rdx
 								+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
 								+
 								+.Lschedule_mangle_last_dec:
 								+	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
 								+	sub	$out, $out, #16			// add	\$-16,	%rdx
 								+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
 								+	bl	_vpaes_schedule_transform	// output transform
 								+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key
 								+
 								+	// cleanup
 								+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
 								+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
 								+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
 								+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
 								+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
 								+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
 								+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
 								+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
 								+	ldp	x29, x30, [sp],#16
 								+	AARCH64_VALIDATE_LINK_REGISTER
 								+	ret
 								+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
 								+
 								+##
 								+##  .aes_schedule_192_smear
 								+##
 								+##  Smear the short, low side in the 192-bit key schedule.
 								+##  (xmm0, xmm6 and xmm7 of the x86-64 original are v0, v6 and v7 here.)
 								+##  Inputs:
 								+##    %xmm7: high side, b  a  x  y
 								+##    %xmm6:  low side, d  c  0  0
 								+##    %xmm13: 0 (x86-64 only; this port zeroes v1 itself instead)
 								+##
 								+##  Outputs:
 								+##    %xmm6: b+c+d  b+c  0  0
 								+##    %xmm0: b+c+d  b+c  b  a
 								+##  Also clobbers v1, which is left as zero.
 								+.type	_vpaes_schedule_192_smear,%function
 								+.align	4
 								+_vpaes_schedule_192_smear:
 								+	movi	v1.16b, #0		// v1 = 0
 								+	dup	v0.4s, v7.s[3]		// every lane of v0 = top 32-bit word of v7
 								+	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
 								+	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
 								+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
 								+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
 								+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
 								+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
 								+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
 								+	ret
 								+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
 								+
 								+##
 								+##  .aes_schedule_round
 								+##
 								+##  Runs one main round of the key schedule on %xmm0, %xmm7
 								+##
 								+##  Specifically, runs subbytes on the high dword of %xmm0
 								+##  then rotates it by one byte and xors into the low dword of
 								+##  %xmm7.
 								+##
 								+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
 								+##  next rcon.
 								+##
 								+##  Smears the dwords of %xmm7 by xoring the low into the
 								+##  second low, result into third, result into highest.
 								+##  The _vpaes_schedule_low_round entry point skips the rcon and rotate steps.
 								+##  Returns results in %xmm7 = %xmm0.
 								+##  Clobbers %xmm1-%xmm4, %r11 (v1-v4 here; x11 is not touched in this port).
 								+##
 								+.type	_vpaes_schedule_round,%function
 								+.align	4
 								+_vpaes_schedule_round:
 								+	// extract rcon from xmm8
 								+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
 								+	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
 								+	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
 								+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
 								+
 								+	// rotate
 								+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
 								+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
 								+
 								+	// fall through...
 								+
 								+	// low round: same as high round, but no rotation and no rcon. v4 must be zero on entry.
 								+_vpaes_schedule_low_round:
 								+	// smear xmm7 (v4 supplies the zero bytes that are shifted in)
 								+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
 								+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
 								+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4
 								+
 								+	// subbytes
 								+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
 								+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
 								+	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
 								+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
 								+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
 								+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
 								+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
 								+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
 								+	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
 								+	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
 								+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
 								+	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
 								+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
 								+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
 								+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
 								+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
 								+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
 								+
 								+	// add in smeared stuff
 								+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
 								+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7 (recomputed from v1 and v7 instead of copied from v0)
 								+	ret
 								+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
 								+
 								+##
 								+##  .aes_schedule_transform
 								+##
 								+##  Linear-transform %xmm0 according to tables at (%r11)
 								+##  In this port the two tables are read from the iptlo and ipthi registers, not loaded here.
 								+##  Requires that %xmm9 = 0x0F0F... as in preheat (the mask is v17 in this port)
 								+##  Output in %xmm0 (v0)
 								+##  Clobbers %xmm1, %xmm2 (v1, v2)
 								+##
 								+.type	_vpaes_schedule_transform,%function
 								+.align	4
 								+_vpaes_schedule_transform:
 								+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
 								+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
 								+						// vmovdqa	(%r11),	%xmm2 	# lo
 								+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
 								+						// vmovdqa	16(%r11),	%xmm1 # hi
 								+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
 								+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
 								+	ret
 								+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
 								+
 								+##
 								+##  .aes_schedule_mangle
 								+##
 								+##  Mangle xmm0 from (basis-transformed) standard version
 								+##  to our version.
 								+##
 								+##  On encrypt,
 								+##    xor with 0x63
 								+##    multiply by circulant 0,1,1,1
 								+##    apply shiftrows transform
 								+##
 								+##  On decrypt (x86-64 original; the code below has no decrypt path),
 								+##    xor with 0x63
 								+##    multiply by "inverse mixcolumns" circulant E,B,D,9
 								+##    deskew
 								+##    apply shiftrows transform
 								+##
 								+##
 								+##  Writes out to (%rdx), and increments or decrements it
 								+##  Keeps track of round number mod 4 in %r8 (x8 here)
 								+##  Preserves xmm0
 								+##  Clobbers xmm1-xmm5 (this port writes only v1, v3 and v4)
 								+##
 								+.type	_vpaes_schedule_mangle,%function
 								+.align	4
 								+_vpaes_schedule_mangle:
 								+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later (v4 is overwritten by the eor below before it is read)
 								+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
 								+
 								+	// encrypting
 								+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
 								+	add	$out, $out, #16			// add	\$16,	%rdx
 								+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
 								+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
 								+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
 								+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
 								+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
 								+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
 								+
 								+.Lschedule_mangle_both:
 								+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
 								+	add	x8, x8, #64-16			// add	\$-16,	%r8
 								+	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
 								+	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
 								+	ret
 								+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
 								+
 								+.globl	GFp_vpaes_set_encrypt_key
 								+.type	GFp_vpaes_set_encrypt_key,%function
 								+.align	4
 								+GFp_vpaes_set_encrypt_key:
 								+	AARCH64_SIGN_LINK_REGISTER
 								+	stp	x29,x30,[sp,#-16]!	// push frame pointer and link register
 								+	add	x29,sp,#0
 								+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
 								+
 								+	lsr	w9, $bits, #5		// shr	\$5,%eax
 								+	add	w9, w9, #5		// \$5,%eax
 								+	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
 								+
 								+	mov	$dir, #0		// mov	\$0,%ecx	# dir 0 = encrypting
 								+	mov	x8, #0x30		// mov	\$0x30,%r8d
 								+	bl	_vpaes_schedule_core	// run the key schedule
 								+	eor	x0, x0, x0		// return 0
 								+
 								+	ldp	d8,d9,[sp],#16
 								+	ldp	x29,x30,[sp],#16
 								+	AARCH64_VALIDATE_LINK_REGISTER
 								+	ret
 								+.size	GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
 								+___
 								+}
 								+{
 								+my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4));	# x0-x4, in the argument order of the prototype below
 								+my ($ctr, $ctr_tmp) = ("w6", "w7");	# 32-bit counter and byte-swap scratch
 								+
 								+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
 								+#                                     const AES_KEY *key, const uint8_t ivec[16]);
 								+$code.=<<___;
 								+.globl	GFp_vpaes_ctr32_encrypt_blocks
 								+.type	GFp_vpaes_ctr32_encrypt_blocks,%function
 								+.align	4
 								+GFp_vpaes_ctr32_encrypt_blocks:
 								+	AARCH64_SIGN_LINK_REGISTER
 								+	stp	x29,x30,[sp,#-16]!
 								+	add	x29,sp,#0
 								+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
 								+	stp	d10,d11,[sp,#-16]!
 								+	stp	d12,d13,[sp,#-16]!
 								+	stp	d14,d15,[sp,#-16]!
 								+
 								+	cbz	$len, .Lctr32_done	// nothing to do for zero blocks
 								+
 								+	// Note, unlike the other functions, $len here is measured in blocks,
 								+	// not bytes.
 								+	mov	x17, $len	// x17 = number of blocks remaining
 								+	mov	x2,  $key	// x2 = key pointer (the block count was in x2 and now lives in x17)
 								+
 								+	// Load the IV and counter portion.
 								+	ldr	$ctr, [$ivec, #12]
 								+	ld1	{v7.16b}, [$ivec]
 								+
 								+	bl	_vpaes_encrypt_preheat
 								+	tst	x17, #1			// is the block count odd
 								+	rev	$ctr, $ctr		// The counter is big-endian.
 								+	b.eq	.Lctr32_prep_loop	// even: go straight to the two-block path
 								+
 								+	// Handle one block so the remaining block count is even for
 								+	// _vpaes_encrypt_2x.
 								+	ld1	{v6.16b}, [$inp], #16	// Load input ahead of time
 								+	bl	_vpaes_encrypt_core
 								+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
 								+	st1	{v0.16b}, [$out], #16
 								+	subs	x17, x17, #1
 								+	// Update the counter.
 								+	add	$ctr, $ctr, #1
 								+	rev	$ctr_tmp, $ctr
 								+	mov	v7.s[3], $ctr_tmp	// big-endian counter goes in the last word of the block
 								+	b.ls	.Lctr32_done		// flags are still from the subs above: stop if no blocks remain
 								+
 								+.Lctr32_prep_loop:
 								+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
 								+	// uses v14 and v15.
 								+	mov	v15.16b, v7.16b
 								+	mov	v14.16b, v7.16b
 								+	add	$ctr, $ctr, #1
 								+	rev	$ctr_tmp, $ctr
 								+	mov	v15.s[3], $ctr_tmp
 								+
 								+.Lctr32_loop:
 								+	ld1	{v6.16b,v7.16b}, [$inp], #32	// Load input ahead of time
 								+	bl	_vpaes_encrypt_2x
 								+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
 								+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
 								+	st1	{v0.16b,v1.16b}, [$out], #32
 								+	subs	x17, x17, #2
 								+	// Update the counter.
 								+	add	$ctr_tmp, $ctr, #1
 								+	add	$ctr, $ctr, #2
 								+	rev	$ctr_tmp, $ctr_tmp
 								+	mov	v14.s[3], $ctr_tmp
 								+	rev	$ctr_tmp, $ctr
 								+	mov	v15.s[3], $ctr_tmp
 								+	b.hi	.Lctr32_loop		// flags are still from the subs above: repeat while blocks remain
 								+
 								+.Lctr32_done:
 								+	ldp	d14,d15,[sp],#16
 								+	ldp	d12,d13,[sp],#16
 								+	ldp	d10,d11,[sp],#16
 								+	ldp	d8,d9,[sp],#16
 								+	ldp	x29,x30,[sp],#16
 								+	AARCH64_VALIDATE_LINK_REGISTER
 								+	ret
 								+.size	GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
 								+___
 								+}
 								+
 								+print $code;	# emit the accumulated assembly text
 								+
 								+close STDOUT or die "error closing STDOUT";
 								diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
 								new file mode 100644
 								index 0000000..7e52ad6
 								--- /dev/null
 								+++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
@@ -0,0 +1,294 @@
 								+#! /usr/bin/env perl
 								+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
 								+#
 								+# Licensed under the OpenSSL license (the "License").  You may not use
 								+# this file except in compliance with the License.  You can obtain a copy
 								+# in the file LICENSE in the source distribution or at
 								+# https://www.openssl.org/source/license.html
 								+
 								+# ====================================================================
 								+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 								+# project. The module is, however, dual licensed under OpenSSL and
 								+# CRYPTOGAMS licenses depending on where you obtain it. For further
 								+# details see http://www.openssl.org/~appro/cryptogams/.
 								+# ====================================================================
 								+
 								+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
 								+# implements the multiplication algorithm described in:
 								+#
 								+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 								+# Polynomial Multiplication on ARM Processors using the NEON Engine.
 								+#
 								+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 								+#
 								+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
 								+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
 								+# NEON, the low and high halves of the 128-bit register q0 are accessible as
 								+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
 								+# vN. Where the 32-bit version would use the upper half, this file must keep
 								+# halves in separate registers.
 								+#
 								+# The other distinction is in syntax. 32-bit NEON embeds lane information in the
 								+# instruction name, while AArch64 uses suffixes on the registers. For instance,
 								+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
 								+#
 								+#     vshl.i64 q0, q0, #1
 								+#
 								+# in 64-bit, it would be written:
 								+#
 								+#     shl v0.2d, v0.2d, #1
 								+#
 								+# See Programmer's Guide for ARMv8-A, section 7 for details.
 								+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
 								+#
 								+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
 								+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
 								+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
 								+# polynomial and is conditioned on the PMULL extension. This file emulates the
 								+# latter with the former.
 								+
 								+use strict;
 								+
 								+my $flavour = shift;	# first argument: the flavour, or the output file if it looks like name.ext
 								+my $output;
 								+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
 								+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }	# skip arguments until one looks like a file name
 								+
 								+if ($flavour && $flavour ne "void") {
 								+    $0 =~ m/(.*[\/\\])[^\/\\]+$/;	# $1 = directory part of this script's path
 								+    my $dir = $1;
 								+    my $xlate;
 								+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
 								+    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
 								+    die "can't locate arm-xlate.pl";
 								+
 								+    open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";	# pipe the generated code through the translator; fail loudly if the pipe cannot be created
 								+    *STDOUT=*OUT;
 								+} else {
 								+    open OUT,">$output" or die "can't open $output: $!";	# no flavour: write the untranslated code; report the real cause instead of a later close failure
 								+    *STDOUT=*OUT;
 								+}
 								+
 								+my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block
 								+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));	# v0-v4: low/middle/high products and the two input halves
 								+my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));	# v5-v7: low half of H, high half of H, and their XOR
 								+# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
 								+# to spare.
 								+my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));	# v16-v19: scratch used by clmul64x64 and the reduction
 								+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));	# v20-v23: paired low/high halves of the scratch registers
 								+my ($k48_k32, $k16_k0) = map("v$_", (24..25));	# v24-v25: mask pairs loaded from .Lmasks
 								+
 								+my $code = "";	# accumulates the generated assembly
 								+
 								+# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
 								+# must be distinct from $t* and $k*. $t* (including the paired $t0l_t1l ... $t2h_t3h) are clobbered by the emitted code.
 								+sub clmul64x64 {
 								+my ($r, $a, $b) = @_;	# register names without a lane suffix: result, then the two operands
 								+$code .= <<___;
 								+	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
 								+	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
 								+	ext	$r.8b, $b.8b, $b.8b, #1		// B1
 								+	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
 								+	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
 								+	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
 								+	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
 								+	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
 								+	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
 								+	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
 								+	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
 								+	ext	$r.8b, $b.8b, $b.8b, #3		// B3
 								+	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
 								+	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3
 								+
 								+	// Here we diverge from the 32-bit version. It computes the following
 								+	// (instructions reordered for clarity):
 								+	//
 								+	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
 								+	//     vand	\$t0#hi, \$t0#hi, \$k48
 								+	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
 								+	//
 								+	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
 								+	//     vand	\$t1#hi, \$t1#hi, \$k32
 								+	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
 								+	//
 								+	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
 								+	//     vand	\$t2#hi, \$t2#hi, \$k16
 								+	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
 								+	//
 								+	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
 								+	//     vmov.i64	\$t3#hi, #0
 								+	//
 								+	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
 								+	// upper halves of SIMD registers, so we must split each half into
 								+	// separate registers. To compensate, we pair computations up and
 								+	// parallelize.
 								+
 								+	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
 								+	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
 								+	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4
 								+
 								+	// This can probably be scheduled more efficiently. For now, we just
 								+	// pair up independent instructions.
 								+	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d	// low halves of t0 and t1
 								+	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d	// low halves of t2 and t3
 								+	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d	// high halves of t0 and t1
 								+	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d	// high halves of t2 and t3
 								+	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b	// lo ^= hi
 								+	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
 								+	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b	// mask the high halves with k48 and k32
 								+	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b	// mask the high halves with k16 and k0
 								+	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b	// lo ^= masked hi
 								+	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
 								+	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d	// split the pairs back into t0..t3
 								+	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
 								+	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
 								+	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
 								+
 								+	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
 								+	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
 								+	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
 								+	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
 								+	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
 								+	eor	$t0.16b, $t0.16b, $t1.16b
 								+	eor	$t2.16b, $t2.16b, $t3.16b
 								+	eor	$r.16b, $r.16b, $t0.16b
 								+	eor	$r.16b, $r.16b, $t2.16b
 								+___
 								+}
 								+
 								+$code .= <<___;
 								+#include <GFp/arm_arch.h>
 								+
 								+.text
 								+
 								+.global	GFp_gcm_init_neon
 								+.type	GFp_gcm_init_neon,%function
 								+.align	4
 								+GFp_gcm_init_neon:
 								+	AARCH64_VALID_CALL_TARGET
 								+	// This function is adapted from gcm_init_v8. xC2 is t3. H is read from x1 and the result is stored at x0.
 								+	ld1	{$t1.2d}, [x1]			// load H
 								+	movi	$t3.16b, #0xe1
 								+	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
 								+	ext	$INlo.16b, $t1.16b, $t1.16b, #8	// swap the two 64-bit halves of H
 								+	ushr	$t2.2d, $t3.2d, #63
 								+	dup	$t1.4s, $t1.s[1]
 								+	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
 								+	ushr	$t2.2d, $INlo.2d, #63
 								+	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
 								+	and	$t2.16b, $t2.16b, $t0.16b
 								+	shl	$INlo.2d, $INlo.2d, #1
 								+	ext	$t2.16b, $t2.16b, $t2.16b, #8
 								+	and	$t0.16b, $t0.16b, $t1.16b
 								+	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
 								+	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
 								+	st1	{$Hlo.2d}, [x0]			// store Htable[0]
 								+	ret
 								+.size	GFp_gcm_init_neon,.-GFp_gcm_init_neon
 								+
 								+.global	GFp_gcm_gmult_neon
 								+.type	GFp_gcm_gmult_neon,%function
 								+.align	4
 								+GFp_gcm_gmult_neon:
 								+	AARCH64_VALID_CALL_TARGET
 								+	ld1	{$INlo.16b}, [$Xi]		// load Xi (into INlo, because the input load and xor of the ghash loop are skipped)
 								+	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
 								+	ld1	{$Hhi.1d}, [$Htbl]
 								+	adrp	x9, :pg_hi21:.Lmasks		// load constants
 								+	add	x9, x9, :lo12:.Lmasks
 								+	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
 								+	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
 								+	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
 								+	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing
 								+
 								+	mov	$len, #16			// one 16-byte block, so the shared body below runs exactly once
 								+	b	.Lgmult_neon			// continue in the multiply body of GFp_gcm_ghash_neon
 								+.size	GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon
 								+
 								+.global	GFp_gcm_ghash_neon
 								+.type	GFp_gcm_ghash_neon,%function
 								+.align	4
 								+GFp_gcm_ghash_neon:
 								+	AARCH64_VALID_CALL_TARGET
 								+	ld1	{$Xl.16b}, [$Xi]		// load Xi
 								+	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
 								+	ld1	{$Hhi.1d}, [$Htbl]
 								+	adrp	x9, :pg_hi21:.Lmasks		// load constants
 								+	add	x9, x9, :lo12:.Lmasks
 								+	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
 								+	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
 								+	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
 								+	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing
 								+
 								+.Loop_neon:
 								+	ld1	{$INlo.16b}, [$inp], #16	// load inp
 								+	rev64	$INlo.16b, $INlo.16b		// byteswap inp
 								+	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
 								+	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi
 								+
 								+.Lgmult_neon:
 								+	// Split the input into $INlo and $INhi. (The upper halves are unused,
 								+	// so it is okay to leave them alone.)
 								+	ins	$INhi.d[0], $INlo.d[1]		// low half of INhi = high half of INlo
 								+___
 								+&clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo
 								+$code .= <<___;
 								+	eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing
 								+___
 								+&clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
 								+&clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi
 								+$code .= <<___;
 								+	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
 								+	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
 								+	eor	$Xm.16b, $Xm.16b, $Xh.16b
 								+	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
 								+	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
 								+	// This is a no-op due to the ins instruction below.
 								+	// ins	$Xh.d[0], $Xm.d[1]
 								+
 								+	// equivalent of reduction_avx from ghash-x86_64.pl
 								+	shl	$t1.2d, $Xl.2d, #57		// 1st phase
 								+	shl	$t2.2d, $Xl.2d, #62
 								+	eor	$t2.16b, $t2.16b, $t1.16b	//
 								+	shl	$t1.2d, $Xl.2d, #63
 								+	eor	$t2.16b, $t2.16b, $t1.16b	//
 								+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
 								+	eor	$t2.16b, $t2.16b, $Xm.16b
 								+	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
 								+	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]
 								+
 								+	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
 								+	eor	$Xh.16b, $Xh.16b,$Xl.16b
 								+	eor	$Xl.16b, $Xl.16b,$t2.16b	//
 								+	ushr	$t2.2d, $t2.2d, #6
 								+	ushr	$Xl.2d, $Xl.2d, #1		//
 								+	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
 								+	eor	$Xl.16b, $Xl.16b, $t2.16b	//
 								+
 								+	subs	$len, $len, #16			// one 16-byte block consumed
 								+	bne	.Loop_neon			// repeat until the length reaches zero (assumes a nonzero multiple of 16 - TODO confirm with callers)
 								+
 								+	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
 								+	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
 								+	st1	{$Xl.16b}, [$Xi]
 								+
 								+	ret
 								+.size	GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
 								+
 								+.section	.rodata
 								+.align	4
 								+.Lmasks:
 								+.quad	0x0000ffffffffffff	// k48
 								+.quad	0x00000000ffffffff	// k32
 								+.quad	0x000000000000ffff	// k16
 								+.quad	0x0000000000000000	// k0
 								+.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
 								+.align  2
 								+___
 								+
 								+foreach (split("\n",$code)) {	# post-process the generated text one line at a time
 								+	s/\`([^\`]*)\`/eval $1/geo;	# replace each backtick-quoted span with the result of evaluating it as Perl
 								+
 								+	print $_,"\n";
 								+}
 								+close STDOUT or die "error closing STDOUT"; # enforce flush
 								--
 								Efraim Flashner   <efraim@flashner.co.il>   רנשלפ םירפא
 								GPG key = A28B F40C 3E55 1372 662D  14F7 41AA E7DC CA3D 8351
 								Confidentiality cannot be guaranteed on emails sent or received unencrypted