2294 lines
78 KiB
Diff
2294 lines
78 KiB
Diff
|
These 4 files exist in the git repository for rust-ring, and are from
|
|||
|
the same commit as the 0.16.20 release. They were not added to the
|
|||
|
include list in Cargo.toml, so they were not added to the tarball.
|
|||
|
|
|||
|
---
|
|||
|
crypto/curve25519/make_curve25519_tables.py | 222 +++++
|
|||
|
crypto/fipsmodule/aes/asm/vpaes-armv7.pl | 896 ++++++++++++++++++
|
|||
|
crypto/fipsmodule/aes/asm/vpaes-armv8.pl | 837 ++++++++++++++++
|
|||
|
.../fipsmodule/modes/asm/ghash-neon-armv8.pl | 294 ++++++
|
|||
|
4 files changed, 2249 insertions(+)
|
|||
|
create mode 100755 crypto/curve25519/make_curve25519_tables.py
|
|||
|
create mode 100644 crypto/fipsmodule/aes/asm/vpaes-armv7.pl
|
|||
|
create mode 100755 crypto/fipsmodule/aes/asm/vpaes-armv8.pl
|
|||
|
create mode 100644 crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
|
|||
|
|
|||
|
diff --git a/crypto/curve25519/make_curve25519_tables.py b/crypto/curve25519/make_curve25519_tables.py
|
|||
|
new file mode 100755
|
|||
|
index 0000000..50dee2a
|
|||
|
--- /dev/null
|
|||
|
+++ b/crypto/curve25519/make_curve25519_tables.py
|
|||
|
@@ -0,0 +1,222 @@
|
|||
|
+#!/usr/bin/env python
|
|||
|
+# coding=utf-8
|
|||
|
+# Copyright (c) 2020, Google Inc.
|
|||
|
+#
|
|||
|
+# Permission to use, copy, modify, and/or distribute this software for any
|
|||
|
+# purpose with or without fee is hereby granted, provided that the above
|
|||
|
+# copyright notice and this permission notice appear in all copies.
|
|||
|
+#
|
|||
|
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|||
|
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|||
|
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|||
|
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|||
|
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
|||
|
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
|||
|
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|||
|
+
|
|||
|
+import StringIO
|
|||
|
+import subprocess
|
|||
|
+
|
|||
|
+# Base field Z_p
|
|||
|
+p = 2**255 - 19
|
|||
|
+
|
|||
|
+def modp_inv(x):
|
|||
|
+ return pow(x, p-2, p)
|
|||
|
+
|
|||
|
+# Square root of -1
|
|||
|
+modp_sqrt_m1 = pow(2, (p-1) // 4, p)
|
|||
|
+
|
|||
|
+# Compute corresponding x-coordinate, with low bit corresponding to
|
|||
|
+# sign, or return None on failure
|
|||
|
+def recover_x(y, sign):
|
|||
|
+ if y >= p:
|
|||
|
+ return None
|
|||
|
+ x2 = (y*y-1) * modp_inv(d*y*y+1)
|
|||
|
+ if x2 == 0:
|
|||
|
+ if sign:
|
|||
|
+ return None
|
|||
|
+ else:
|
|||
|
+ return 0
|
|||
|
+
|
|||
|
+ # Compute square root of x2
|
|||
|
+ x = pow(x2, (p+3) // 8, p)
|
|||
|
+ if (x*x - x2) % p != 0:
|
|||
|
+ x = x * modp_sqrt_m1 % p
|
|||
|
+ if (x*x - x2) % p != 0:
|
|||
|
+ return None
|
|||
|
+
|
|||
|
+ if (x & 1) != sign:
|
|||
|
+ x = p - x
|
|||
|
+ return x
|
|||
|
+
|
|||
|
+# Curve constant
|
|||
|
+d = -121665 * modp_inv(121666) % p
|
|||
|
+
|
|||
|
+# Base point
|
|||
|
+g_y = 4 * modp_inv(5) % p
|
|||
|
+g_x = recover_x(g_y, 0)
|
|||
|
+
|
|||
|
+# Points are represented as affine tuples (x, y).
|
|||
|
+
|
|||
|
+def point_add(P, Q):
|
|||
|
+ x1, y1 = P
|
|||
|
+ x2, y2 = Q
|
|||
|
+ x3 = ((x1*y2 + y1*x2) * modp_inv(1 + d*x1*x2*y1*y2)) % p
|
|||
|
+ y3 = ((y1*y2 + x1*x2) * modp_inv(1 - d*x1*x2*y1*y2)) % p
|
|||
|
+ return (x3, y3)
|
|||
|
+
|
|||
|
+# Computes Q = s * P
|
|||
|
+def point_mul(s, P):
|
|||
|
+ Q = (0, 1) # Neutral element
|
|||
|
+ while s > 0:
|
|||
|
+ if s & 1:
|
|||
|
+ Q = point_add(Q, P)
|
|||
|
+ P = point_add(P, P)
|
|||
|
+ s >>= 1
|
|||
|
+ return Q
|
|||
|
+
|
|||
|
+def to_bytes(x):
|
|||
|
+ ret = bytearray(32)
|
|||
|
+ for i in range(len(ret)):
|
|||
|
+ ret[i] = x % 256
|
|||
|
+ x >>= 8
|
|||
|
+ assert x == 0
|
|||
|
+ return ret
|
|||
|
+
|
|||
|
+def to_ge_precomp(P):
|
|||
|
+ # typedef struct {
|
|||
|
+ # fe_loose yplusx;
|
|||
|
+ # fe_loose yminusx;
|
|||
|
+ # fe_loose xy2d;
|
|||
|
+ # } ge_precomp;
|
|||
|
+ x, y = P
|
|||
|
+ return ((y + x) % p, (y - x) % p, (x * y * 2 * d) % p)
|
|||
|
+
|
|||
|
+def to_base_25_5(x):
|
|||
|
+ limbs = (26, 25, 26, 25, 26, 25, 26, 25, 26, 25)
|
|||
|
+ ret = []
|
|||
|
+ for l in limbs:
|
|||
|
+ ret.append(x & ((1<<l) - 1))
|
|||
|
+ x >>= l
|
|||
|
+ assert x == 0
|
|||
|
+ return ret
|
|||
|
+
|
|||
|
+def to_base_51(x):
|
|||
|
+ ret = []
|
|||
|
+ for _ in range(5):
|
|||
|
+ ret.append(x & ((1<<51) - 1))
|
|||
|
+ x >>= 51
|
|||
|
+ assert x == 0
|
|||
|
+ return ret
|
|||
|
+
|
|||
|
+def to_literal(x):
|
|||
|
+ ret = "{{\n#if defined(BORINGSSL_CURVE25519_64BIT)\n"
|
|||
|
+ ret += ", ".join(map(str, to_base_51(x)))
|
|||
|
+ ret += "\n#else\n"
|
|||
|
+ ret += ", ".join(map(str, to_base_25_5(x)))
|
|||
|
+ ret += "\n#endif\n}}"
|
|||
|
+ return ret
|
|||
|
+
|
|||
|
+def main():
|
|||
|
+ d2 = (2 * d) % p
|
|||
|
+
|
|||
|
+ small_precomp = bytearray()
|
|||
|
+ for i in range(1, 16):
|
|||
|
+ s = (i&1) | ((i&2) << (64-1)) | ((i&4) << (128-2)) | ((i&8) << (192-3))
|
|||
|
+ P = point_mul(s, (g_x, g_y))
|
|||
|
+ small_precomp += to_bytes(P[0])
|
|||
|
+ small_precomp += to_bytes(P[1])
|
|||
|
+
|
|||
|
+ large_precomp = []
|
|||
|
+ for i in range(32):
|
|||
|
+ large_precomp.append([])
|
|||
|
+ for j in range(8):
|
|||
|
+ P = point_mul((j + 1) << (i * 8), (g_x, g_y))
|
|||
|
+ large_precomp[-1].append(to_ge_precomp(P))
|
|||
|
+
|
|||
|
+ bi_precomp = []
|
|||
|
+ for i in range(8):
|
|||
|
+ P = point_mul(2*i + 1, (g_x, g_y))
|
|||
|
+ bi_precomp.append(to_ge_precomp(P))
|
|||
|
+
|
|||
|
+
|
|||
|
+ buf = StringIO.StringIO()
|
|||
|
+ buf.write("""/* Copyright (c) 2020, Google Inc.
|
|||
|
+ *
|
|||
|
+ * Permission to use, copy, modify, and/or distribute this software for any
|
|||
|
+ * purpose with or without fee is hereby granted, provided that the above
|
|||
|
+ * copyright notice and this permission notice appear in all copies.
|
|||
|
+ *
|
|||
|
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|||
|
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|||
|
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|||
|
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|||
|
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
|||
|
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
|||
|
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
|||
|
+
|
|||
|
+// This file is generated from
|
|||
|
+// ./make_curve25519_tables.py > curve25519_tables.h
|
|||
|
+
|
|||
|
+
|
|||
|
+static const fe d = """)
|
|||
|
+ buf.write(to_literal(d))
|
|||
|
+ buf.write(""";
|
|||
|
+
|
|||
|
+static const fe sqrtm1 = """)
|
|||
|
+ buf.write(to_literal(modp_sqrt_m1))
|
|||
|
+ buf.write(""";
|
|||
|
+
|
|||
|
+static const fe d2 = """)
|
|||
|
+ buf.write(to_literal(d2))
|
|||
|
+ buf.write(""";
|
|||
|
+
|
|||
|
+#if defined(OPENSSL_SMALL)
|
|||
|
+
|
|||
|
+// This block of code replaces the standard base-point table with a much smaller
|
|||
|
+// one. The standard table is 30,720 bytes while this one is just 960.
|
|||
|
+//
|
|||
|
+// This table contains 15 pairs of group elements, (x, y), where each field
|
|||
|
+// element is serialised with |fe_tobytes|. If |i| is the index of the group
|
|||
|
+// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀
|
|||
|
+// is the most significant bit). The value of the group element is then:
|
|||
|
+// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator.
|
|||
|
+static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""")
|
|||
|
+ for i, b in enumerate(small_precomp):
|
|||
|
+ buf.write("0x%02x, " % b)
|
|||
|
+ buf.write("""
|
|||
|
+};
|
|||
|
+
|
|||
|
+#else
|
|||
|
+
|
|||
|
+// k25519Precomp[i][j] = (j+1)*256^i*B
|
|||
|
+static const ge_precomp k25519Precomp[32][8] = {
|
|||
|
+""")
|
|||
|
+ for child in large_precomp:
|
|||
|
+ buf.write("{\n")
|
|||
|
+ for val in child:
|
|||
|
+ buf.write("{\n")
|
|||
|
+ for term in val:
|
|||
|
+ buf.write(to_literal(term) + ",\n")
|
|||
|
+ buf.write("},\n")
|
|||
|
+ buf.write("},\n")
|
|||
|
+ buf.write("""};
|
|||
|
+
|
|||
|
+#endif // OPENSSL_SMALL
|
|||
|
+
|
|||
|
+// Bi[i] = (2*i+1)*B
|
|||
|
+static const ge_precomp Bi[8] = {
|
|||
|
+""")
|
|||
|
+ for val in bi_precomp:
|
|||
|
+ buf.write("{\n")
|
|||
|
+ for term in val:
|
|||
|
+ buf.write(to_literal(term) + ",\n")
|
|||
|
+ buf.write("},\n")
|
|||
|
+ buf.write("""};
|
|||
|
+""")
|
|||
|
+
|
|||
|
+ proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE)
|
|||
|
+ proc.communicate(buf.getvalue())
|
|||
|
+
|
|||
|
+if __name__ == "__main__":
|
|||
|
+ main()
|
|||
|
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
|
|||
|
new file mode 100644
|
|||
|
index 0000000..d36a97a
|
|||
|
--- /dev/null
|
|||
|
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
|
|||
|
@@ -0,0 +1,896 @@
|
|||
|
+#! /usr/bin/env perl
|
|||
|
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|||
|
+#
|
|||
|
+# Licensed under the OpenSSL license (the "License"). You may not use
|
|||
|
+# this file except in compliance with the License. You can obtain a copy
|
|||
|
+# in the file LICENSE in the source distribution or at
|
|||
|
+# https://www.openssl.org/source/license.html
|
|||
|
+
|
|||
|
+
|
|||
|
+######################################################################
|
|||
|
+## Constant-time SSSE3 AES core implementation.
|
|||
|
+## version 0.1
|
|||
|
+##
|
|||
|
+## By Mike Hamburg (Stanford University), 2009
|
|||
|
+## Public domain.
|
|||
|
+##
|
|||
|
+## For details see http://shiftleft.org/papers/vector_aes/ and
|
|||
|
+## http://crypto.stanford.edu/vpaes/.
|
|||
|
+##
|
|||
|
+######################################################################
|
|||
|
+# Adapted from the original x86_64 version and <appro@openssl.org>'s ARMv8
|
|||
|
+# version.
|
|||
|
+#
|
|||
|
+# armv7, aarch64, and x86_64 differ in several ways:
|
|||
|
+#
|
|||
|
+# * x86_64 SSSE3 instructions are two-address (destination operand is also a
|
|||
|
+# source), while NEON is three-address (destination operand is separate from
|
|||
|
+# two sources).
|
|||
|
+#
|
|||
|
+# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16.
|
|||
|
+#
|
|||
|
+# * x86_64 instructions can take memory references, while ARM is a load/store
|
|||
|
+# architecture. This means we sometimes need a spare register.
|
|||
|
+#
|
|||
|
+# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb),
|
|||
|
+# while armv7 only has a 64-bit byte shuffle (vtbl).
|
|||
|
+#
|
|||
|
+# This means this armv7 version must be a mix of both aarch64 and x86_64
|
|||
|
+# implementations. armv7 and aarch64 have analogous SIMD instructions, so we
|
|||
|
+# base the instructions on aarch64. However, we cannot use aarch64's register
|
|||
|
+# allocation. x86_64's register count matches, but x86_64 is two-address.
|
|||
|
+# vpaes-armv8.pl already accounts for this in the comments, which use
|
|||
|
+# three-address AVX instructions instead of the original SSSE3 ones. We base
|
|||
|
+# register usage on these comments, which are preserved in this file.
|
|||
|
+#
|
|||
|
+# This means we do not use separate input and output registers as in aarch64 and
|
|||
|
+# cannot pin as many constants in the preheat functions. However, the load/store
|
|||
|
+# architecture means we must still deviate from x86_64 in places.
|
|||
|
+#
|
|||
|
+# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source
|
|||
|
+# and destination and 128-bit table. Fortunately, armv7 also allows addressing
|
|||
|
+# upper and lower halves of each 128-bit register. The lower half of q{N} is
|
|||
|
+# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent
|
|||
|
+# instruction,
|
|||
|
+#
|
|||
|
+# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0.
|
|||
|
+#
|
|||
|
+# we write:
|
|||
|
+#
|
|||
|
+# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0.
|
|||
|
+# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1.
|
|||
|
+#
|
|||
|
+# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively, and
|
|||
|
+# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note,
|
|||
|
+# however, that destination (q0) and table (q1) registers may no longer match.
|
|||
|
+# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
|
|||
|
+# two-address pshufb always matched these operands, so this is common.)
|
|||
|
+#
|
|||
|
+# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
|
|||
|
+# expands to an ADD or SUB of the pc register to find an address. That immediate
|
|||
|
+# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
|
|||
|
+# This means larger values must be more aligned.
|
|||
|
+#
|
|||
|
+# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
|
|||
|
+# use either encoding (do we actually need to support this?). In ARM mode, the
|
|||
|
+# distances get large enough to require 16-byte alignment. Moving constants
|
|||
|
+# closer to their use resolves most of this, but common constants in
|
|||
|
+# _vpaes_consts are used by the whole file. Affected ADR instructions must be
|
|||
|
+# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
|
|||
|
+# constraint have been commented.
|
|||
|
+#
|
|||
|
+# For details on ARM's immediate value encoding scheme, see
|
|||
|
+# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
|
|||
|
+#
|
|||
|
+# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
|
|||
|
+#
|
|||
|
+# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
|
|||
|
+#
|
|||
|
+# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones).
|
|||
|
+# aarch64 names registers like v0, and denotes half-width operations in an
|
|||
|
+# instruction suffix (see below).
|
|||
|
+#
|
|||
|
+# * aarch64 embeds size and lane information in register suffixes. v0.16b is
|
|||
|
+# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s.
|
|||
|
+# armv7 embeds the total size in the register name (see above) and the size of
|
|||
|
+# each element in an instruction suffix, which may look like vmov.i8,
|
|||
|
+# vshr.u8, or vtbl.8, depending on instruction.
|
|||
|
+
|
|||
|
+use strict;
|
|||
|
+
|
|||
|
+my $flavour = shift;
|
|||
|
+my $output;
|
|||
|
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
|||
|
+
|
|||
|
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
|
|||
|
+my $dir=$1;
|
|||
|
+my $xlate;
|
|||
|
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|||
|
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|||
|
+die "can't locate arm-xlate.pl";
|
|||
|
+
|
|||
|
+open OUT,"| \"$^X\" $xlate $flavour $output";
|
|||
|
+*STDOUT=*OUT;
|
|||
|
+
|
|||
|
+my $code = "";
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+.syntax unified
|
|||
|
+
|
|||
|
+.arch armv7-a
|
|||
|
+.fpu neon
|
|||
|
+
|
|||
|
+#if defined(__thumb2__)
|
|||
|
+.thumb
|
|||
|
+#else
|
|||
|
+.code 32
|
|||
|
+#endif
|
|||
|
+
|
|||
|
+.text
|
|||
|
+
|
|||
|
+.type _vpaes_consts,%object
|
|||
|
+.align 7 @ totally strategic alignment
|
|||
|
+_vpaes_consts:
|
|||
|
+.Lk_mc_forward: @ mc_forward
|
|||
|
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
|
|||
|
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
|
|||
|
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
|
|||
|
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
|
|||
|
+.Lk_mc_backward:@ mc_backward
|
|||
|
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
|
|||
|
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
|
|||
|
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
|
|||
|
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
|
|||
|
+.Lk_sr: @ sr
|
|||
|
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
|
|||
|
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
|
|||
|
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
|
|||
|
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
|
|||
|
+
|
|||
|
+@
|
|||
|
+@ "Hot" constants
|
|||
|
+@
|
|||
|
+.Lk_inv: @ inv, inva
|
|||
|
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
|
|||
|
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
|
|||
|
+.Lk_ipt: @ input transform (lo, hi)
|
|||
|
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
|
|||
|
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
|
|||
|
+.Lk_sbo: @ sbou, sbot
|
|||
|
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
|
|||
|
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
|
|||
|
+.Lk_sb1: @ sb1u, sb1t
|
|||
|
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
|
|||
|
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
|
|||
|
+.Lk_sb2: @ sb2u, sb2t
|
|||
|
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
|
|||
|
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
|
|||
|
+
|
|||
|
+.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
|
|||
|
+.size _vpaes_consts,.-_vpaes_consts
|
|||
|
+.align 6
|
|||
|
+___
|
|||
|
+
|
|||
|
+{
|
|||
|
+my ($inp,$out,$key) = map("r$_", (0..2));
|
|||
|
+
|
|||
|
+my ($invlo,$invhi) = map("q$_", (10..11));
|
|||
|
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15));
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+@@
|
|||
|
+@@ _vpaes_preheat
|
|||
|
+@@
|
|||
|
+@@ Fills q9-q15 as specified below.
|
|||
|
+@@
|
|||
|
+.type _vpaes_preheat,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_preheat:
|
|||
|
+ adr r10, .Lk_inv
|
|||
|
+ vmov.i8 q9, #0x0f @ .Lk_s0F
|
|||
|
+ vld1.64 {q10,q11}, [r10]! @ .Lk_inv
|
|||
|
+ add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
|
|||
|
+ vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
|
|||
|
+ vld1.64 {q14,q15}, [r10] @ .Lk_sb2
|
|||
|
+ bx lr
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ _vpaes_encrypt_core
|
|||
|
+@@
|
|||
|
+@@ AES-encrypt q0.
|
|||
|
+@@
|
|||
|
+@@ Inputs:
|
|||
|
+@@ q0 = input
|
|||
|
+@@ q9-q15 as in _vpaes_preheat
|
|||
|
+@@ [$key] = scheduled keys
|
|||
|
+@@
|
|||
|
+@@ Output in q0
|
|||
|
+@@ Clobbers q1-q5, r8-r11
|
|||
|
+@@ Preserves q6-q8 so you get some local vectors
|
|||
|
+@@
|
|||
|
+@@
|
|||
|
+.type _vpaes_encrypt_core,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_encrypt_core:
|
|||
|
+ mov r9, $key
|
|||
|
+ ldr r8, [$key,#240] @ pull rounds
|
|||
|
+ adr r11, .Lk_ipt
|
|||
|
+ @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
|
|||
|
+ @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
|
|||
|
+ vld1.64 {q2, q3}, [r11]
|
|||
|
+ adr r11, .Lk_mc_forward+16
|
|||
|
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
|
|||
|
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
|
|||
|
+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0
|
|||
|
+ vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1
|
|||
|
+ vtbl.8 q1#hi, {q2}, q1#hi
|
|||
|
+ vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2
|
|||
|
+ vtbl.8 q2#hi, {q3}, q0#hi
|
|||
|
+ veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
|
|||
|
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
|
|||
|
+
|
|||
|
+ @ .Lenc_entry ends with a bne instruction which is normally paired with
|
|||
|
+ @ subs in .Lenc_loop.
|
|||
|
+ tst r8, r8
|
|||
|
+ b .Lenc_entry
|
|||
|
+
|
|||
|
+.align 4
|
|||
|
+.Lenc_loop:
|
|||
|
+ @ middle of middle round
|
|||
|
+ add r10, r11, #0x40
|
|||
|
+ vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
|
|||
|
+ vtbl.8 q4#hi, {$sb1t}, q2#hi
|
|||
|
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
|
|||
|
+ vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
|
|||
|
+ vtbl.8 q0#hi, {$sb1u}, q3#hi
|
|||
|
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
|
|||
|
+ vtbl.8 q5#hi, {$sb2t}, q2#hi
|
|||
|
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
|
|||
|
+ vtbl.8 q2#hi, {$sb2u}, q3#hi
|
|||
|
+ vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
|
|||
|
+ vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
|
|||
|
+ vtbl.8 q3#hi, {q0}, q1#hi
|
|||
|
+ veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
|
|||
|
+ @ Write to q5 instead of q0, so the table and destination registers do
|
|||
|
+ @ not overlap.
|
|||
|
+ vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
|
|||
|
+ vtbl.8 q5#hi, {q0}, q4#hi
|
|||
|
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
|
|||
|
+ vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
|
|||
|
+ vtbl.8 q4#hi, {q3}, q1#hi
|
|||
|
+ @ Here we restore the original q0/q5 usage.
|
|||
|
+ veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
|
|||
|
+ and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4
|
|||
|
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
|
|||
|
+ subs r8, r8, #1 @ nr--
|
|||
|
+
|
|||
|
+.Lenc_entry:
|
|||
|
+ @ top of round
|
|||
|
+ vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
|
|||
|
+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
|
|||
|
+ vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
|
|||
|
+ vtbl.8 q5#hi, {$invhi}, q1#hi
|
|||
|
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
|
|||
|
+ vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
|
|||
|
+ vtbl.8 q3#hi, {$invlo}, q0#hi
|
|||
|
+ vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
|
|||
|
+ vtbl.8 q4#hi, {$invlo}, q1#hi
|
|||
|
+ veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
|
|||
|
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
|
|||
|
+ vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
|
|||
|
+ vtbl.8 q2#hi, {$invlo}, q3#hi
|
|||
|
+ vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
|
|||
|
+ vtbl.8 q3#hi, {$invlo}, q4#hi
|
|||
|
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
|
|||
|
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
|
|||
|
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
|
|||
|
+ bne .Lenc_loop
|
|||
|
+
|
|||
|
+ @ middle of last round
|
|||
|
+ add r10, r11, #0x80
|
|||
|
+
|
|||
|
+ adr r11, .Lk_sbo
|
|||
|
+ @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
|
|||
|
+ @ overlap table and destination registers.
|
|||
|
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
|
|||
|
+ vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
|
|||
|
+ vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
|
|||
|
+ vtbl.8 q4#hi, {q1}, q2#hi
|
|||
|
+ vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
|
|||
|
+ @ Write to q2 instead of q0 below, to avoid overlapping table and
|
|||
|
+ @ destination registers.
|
|||
|
+ vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
|
|||
|
+ vtbl.8 q2#hi, {q0}, q3#hi
|
|||
|
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ @ Here we restore the original q0/q2 usage.
|
|||
|
+ vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0
|
|||
|
+ vtbl.8 q0#hi, {q2}, q1#hi
|
|||
|
+ bx lr
|
|||
|
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
|
|||
|
+
|
|||
|
+.globl GFp_vpaes_encrypt
|
|||
|
+.type GFp_vpaes_encrypt,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_encrypt:
|
|||
|
+ @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
|
|||
|
+ @ alignment.
|
|||
|
+ stmdb sp!, {r7-r11,lr}
|
|||
|
+ @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
|
|||
|
+ vstmdb sp!, {d8-d11}
|
|||
|
+
|
|||
|
+ vld1.64 {q0}, [$inp]
|
|||
|
+ bl _vpaes_preheat
|
|||
|
+ bl _vpaes_encrypt_core
|
|||
|
+ vst1.64 {q0}, [$out]
|
|||
|
+
|
|||
|
+ vldmia sp!, {d8-d11}
|
|||
|
+ ldmia sp!, {r7-r11, pc} @ return
|
|||
|
+.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
|
|||
|
+___
|
|||
|
+}
|
|||
|
+{
|
|||
|
+my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3");
|
|||
|
+my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12));
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|||
|
+@@ @@
|
|||
|
+@@ AES key schedule @@
|
|||
|
+@@ @@
|
|||
|
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|||
|
+
|
|||
|
+@ This function diverges from both x86_64 and aarch64 in which constants are
|
|||
|
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
|
|||
|
+@ separates them because it has enough registers to pin nearly all constants.
|
|||
|
+@ armv7 does not have enough registers, but needing explicit loads and stores
|
|||
|
+@ also complicates using x86_64's register allocation directly.
|
|||
|
+@
|
|||
|
+@ We pin some constants for convenience and leave q14 and q15 free to load
|
|||
|
+@ others on demand.
|
|||
|
+
|
|||
|
+@
|
|||
|
+@ Key schedule constants
|
|||
|
+@
|
|||
|
+.type _vpaes_key_consts,%object
|
|||
|
+.align 4
|
|||
|
+_vpaes_key_consts:
|
|||
|
+.Lk_rcon: @ rcon
|
|||
|
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
|
|||
|
+
|
|||
|
+.Lk_opt: @ output transform
|
|||
|
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
|
|||
|
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
|
|||
|
+.Lk_deskew: @ deskew tables: inverts the sbox's "skew"
|
|||
|
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
|
|||
|
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
|
|||
|
+.size _vpaes_key_consts,.-_vpaes_key_consts
|
|||
|
+
|
|||
|
+.type _vpaes_key_preheat,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_key_preheat:
|
|||
|
+ adr r11, .Lk_rcon
|
|||
|
+ vmov.i8 $s63, #0x5b @ .Lk_s63
|
|||
|
+ adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
|
|||
|
+ vmov.i8 $s0F, #0x0f @ .Lk_s0F
|
|||
|
+ vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv
|
|||
|
+ vld1.64 {$rcon}, [r11] @ .Lk_rcon
|
|||
|
+ bx lr
|
|||
|
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
|
|||
|
+
|
|||
|
+.type _vpaes_schedule_core,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_core:
|
|||
|
+ @ We only need to save lr, but ARM requires an 8-byte stack alignment,
|
|||
|
+ @ so save an extra register.
|
|||
|
+ stmdb sp!, {r3,lr}
|
|||
|
+
|
|||
|
+ bl _vpaes_key_preheat @ load the tables
|
|||
|
+
|
|||
|
+ adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
|
|||
|
+ vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
|
|||
|
+
|
|||
|
+ @ input transform
|
|||
|
+ @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
|
|||
|
+ @ overlap table and destination.
|
|||
|
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm3
|
|||
|
+ bl _vpaes_schedule_transform
|
|||
|
+ adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
|
|||
|
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7
|
|||
|
+
|
|||
|
+ add r8, r8, r10
|
|||
|
+
|
|||
|
+ @ encrypting, output zeroth round key after transform
|
|||
|
+ vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx)
|
|||
|
+
|
|||
|
+ @ *ring*: Decryption removed.
|
|||
|
+
|
|||
|
+.Lschedule_go:
|
|||
|
+ cmp $bits, #192 @ cmp \$192, %esi
|
|||
|
+ bhi .Lschedule_256
|
|||
|
+ @ 128: fall through
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .schedule_128
|
|||
|
+@@
|
|||
|
+@@ 128-bit specific part of key schedule.
|
|||
|
+@@
|
|||
|
+@@ This schedule is really simple, because all its parts
|
|||
|
+@@ are accomplished by the subroutines.
|
|||
|
+@@
|
|||
|
+.Lschedule_128:
|
|||
|
+ mov $inp, #10 @ mov \$10, %esi
|
|||
|
+
|
|||
|
+.Loop_schedule_128:
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ subs $inp, $inp, #1 @ dec %esi
|
|||
|
+ beq .Lschedule_mangle_last
|
|||
|
+ bl _vpaes_schedule_mangle @ write output
|
|||
|
+ b .Loop_schedule_128
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .aes_schedule_256
|
|||
|
+@@
|
|||
|
+@@ 256-bit specific part of key schedule.
|
|||
|
+@@
|
|||
|
+@@ The structure here is very similar to the 128-bit
|
|||
|
+@@ schedule, but with an additional "low side" in
|
|||
|
+@@ q6. The low side's rounds are the same as the
|
|||
|
+@@ high side's, except no rcon and no rotation.
|
|||
|
+@@
|
|||
|
+.align 4
|
|||
|
+.Lschedule_256:
|
|||
|
+ vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
|||
|
+ bl _vpaes_schedule_transform @ input transform
|
|||
|
+ mov $inp, #7 @ mov \$7, %esi
|
|||
|
+
|
|||
|
+.Loop_schedule_256:
|
|||
|
+ bl _vpaes_schedule_mangle @ output low result
|
|||
|
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
|
|||
|
+
|
|||
|
+ @ high round
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ subs $inp, $inp, #1 @ dec %esi
|
|||
|
+ beq .Lschedule_mangle_last
|
|||
|
+ bl _vpaes_schedule_mangle
|
|||
|
+
|
|||
|
+ @ low round. swap xmm7 and xmm6
|
|||
|
+ vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0
|
|||
|
+ vmov.i8 q4, #0
|
|||
|
+ vmov q5, q7 @ vmovdqa %xmm7, %xmm5
|
|||
|
+ vmov q7, q6 @ vmovdqa %xmm6, %xmm7
|
|||
|
+ bl _vpaes_schedule_low_round
|
|||
|
+ vmov q7, q5 @ vmovdqa %xmm5, %xmm7
|
|||
|
+
|
|||
|
+ b .Loop_schedule_256
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .aes_schedule_mangle_last
|
|||
|
+@@
|
|||
|
+@@ Mangler for last round of key schedule
|
|||
|
+@@ Mangles q0
|
|||
|
+@@ when encrypting, outputs out(q0) ^ 63
|
|||
|
+@@ when decrypting, outputs unskew(q0)
|
|||
|
+@@
|
|||
|
+@@ Always called right before return... jumps to cleanup and exits
|
|||
|
+@@
|
|||
|
+.align 4
|
|||
|
+.Lschedule_mangle_last:
|
|||
|
+ @ schedule last round key from xmm0
|
|||
|
+ adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew
|
|||
|
+
|
|||
|
+ @ encrypting
|
|||
|
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
|
|||
|
+ adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
|
|||
|
+ add $out, $out, #32 @ add \$32, %rdx
|
|||
|
+ vmov q2, q0
|
|||
|
+ vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
|
|||
|
+ vtbl.8 q0#hi, {q2}, q1#hi
|
|||
|
+
|
|||
|
+.Lschedule_mangle_last_dec:
|
|||
|
+ sub $out, $out, #16 @ add \$-16, %rdx
|
|||
|
+ veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
|
|||
|
+ bl _vpaes_schedule_transform @ output transform
|
|||
|
+ vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key
|
|||
|
+
|
|||
|
+ @ cleanup
|
|||
|
+ veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
|
|||
|
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
|
|||
|
+ veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
|
|||
|
+ veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
|
|||
|
+ veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
|
|||
|
+ veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
|
|||
|
+ veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
|
|||
|
+ veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
|
|||
|
+ ldmia sp!, {r3,pc} @ return
|
|||
|
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .aes_schedule_round
|
|||
|
+@@
|
|||
|
+@@ Runs one main round of the key schedule on q0, q7
|
|||
|
+@@
|
|||
|
+@@ Specifically, runs subbytes on the high dword of q0
|
|||
|
+@@ then rotates it by one byte and xors into the low dword of
|
|||
|
+@@ q7.
|
|||
|
+@@
|
|||
|
+@@ Adds rcon from low byte of q8, then rotates q8 for
|
|||
|
+@@ next rcon.
|
|||
|
+@@
|
|||
|
+@@ Smears the dwords of q7 by xoring the low into the
|
|||
|
+@@ second low, result into third, result into highest.
|
|||
|
+@@
|
|||
|
+@@ Returns results in q7 = q0.
|
|||
|
+@@ Clobbers q1-q4, r11.
|
|||
|
+@@
|
|||
|
+.type _vpaes_schedule_round,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_round:
|
|||
|
+ @ extract rcon from xmm8
|
|||
|
+ vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
|
|||
|
+ vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1
|
|||
|
+ vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8
|
|||
|
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
|
|||
|
+
|
|||
|
+ @ rotate
|
|||
|
+ vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0
|
|||
|
+ vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0
|
|||
|
+
|
|||
|
+ @ fall through...
|
|||
|
+
|
|||
|
+ @ low round: same as high round, but no rotation and no rcon.
|
|||
|
+_vpaes_schedule_low_round:
|
|||
|
+ @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
|
|||
|
+ @ We pin other values in _vpaes_key_preheat, so load them now.
|
|||
|
+ adr r11, .Lk_sb1
|
|||
|
+ vld1.64 {q14,q15}, [r11]
|
|||
|
+
|
|||
|
+ @ smear xmm7
|
|||
|
+ vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1
|
|||
|
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
|
|||
|
+ vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4
|
|||
|
+
|
|||
|
+ @ subbytes
|
|||
|
+ vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
|
|||
|
+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
|
|||
|
+ veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
|
|||
|
+ vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
|
|||
|
+ vtbl.8 q2#hi, {$invhi}, q1#hi
|
|||
|
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
|
|||
|
+ vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
|
|||
|
+ vtbl.8 q3#hi, {$invlo}, q0#hi
|
|||
|
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
|
|||
|
+ vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
|
|||
|
+ vtbl.8 q4#hi, {$invlo}, q1#hi
|
|||
|
+ veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
|
|||
|
+ vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
|
|||
|
+ vtbl.8 q3#hi, {$invlo}, q3#hi
|
|||
|
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
|
|||
|
+ vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
|
|||
|
+ vtbl.8 q2#hi, {$invlo}, q4#hi
|
|||
|
+ veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
|
|||
|
+ veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
|
|||
|
+ vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
|
|||
|
+ vtbl.8 q4#hi, {q15}, q3#hi
|
|||
|
+ vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
|
|||
|
+ vtbl.8 q1#hi, {q14}, q2#hi
|
|||
|
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
|
|||
|
+
|
|||
|
+ @ add in smeared stuff
|
|||
|
+ veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
|
|||
|
+ veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
|
|||
|
+ bx lr
|
|||
|
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .aes_schedule_transform
|
|||
|
+@@
|
|||
|
+@@ Linear-transform q0 according to tables at [r11]
|
|||
|
+@@
|
|||
|
+@@ Requires that q9 = 0x0F0F... as in preheat
|
|||
|
+@@ Output in q0
|
|||
|
+@@ Clobbers q1, q2, q14, q15
|
|||
|
+@@
|
|||
|
+.type _vpaes_schedule_transform,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_transform:
|
|||
|
+ vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
|
|||
|
+ @ vmovdqa 16(%r11), %xmm1 # hi
|
|||
|
+ vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1
|
|||
|
+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0
|
|||
|
+ vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2
|
|||
|
+ vtbl.8 q2#hi, {q14}, q1#hi
|
|||
|
+ vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0
|
|||
|
+ vtbl.8 q0#hi, {q15}, q0#hi
|
|||
|
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
|
|||
|
+ bx lr
|
|||
|
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
|
|||
|
+
|
|||
|
+@@
|
|||
|
+@@ .aes_schedule_mangle
|
|||
|
+@@
|
|||
|
+@@ Mangles q0 from (basis-transformed) standard version
|
|||
|
+@@ to our version.
|
|||
|
+@@
|
|||
|
+@@ On encrypt,
|
|||
|
+@@ xor with 0x63
|
|||
|
+@@ multiply by circulant 0,1,1,1
|
|||
|
+@@ apply shiftrows transform
|
|||
|
+@@
|
|||
|
+@@ On decrypt,
|
|||
|
+@@ xor with 0x63
|
|||
|
+@@ multiply by "inverse mixcolumns" circulant E,B,D,9
|
|||
|
+@@ deskew
|
|||
|
+@@ apply shiftrows transform
|
|||
|
+@@
|
|||
|
+@@
|
|||
|
+@@ Writes out to [r2], and increments or decrements it
|
|||
|
+@@ Keeps track of round number mod 4 in r8
|
|||
|
+@@ Preserves q0
|
|||
|
+@@ Clobbers q1-q5, r11
|
|||
|
+@@
|
|||
|
+.type _vpaes_schedule_mangle,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_mangle:
|
|||
|
+ tst $dir, $dir
|
|||
|
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
|
|||
|
+ adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
|
|||
|
+ vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
|
|||
|
+
|
|||
|
+ @ encrypting
|
|||
|
+ @ Write to q2 so we do not overlap table and destination below.
|
|||
|
+ veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
|
|||
|
+ add $out, $out, #16 @ add \$16, %rdx
|
|||
|
+ vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4
|
|||
|
+ vtbl.8 q4#hi, {q2}, q5#hi
|
|||
|
+ vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1
|
|||
|
+ vtbl.8 q1#hi, {q4}, q5#hi
|
|||
|
+ vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3
|
|||
|
+ vtbl.8 q3#hi, {q1}, q5#hi
|
|||
|
+ veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
|
|||
|
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
|
|||
|
+ veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
|
|||
|
+
|
|||
|
+.Lschedule_mangle_both:
|
|||
|
+ @ Write to q2 so table and destination do not overlap.
|
|||
|
+ vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3
|
|||
|
+ vtbl.8 q2#hi, {q3}, q1#hi
|
|||
|
+ add r8, r8, #64-16 @ add \$-16, %r8
|
|||
|
+ and r8, r8, #~(1<<6) @ and \$0x30, %r8
|
|||
|
+ vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx)
|
|||
|
+ bx lr
|
|||
|
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
|
|||
|
+
|
|||
|
+.globl GFp_vpaes_set_encrypt_key
|
|||
|
+.type GFp_vpaes_set_encrypt_key,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_set_encrypt_key:
|
|||
|
+ stmdb sp!, {r7-r11, lr}
|
|||
|
+ vstmdb sp!, {d8-d15}
|
|||
|
+
|
|||
|
+ lsr r9, $bits, #5 @ shr \$5,%eax
|
|||
|
+ add r9, r9, #5 @ add \$5,%eax
|
|||
|
+ str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
|
|||
|
+
|
|||
|
+ mov $dir, #0 @ mov \$0,%ecx
|
|||
|
+ mov r8, #0x30 @ mov \$0x30,%r8d
|
|||
|
+ bl _vpaes_schedule_core
|
|||
|
+ eor r0, r0, r0
|
|||
|
+
|
|||
|
+ vldmia sp!, {d8-d15}
|
|||
|
+ ldmia sp!, {r7-r11, pc} @ return
|
|||
|
+.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
|
|||
|
+___
|
|||
|
+}
|
|||
|
+
|
|||
|
+{
|
|||
|
+my ($out, $inp) = map("r$_", (0..1));
|
|||
|
+my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
|
|||
|
+
|
|||
|
+$code .= <<___;
|
|||
|
+
|
|||
|
+@ Additional constants for converting to bsaes.
|
|||
|
+.type _vpaes_convert_consts,%object
|
|||
|
+.align 4
|
|||
|
+_vpaes_convert_consts:
|
|||
|
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
|
|||
|
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
|
|||
|
+@ table. This was computed with the following script:
|
|||
|
+@
|
|||
|
+@ def u64s_to_u128(x, y):
|
|||
|
+@ return x | (y << 64)
|
|||
|
+@ def u128_to_u64s(w):
|
|||
|
+@ return w & ((1<<64)-1), w >> 64
|
|||
|
+@ def get_byte(w, i):
|
|||
|
+@ return (w >> (i*8)) & 0xff
|
|||
|
+@ def apply_table(table, b):
|
|||
|
+@ lo = b & 0xf
|
|||
|
+@ hi = b >> 4
|
|||
|
+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
|
|||
|
+@ def opt(b):
|
|||
|
+@ table = [
|
|||
|
+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
|
|||
|
+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
|
|||
|
+@ ]
|
|||
|
+@ return apply_table(table, b)
|
|||
|
+@ def rot_byte(b, n):
|
|||
|
+@ return 0xff & ((b << n) | (b >> (8-n)))
|
|||
|
+@ def skew(x):
|
|||
|
+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
|
|||
|
+@ rot_byte(x, 4))
|
|||
|
+@ table = [0, 0]
|
|||
|
+@ for i in range(16):
|
|||
|
+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
|
|||
|
+@ table[1] |= skew(opt(i<<4)) << (i*8)
|
|||
|
+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
|
|||
|
+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
|
|||
|
+.Lk_opt_then_skew:
|
|||
|
+ .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
|
|||
|
+ .quad 0x1f30062936192f00, 0xb49bad829db284ab
|
|||
|
+
|
|||
|
+@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
|
|||
|
+.globl GFp_vpaes_encrypt_key_to_bsaes
|
|||
|
+.type GFp_vpaes_encrypt_key_to_bsaes,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_encrypt_key_to_bsaes:
|
|||
|
+ stmdb sp!, {r11, lr}
|
|||
|
+
|
|||
|
+ @ See _vpaes_schedule_core for the key schedule logic. In particular,
|
|||
|
+ @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
|
|||
|
+ @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
|
|||
|
+ @ contain the transformations not in the bsaes representation. This
|
|||
|
+ @ function inverts those transforms.
|
|||
|
+ @
|
|||
|
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
|
|||
|
+ @ representation, which does not match the other aes_nohw_*
|
|||
|
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
|
|||
|
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
|
|||
|
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
|
|||
|
+
|
|||
|
+ vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform
|
|||
|
+ adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
|
|||
|
+ add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
|
|||
|
+
|
|||
|
+ vld1.64 {$mc_forward}, [r2]
|
|||
|
+ vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64
|
|||
|
+ adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
|
|||
|
+ vmov.i8 $s63_raw, #0x63 @ .Lk_s63 without .Lk_ipt applied
|
|||
|
+
|
|||
|
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
|
|||
|
+ @ is the same.
|
|||
|
+ ldr r2, [$inp,#240]
|
|||
|
+ add r2, r2, #1
|
|||
|
+ str r2, [$out,#240]
|
|||
|
+
|
|||
|
+ @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
|
|||
|
+ @ Invert this with .Lk_opt.
|
|||
|
+ vld1.64 {q0}, [$inp]!
|
|||
|
+ bl _vpaes_schedule_transform
|
|||
|
+ vrev32.8 q0, q0
|
|||
|
+ vst1.64 {q0}, [$out]!
|
|||
|
+
|
|||
|
+ @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
|
|||
|
+ @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
|
|||
|
+ @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
|
|||
|
+.Loop_enc_key_to_bsaes:
|
|||
|
+ vld1.64 {q0}, [$inp]!
|
|||
|
+
|
|||
|
+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
|
|||
|
+ @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
|
|||
|
+ @ We use r3 rather than r8 to avoid a callee-saved register.
|
|||
|
+ vld1.64 {q1}, [r3]
|
|||
|
+ vtbl.8 q2#lo, {q0}, q1#lo
|
|||
|
+ vtbl.8 q2#hi, {q0}, q1#hi
|
|||
|
+ add r3, r3, #16
|
|||
|
+ and r3, r3, #~(1<<6)
|
|||
|
+ vmov q0, q2
|
|||
|
+
|
|||
|
+ @ Handle the last key differently.
|
|||
|
+ subs r2, r2, #1
|
|||
|
+ beq .Loop_enc_key_to_bsaes_last
|
|||
|
+
|
|||
|
+ @ Multiply by the circulant. This is its own inverse.
|
|||
|
+ vtbl.8 q1#lo, {q0}, $mc_forward#lo
|
|||
|
+ vtbl.8 q1#hi, {q0}, $mc_forward#hi
|
|||
|
+ vmov q0, q1
|
|||
|
+ vtbl.8 q2#lo, {q1}, $mc_forward#lo
|
|||
|
+ vtbl.8 q2#hi, {q1}, $mc_forward#hi
|
|||
|
+ veor q0, q0, q2
|
|||
|
+ vtbl.8 q1#lo, {q2}, $mc_forward#lo
|
|||
|
+ vtbl.8 q1#hi, {q2}, $mc_forward#hi
|
|||
|
+ veor q0, q0, q1
|
|||
|
+
|
|||
|
+ @ XOR and finish.
|
|||
|
+ veor q0, q0, $s63
|
|||
|
+ bl _vpaes_schedule_transform
|
|||
|
+ vrev32.8 q0, q0
|
|||
|
+ vst1.64 {q0}, [$out]!
|
|||
|
+ b .Loop_enc_key_to_bsaes
|
|||
|
+
|
|||
|
+.Loop_enc_key_to_bsaes_last:
|
|||
|
+ @ The final key does not have a basis transform (note
|
|||
|
+ @ .Lschedule_mangle_last inverts the original transform). It only XORs
|
|||
|
+ @ 0x63 and applies ShiftRows. The latter was already inverted in the
|
|||
|
+ @ loop. Note that, because we act on the original representation, we use
|
|||
|
+ @ $s63_raw, not $s63.
|
|||
|
+ veor q0, q0, $s63_raw
|
|||
|
+ vrev32.8 q0, q0
|
|||
|
+ vst1.64 {q0}, [$out]
|
|||
|
+
|
|||
|
+ @ Wipe registers which contained key material.
|
|||
|
+ veor q0, q0, q0
|
|||
|
+ veor q1, q1, q1
|
|||
|
+ veor q2, q2, q2
|
|||
|
+
|
|||
|
+ ldmia sp!, {r11, pc} @ return
|
|||
|
+.size GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes
|
|||
|
+___
|
|||
|
+}
|
|||
|
+
|
|||
|
+{
|
|||
|
+# Register-passed parameters.
|
|||
|
+my ($inp, $out, $len, $key) = map("r$_", 0..3);
|
|||
|
+# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
|
|||
|
+# $tmp. $ctr is r7 because it must be preserved across calls.
|
|||
|
+my ($ctr, $ivec, $tmp) = map("r$_", 7..9);
|
|||
|
+
|
|||
|
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
|
|||
|
+# const AES_KEY *key, const uint8_t ivec[16]);
|
|||
|
+$code .= <<___;
|
|||
|
+.globl GFp_vpaes_ctr32_encrypt_blocks
|
|||
|
+.type GFp_vpaes_ctr32_encrypt_blocks,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_ctr32_encrypt_blocks:
|
|||
|
+ mov ip, sp
|
|||
|
+ stmdb sp!, {r7-r11, lr}
|
|||
|
+ @ This function uses q4-q7 (d8-d15), which are callee-saved.
|
|||
|
+ vstmdb sp!, {d8-d15}
|
|||
|
+
|
|||
|
+ cmp $len, #0
|
|||
|
+ @ $ivec is passed on the stack.
|
|||
|
+ ldr $ivec, [ip]
|
|||
|
+ beq .Lctr32_done
|
|||
|
+
|
|||
|
+ @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
|
|||
|
+ mov $tmp, $key
|
|||
|
+ mov $key, $len
|
|||
|
+ mov $len, $tmp
|
|||
|
+___
|
|||
|
+my ($len, $key) = ($key, $len);
|
|||
|
+$code .= <<___;
|
|||
|
+
|
|||
|
+ @ Load the IV and counter portion.
|
|||
|
+ ldr $ctr, [$ivec, #12]
|
|||
|
+ vld1.8 {q7}, [$ivec]
|
|||
|
+
|
|||
|
+ bl _vpaes_preheat
|
|||
|
+ rev $ctr, $ctr @ The counter is big-endian.
|
|||
|
+
|
|||
|
+.Lctr32_loop:
|
|||
|
+ vmov q0, q7
|
|||
|
+ vld1.8 {q6}, [$inp]! @ Load input ahead of time
|
|||
|
+ bl _vpaes_encrypt_core
|
|||
|
+ veor q0, q0, q6 @ XOR input and result
|
|||
|
+ vst1.8 {q0}, [$out]!
|
|||
|
+ subs $len, $len, #1
|
|||
|
+ @ Update the counter.
|
|||
|
+ add $ctr, $ctr, #1
|
|||
|
+ rev $tmp, $ctr
|
|||
|
+ vmov.32 q7#hi[1], $tmp
|
|||
|
+ bne .Lctr32_loop
|
|||
|
+
|
|||
|
+.Lctr32_done:
|
|||
|
+ vldmia sp!, {d8-d15}
|
|||
|
+ ldmia sp!, {r7-r11, pc} @ return
|
|||
|
+.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
|
|||
|
+___
|
|||
|
+}
|
|||
|
+
|
|||
|
+foreach (split("\n",$code)) {
|
|||
|
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
|
|||
|
+ print $_,"\n";
|
|||
|
+}
|
|||
|
+
|
|||
|
+close STDOUT;
|
|||
|
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
|
|||
|
new file mode 100755
|
|||
|
index 0000000..b31bbb8
|
|||
|
--- /dev/null
|
|||
|
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
|
|||
|
@@ -0,0 +1,837 @@
|
|||
|
+#! /usr/bin/env perl
|
|||
|
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|||
|
+#
|
|||
|
+# Licensed under the OpenSSL license (the "License"). You may not use
|
|||
|
+# this file except in compliance with the License. You can obtain a copy
|
|||
|
+# in the file LICENSE in the source distribution or at
|
|||
|
+# https://www.openssl.org/source/license.html
|
|||
|
+
|
|||
|
+
|
|||
|
+######################################################################
|
|||
|
+## Constant-time SSSE3 AES core implementation.
|
|||
|
+## version 0.1
|
|||
|
+##
|
|||
|
+## By Mike Hamburg (Stanford University), 2009
|
|||
|
+## Public domain.
|
|||
|
+##
|
|||
|
+## For details see http://shiftleft.org/papers/vector_aes/ and
|
|||
|
+## http://crypto.stanford.edu/vpaes/.
|
|||
|
+##
|
|||
|
+######################################################################
|
|||
|
+# ARMv8 NEON adaptation by <appro@openssl.org>
|
|||
|
+#
|
|||
|
+# Reason for undertaken effort is that there is at least one popular
|
|||
|
+# SoC based on Cortex-A53 that doesn't have crypto extensions.
|
|||
|
+#
|
|||
|
+# CBC enc ECB enc/dec(*) [bit-sliced enc/dec]
|
|||
|
+# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ]
|
|||
|
+# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ]
|
|||
|
+# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ]
|
|||
|
+# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
|
|||
|
+# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
|
|||
|
+# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
|
|||
|
+#
|
|||
|
+# (*) ECB denotes approximate result for parallelizable modes
|
|||
|
+# such as CBC decrypt, CTR, etc.;
|
|||
|
+# (**) these results are worse than scalar compiler-generated
|
|||
|
+# code, but it's constant-time and therefore preferred;
|
|||
|
+# (***) presented for reference/comparison purposes;
|
|||
|
+
|
|||
|
+$flavour = shift;
|
|||
|
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
|||
|
+
|
|||
|
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|||
|
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|||
|
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|||
|
+die "can't locate arm-xlate.pl";
|
|||
|
+
|
|||
|
+open OUT,"| \"$^X\" $xlate $flavour $output";
|
|||
|
+*STDOUT=*OUT;
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+#include <GFp/arm_arch.h>
|
|||
|
+
|
|||
|
+.section .rodata
|
|||
|
+
|
|||
|
+.type _vpaes_consts,%object
|
|||
|
+.align 7 // totally strategic alignment
|
|||
|
+_vpaes_consts:
|
|||
|
+.Lk_mc_forward: // mc_forward
|
|||
|
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
|
|||
|
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
|
|||
|
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
|
|||
|
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
|
|||
|
+.Lk_mc_backward:// mc_backward
|
|||
|
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
|
|||
|
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
|
|||
|
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
|
|||
|
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
|
|||
|
+.Lk_sr: // sr
|
|||
|
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
|
|||
|
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
|
|||
|
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
|
|||
|
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
|
|||
|
+
|
|||
|
+//
|
|||
|
+// "Hot" constants
|
|||
|
+//
|
|||
|
+.Lk_inv: // inv, inva
|
|||
|
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
|
|||
|
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
|
|||
|
+.Lk_ipt: // input transform (lo, hi)
|
|||
|
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
|
|||
|
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
|
|||
|
+.Lk_sbo: // sbou, sbot
|
|||
|
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
|
|||
|
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
|
|||
|
+.Lk_sb1: // sb1u, sb1t
|
|||
|
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
|
|||
|
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
|
|||
|
+.Lk_sb2: // sb2u, sb2t
|
|||
|
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
|
|||
|
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
|
|||
|
+
|
|||
|
+//
|
|||
|
+// Key schedule constants
|
|||
|
+//
|
|||
|
+.Lk_dksd: // decryption key schedule: invskew x*D
|
|||
|
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
|
|||
|
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
|
|||
|
+.Lk_dksb: // decryption key schedule: invskew x*B
|
|||
|
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
|
|||
|
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
|
|||
|
+.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
|
|||
|
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
|
|||
|
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
|
|||
|
+.Lk_dks9: // decryption key schedule: invskew x*9
|
|||
|
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
|
|||
|
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
|
|||
|
+
|
|||
|
+.Lk_rcon: // rcon
|
|||
|
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
|
|||
|
+
|
|||
|
+.Lk_opt: // output transform
|
|||
|
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
|
|||
|
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
|
|||
|
+.Lk_deskew: // deskew tables: inverts the sbox's "skew"
|
|||
|
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
|
|||
|
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
|
|||
|
+
|
|||
|
+.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
|
|||
|
+.size _vpaes_consts,.-_vpaes_consts
|
|||
|
+.align 6
|
|||
|
+
|
|||
|
+.text
|
|||
|
+___
|
|||
|
+
|
|||
|
+{
|
|||
|
+my ($inp,$out,$key) = map("x$_",(0..2));
|
|||
|
+
|
|||
|
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
|
|||
|
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
|
|||
|
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+##
|
|||
|
+## _aes_preheat
|
|||
|
+##
|
|||
|
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
|
|||
|
+## and %xmm9-%xmm15 as specified below.
|
|||
|
+##
|
|||
|
+.type _vpaes_encrypt_preheat,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_encrypt_preheat:
|
|||
|
+ adrp x10, :pg_hi21:.Lk_inv
|
|||
|
+ add x10, x10, :lo12:.Lk_inv
|
|||
|
+ movi v17.16b, #0x0f
|
|||
|
+ ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
|
|||
|
+ ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
|
|||
|
+ ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
|
|||
|
+ ret
|
|||
|
+.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
|
|||
|
+
|
|||
|
+##
|
|||
|
+## _aes_encrypt_core
|
|||
|
+##
|
|||
|
+## AES-encrypt %xmm0.
|
|||
|
+##
|
|||
|
+## Inputs:
|
|||
|
+## %xmm0 = input
|
|||
|
+## %xmm9-%xmm15 as in _vpaes_preheat
|
|||
|
+## (%rdx) = scheduled keys
|
|||
|
+##
|
|||
|
+## Output in %xmm0
|
|||
|
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
|
|||
|
+## Preserves %xmm6 - %xmm8 so you get some local vectors
|
|||
|
+##
|
|||
|
+##
|
|||
|
+.type _vpaes_encrypt_core,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_encrypt_core:
|
|||
|
+ mov x9, $key
|
|||
|
+ ldr w8, [$key,#240] // pull rounds
|
|||
|
+ adrp x11, :pg_hi21:.Lk_mc_forward+16
|
|||
|
+ add x11, x11, :lo12:.Lk_mc_forward+16
|
|||
|
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
|
|||
|
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
|
|||
|
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
|
|||
|
+ ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
|
|||
|
+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
|
|||
|
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
|
|||
|
+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
|
|||
|
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
|
|||
|
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
|
|||
|
+ b .Lenc_entry
|
|||
|
+
|
|||
|
+.align 4
|
|||
|
+.Lenc_loop:
|
|||
|
+ // middle of middle round
|
|||
|
+ add x10, x11, #0x40
|
|||
|
+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
|
|||
|
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
|
|||
|
+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
|
|||
|
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
|
|||
|
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
|
|||
|
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
|
|||
|
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
|
|||
|
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
|
|||
|
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
|
|||
|
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
|
|||
|
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
|
|||
|
+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
|
|||
|
+ sub w8, w8, #1 // nr--
|
|||
|
+
|
|||
|
+.Lenc_entry:
|
|||
|
+ // top of round
|
|||
|
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
|
|||
|
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
|
|||
|
+ tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
|
|||
|
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
|
|||
|
+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
|
|||
|
+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
|
|||
|
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
|
|||
|
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
|
|||
|
+ tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
|
|||
|
+ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
|
|||
|
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
|
|||
|
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
|
|||
|
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
|
|||
|
+ cbnz w8, .Lenc_loop
|
|||
|
+
|
|||
|
+ // middle of last round
|
|||
|
+ add x10, x11, #0x80
|
|||
|
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
|
|||
|
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
|
|||
|
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
|
|||
|
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
|
|||
|
+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
|
|||
|
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
|
|||
|
+ ret
|
|||
|
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
|
|||
|
+
|
|||
|
+.globl GFp_vpaes_encrypt
|
|||
|
+.type GFp_vpaes_encrypt,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_encrypt:
|
|||
|
+ AARCH64_SIGN_LINK_REGISTER
|
|||
|
+ stp x29,x30,[sp,#-16]!
|
|||
|
+ add x29,sp,#0
|
|||
|
+
|
|||
|
+ ld1 {v7.16b}, [$inp]
|
|||
|
+ bl _vpaes_encrypt_preheat
|
|||
|
+ bl _vpaes_encrypt_core
|
|||
|
+ st1 {v0.16b}, [$out]
|
|||
|
+
|
|||
|
+ ldp x29,x30,[sp],#16
|
|||
|
+ AARCH64_VALIDATE_LINK_REGISTER
|
|||
|
+ ret
|
|||
|
+.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
|
|||
|
+
|
|||
|
+.type _vpaes_encrypt_2x,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_encrypt_2x:
|
|||
|
+ mov x9, $key
|
|||
|
+ ldr w8, [$key,#240] // pull rounds
|
|||
|
+ adrp x11, :pg_hi21:.Lk_mc_forward+16
|
|||
|
+ add x11, x11, :lo12:.Lk_mc_forward+16
|
|||
|
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
|
|||
|
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
|
|||
|
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
|
|||
|
+ ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
|
|||
|
+ and v9.16b, v15.16b, v17.16b
|
|||
|
+ ushr v8.16b, v15.16b, #4
|
|||
|
+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
|
|||
|
+ tbl v9.16b, {$iptlo}, v9.16b
|
|||
|
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
|
|||
|
+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
|
|||
|
+ tbl v10.16b, {$ipthi}, v8.16b
|
|||
|
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
|
|||
|
+ eor v8.16b, v9.16b, v16.16b
|
|||
|
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
|
|||
|
+ eor v8.16b, v8.16b, v10.16b
|
|||
|
+ b .Lenc_2x_entry
|
|||
|
+
|
|||
|
+.align 4
|
|||
|
+.Lenc_2x_loop:
|
|||
|
+ // middle of middle round
|
|||
|
+ add x10, x11, #0x40
|
|||
|
+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
|
|||
|
+ tbl v12.16b, {$sb1t}, v10.16b
|
|||
|
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
|
|||
|
+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
|
|||
|
+ tbl v8.16b, {$sb1u}, v11.16b
|
|||
|
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ eor v12.16b, v12.16b, v16.16b
|
|||
|
+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
|
|||
|
+ tbl v13.16b, {$sb2t}, v10.16b
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ eor v8.16b, v8.16b, v12.16b
|
|||
|
+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
|
|||
|
+ tbl v10.16b, {$sb2u}, v11.16b
|
|||
|
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
|
|||
|
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
|
|||
|
+ tbl v11.16b, {v8.16b}, v1.16b
|
|||
|
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
|
|||
|
+ eor v10.16b, v10.16b, v13.16b
|
|||
|
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
|
|||
|
+ tbl v8.16b, {v8.16b}, v4.16b
|
|||
|
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
|
|||
|
+ eor v11.16b, v11.16b, v10.16b
|
|||
|
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
|
|||
|
+ tbl v12.16b, {v11.16b},v1.16b
|
|||
|
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
|
|||
|
+ eor v8.16b, v8.16b, v11.16b
|
|||
|
+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
|
|||
|
+ eor v8.16b, v8.16b, v12.16b
|
|||
|
+ sub w8, w8, #1 // nr--
|
|||
|
+
|
|||
|
+.Lenc_2x_entry:
|
|||
|
+ // top of round
|
|||
|
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
|
|||
|
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
|
|||
|
+ and v9.16b, v8.16b, v17.16b
|
|||
|
+ ushr v8.16b, v8.16b, #4
|
|||
|
+ tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
|
|||
|
+ tbl v13.16b, {$invhi},v9.16b
|
|||
|
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
|
|||
|
+ eor v9.16b, v9.16b, v8.16b
|
|||
|
+ tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
|
|||
|
+ tbl v11.16b, {$invlo},v8.16b
|
|||
|
+ tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
|
|||
|
+ tbl v12.16b, {$invlo},v9.16b
|
|||
|
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
|
|||
|
+ eor v11.16b, v11.16b, v13.16b
|
|||
|
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
|
|||
|
+ eor v12.16b, v12.16b, v13.16b
|
|||
|
+ tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
|
|||
|
+ tbl v10.16b, {$invlo},v11.16b
|
|||
|
+ tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
|
|||
|
+ tbl v11.16b, {$invlo},v12.16b
|
|||
|
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
|
|||
|
+ eor v10.16b, v10.16b, v9.16b
|
|||
|
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
|
|||
|
+ eor v11.16b, v11.16b, v8.16b
|
|||
|
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
|
|||
|
+ cbnz w8, .Lenc_2x_loop
|
|||
|
+
|
|||
|
+ // middle of last round
|
|||
|
+ add x10, x11, #0x80
|
|||
|
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
|
|||
|
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
|
|||
|
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
|
|||
|
+ tbl v12.16b, {$sbou}, v10.16b
|
|||
|
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
|
|||
|
+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
|
|||
|
+ tbl v8.16b, {$sbot}, v11.16b
|
|||
|
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
|
|||
|
+ eor v12.16b, v12.16b, v16.16b
|
|||
|
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
|
|||
|
+ eor v8.16b, v8.16b, v12.16b
|
|||
|
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
|
|||
|
+ tbl v1.16b, {v8.16b},v1.16b
|
|||
|
+ ret
|
|||
|
+.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
|
|||
|
+___
|
|||
|
+}
|
|||
|
+{
|
|||
|
+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
|
|||
|
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
|
|||
|
+
|
|||
|
+$code.=<<___;
|
|||
|
+########################################################
|
|||
|
+## ##
|
|||
|
+## AES key schedule ##
|
|||
|
+## ##
|
|||
|
+########################################################
|
|||
|
+.type _vpaes_key_preheat,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_key_preheat:
|
|||
|
+ adrp x10, :pg_hi21:.Lk_inv
|
|||
|
+ add x10, x10, :lo12:.Lk_inv
|
|||
|
+ movi v16.16b, #0x5b // .Lk_s63
|
|||
|
+ adrp x11, :pg_hi21:.Lk_sb1
|
|||
|
+ add x11, x11, :lo12:.Lk_sb1
|
|||
|
+ movi v17.16b, #0x0f // .Lk_s0F
|
|||
|
+ ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
|
|||
|
+ adrp x10, :pg_hi21:.Lk_dksd
|
|||
|
+ add x10, x10, :lo12:.Lk_dksd
|
|||
|
+ ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
|
|||
|
+ adrp x11, :pg_hi21:.Lk_mc_forward
|
|||
|
+ add x11, x11, :lo12:.Lk_mc_forward
|
|||
|
+ ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
|
|||
|
+ ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
|
|||
|
+ ld1 {v8.2d}, [x10] // .Lk_rcon
|
|||
|
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
|
|||
|
+ ret
|
|||
|
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
|
|||
|
+
|
|||
|
+.type _vpaes_schedule_core,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_core:
|
|||
|
+ AARCH64_SIGN_LINK_REGISTER
|
|||
|
+ stp x29, x30, [sp,#-16]!
|
|||
|
+ add x29,sp,#0
|
|||
|
+
|
|||
|
+ bl _vpaes_key_preheat // load the tables
|
|||
|
+
|
|||
|
+ ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
|
|||
|
+
|
|||
|
+ // input transform
|
|||
|
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
|
|||
|
+ bl _vpaes_schedule_transform
|
|||
|
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
|
|||
|
+
|
|||
|
+ adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10
|
|||
|
+ add x10, x10, :lo12:.Lk_sr
|
|||
|
+
|
|||
|
+ add x8, x8, x10
|
|||
|
+
|
|||
|
+ // encrypting, output zeroth round key after transform
|
|||
|
+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
|
|||
|
+
|
|||
|
+ cmp $bits, #192 // cmp \$192, %esi
|
|||
|
+ b.hi .Lschedule_256
|
|||
|
+ b.eq .Lschedule_192
|
|||
|
+ // 128: fall through
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .schedule_128
|
|||
|
+##
|
|||
|
+## 128-bit specific part of key schedule.
|
|||
|
+##
|
|||
|
+## This schedule is really simple, because all its parts
|
|||
|
+## are accomplished by the subroutines.
|
|||
|
+##
|
|||
|
+.Lschedule_128:
|
|||
|
+ mov $inp, #10 // mov \$10, %esi
|
|||
|
+
|
|||
|
+.Loop_schedule_128:
|
|||
|
+ sub $inp, $inp, #1 // dec %esi
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ cbz $inp, .Lschedule_mangle_last
|
|||
|
+ bl _vpaes_schedule_mangle // write output
|
|||
|
+ b .Loop_schedule_128
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_192
|
|||
|
+##
|
|||
|
+## 192-bit specific part of key schedule.
|
|||
|
+##
|
|||
|
+## The main body of this schedule is the same as the 128-bit
|
|||
|
+## schedule, but with more smearing. The long, high side is
|
|||
|
+## stored in %xmm7 as before, and the short, low side is in
|
|||
|
+## the high bits of %xmm6.
|
|||
|
+##
|
|||
|
+## This schedule is somewhat nastier, however, because each
|
|||
|
+## round produces 192 bits of key material, or 1.5 round keys.
|
|||
|
+## Therefore, on each cycle we do 2 rounds and produce 3 round
|
|||
|
+## keys.
|
|||
|
+##
|
|||
|
+.align 4
|
|||
|
+.Lschedule_192:
|
|||
|
+ sub $inp, $inp, #8
|
|||
|
+ ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
|
|||
|
+ bl _vpaes_schedule_transform // input transform
|
|||
|
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
|
|||
|
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
|
|||
|
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
|
|||
|
+ mov $inp, #4 // mov \$4, %esi
|
|||
|
+
|
|||
|
+.Loop_schedule_192:
|
|||
|
+ sub $inp, $inp, #1 // dec %esi
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
|
|||
|
+ bl _vpaes_schedule_mangle // save key n
|
|||
|
+ bl _vpaes_schedule_192_smear
|
|||
|
+ bl _vpaes_schedule_mangle // save key n+1
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ cbz $inp, .Lschedule_mangle_last
|
|||
|
+ bl _vpaes_schedule_mangle // save key n+2
|
|||
|
+ bl _vpaes_schedule_192_smear
|
|||
|
+ b .Loop_schedule_192
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_256
|
|||
|
+##
|
|||
|
+## 256-bit specific part of key schedule.
|
|||
|
+##
|
|||
|
+## The structure here is very similar to the 128-bit
|
|||
|
+## schedule, but with an additional "low side" in
|
|||
|
+## %xmm6. The low side's rounds are the same as the
|
|||
|
+## high side's, except no rcon and no rotation.
|
|||
|
+##
|
|||
|
+.align 4
|
|||
|
+.Lschedule_256:
|
|||
|
+ ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
|||
|
+ bl _vpaes_schedule_transform // input transform
|
|||
|
+ mov $inp, #7 // mov \$7, %esi
|
|||
|
+
|
|||
|
+.Loop_schedule_256:
|
|||
|
+ sub $inp, $inp, #1 // dec %esi
|
|||
|
+ bl _vpaes_schedule_mangle // output low result
|
|||
|
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
|
|||
|
+
|
|||
|
+ // high round
|
|||
|
+ bl _vpaes_schedule_round
|
|||
|
+ cbz $inp, .Lschedule_mangle_last
|
|||
|
+ bl _vpaes_schedule_mangle
|
|||
|
+
|
|||
|
+ // low round. swap xmm7 and xmm6
|
|||
|
+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
|
|||
|
+ movi v4.16b, #0
|
|||
|
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
|
|||
|
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
|
|||
|
+ bl _vpaes_schedule_low_round
|
|||
|
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
|
|||
|
+
|
|||
|
+ b .Loop_schedule_256
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_mangle_last
|
|||
|
+##
|
|||
|
+## Mangler for last round of key schedule
|
|||
|
+## Mangles %xmm0
|
|||
|
+## when encrypting, outputs out(%xmm0) ^ 63
|
|||
|
+## when decrypting, outputs unskew(%xmm0)
|
|||
|
+##
|
|||
|
+## Always called right before return... jumps to cleanup and exits
|
|||
|
+##
|
|||
|
+.align 4
|
|||
|
+.Lschedule_mangle_last:
|
|||
|
+ // schedule last round key from xmm0
|
|||
|
+ adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
|
|||
|
+ add x11, x11, :lo12:.Lk_deskew
|
|||
|
+
|
|||
|
+ cbnz $dir, .Lschedule_mangle_last_dec
|
|||
|
+
|
|||
|
+ // encrypting
|
|||
|
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
|
|||
|
+ adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
|
|||
|
+ add x11, x11, :lo12:.Lk_opt
|
|||
|
+ add $out, $out, #32 // add \$32, %rdx
|
|||
|
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
|
|||
|
+
|
|||
|
+.Lschedule_mangle_last_dec:
|
|||
|
+ ld1 {v20.2d-v21.2d}, [x11] // reload constants
|
|||
|
+ sub $out, $out, #16 // add \$-16, %rdx
|
|||
|
+ eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
|
|||
|
+ bl _vpaes_schedule_transform // output transform
|
|||
|
+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
|
|||
|
+
|
|||
|
+ // cleanup
|
|||
|
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
|
|||
|
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
|
|||
|
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
|
|||
|
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
|
|||
|
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
|
|||
|
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
|
|||
|
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
|
|||
|
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
|
|||
|
+ ldp x29, x30, [sp],#16
|
|||
|
+ AARCH64_VALIDATE_LINK_REGISTER
|
|||
|
+ ret
|
|||
|
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_192_smear
|
|||
|
+##
|
|||
|
+## Smear the short, low side in the 192-bit key schedule.
|
|||
|
+##
|
|||
|
+## Inputs:
|
|||
|
+## %xmm7: high side, b a x y
|
|||
|
+## %xmm6: low side, d c 0 0
|
|||
|
+## %xmm13: 0
|
|||
|
+##
|
|||
|
+## Outputs:
|
|||
|
+## %xmm6: b+c+d b+c 0 0
|
|||
|
+## %xmm0: b+c+d b+c b a
|
|||
|
+##
|
|||
|
+.type _vpaes_schedule_192_smear,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_192_smear:
|
|||
|
+ movi v1.16b, #0
|
|||
|
+ dup v0.4s, v7.s[3]
|
|||
|
+ ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
|
|||
|
+ ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
|
|||
|
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
|
|||
|
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
|
|||
|
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
|
|||
|
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
|
|||
|
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
|
|||
|
+ ret
|
|||
|
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_round
|
|||
|
+##
|
|||
|
+## Runs one main round of the key schedule on %xmm0, %xmm7
|
|||
|
+##
|
|||
|
+## Specifically, runs subbytes on the high dword of %xmm0
|
|||
|
+## then rotates it by one byte and xors into the low dword of
|
|||
|
+## %xmm7.
|
|||
|
+##
|
|||
|
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
|
|||
|
+## next rcon.
|
|||
|
+##
|
|||
|
+## Smears the dwords of %xmm7 by xoring the low into the
|
|||
|
+## second low, result into third, result into highest.
|
|||
|
+##
|
|||
|
+## Returns results in %xmm7 = %xmm0.
|
|||
|
+## Clobbers %xmm1-%xmm4, %r11.
|
|||
|
+##
|
|||
|
+.type _vpaes_schedule_round,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_round:
|
|||
|
+ // extract rcon from xmm8
|
|||
|
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
|
|||
|
+ ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
|
|||
|
+ ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
|
|||
|
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
|
|||
|
+
|
|||
|
+ // rotate
|
|||
|
+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
|
|||
|
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0
|
|||
|
+
|
|||
|
+ // fall through...
|
|||
|
+
|
|||
|
+ // low round: same as high round, but no rotation and no rcon.
|
|||
|
+_vpaes_schedule_low_round:
|
|||
|
+ // smear xmm7
|
|||
|
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
|
|||
|
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
|
|||
|
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4
|
|||
|
+
|
|||
|
+ // subbytes
|
|||
|
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
|
|||
|
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
|
|||
|
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
|
|||
|
+ tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
|
|||
|
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
|
|||
|
+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
|
|||
|
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
|
|||
|
+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
|
|||
|
+ eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
|
|||
|
+ tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
|
|||
|
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
|
|||
|
+ tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
|
|||
|
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
|
|||
|
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
|
|||
|
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
|
|||
|
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
|
|||
|
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
|
|||
|
+
|
|||
|
+ // add in smeared stuff
|
|||
|
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
|
|||
|
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
|
|||
|
+ ret
|
|||
|
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_transform
|
|||
|
+##
|
|||
|
+## Linear-transform %xmm0 according to tables at (%r11)
|
|||
|
+##
|
|||
|
+## Requires that %xmm9 = 0x0F0F... as in preheat
|
|||
|
+## Output in %xmm0
|
|||
|
+## Clobbers %xmm1, %xmm2
|
|||
|
+##
|
|||
|
+.type _vpaes_schedule_transform,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_transform:
|
|||
|
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
|
|||
|
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
|
|||
|
+ // vmovdqa (%r11), %xmm2 # lo
|
|||
|
+ tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
|
|||
|
+ // vmovdqa 16(%r11), %xmm1 # hi
|
|||
|
+ tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
|
|||
|
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
|
|||
|
+ ret
|
|||
|
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
|
|||
|
+
|
|||
|
+##
|
|||
|
+## .aes_schedule_mangle
|
|||
|
+##
|
|||
|
+## Mangle xmm0 from (basis-transformed) standard version
|
|||
|
+## to our version.
|
|||
|
+##
|
|||
|
+## On encrypt,
|
|||
|
+## xor with 0x63
|
|||
|
+## multiply by circulant 0,1,1,1
|
|||
|
+## apply shiftrows transform
|
|||
|
+##
|
|||
|
+## On decrypt,
|
|||
|
+## xor with 0x63
|
|||
|
+## multiply by "inverse mixcolumns" circulant E,B,D,9
|
|||
|
+## deskew
|
|||
|
+## apply shiftrows transform
|
|||
|
+##
|
|||
|
+##
|
|||
|
+## Writes out to (%rdx), and increments or decrements it
|
|||
|
+## Keeps track of round number mod 4 in %r8
|
|||
|
+## Preserves xmm0
|
|||
|
+## Clobbers xmm1-xmm5
|
|||
|
+##
|
|||
|
+.type _vpaes_schedule_mangle,%function
|
|||
|
+.align 4
|
|||
|
+_vpaes_schedule_mangle:
|
|||
|
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
|
|||
|
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
|
|||
|
+
|
|||
|
+ // encrypting
|
|||
|
+ eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
|
|||
|
+ add $out, $out, #16 // add \$16, %rdx
|
|||
|
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
|
|||
|
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
|
|||
|
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
|
|||
|
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
|
|||
|
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
|
|||
|
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
|
|||
|
+
|
|||
|
+.Lschedule_mangle_both:
|
|||
|
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
|
|||
|
+ add x8, x8, #64-16 // add \$-16, %r8
|
|||
|
+ and x8, x8, #~(1<<6) // and \$0x30, %r8
|
|||
|
+ st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
|
|||
|
+ ret
|
|||
|
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
|
|||
|
+
|
|||
|
+.globl GFp_vpaes_set_encrypt_key
|
|||
|
+.type GFp_vpaes_set_encrypt_key,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_set_encrypt_key:
|
|||
|
+ AARCH64_SIGN_LINK_REGISTER
|
|||
|
+ stp x29,x30,[sp,#-16]!
|
|||
|
+ add x29,sp,#0
|
|||
|
+ stp d8,d9,[sp,#-16]! // ABI spec says so
|
|||
|
+
|
|||
|
+ lsr w9, $bits, #5 // shr \$5,%eax
|
|||
|
+ add w9, w9, #5 // add \$5,%eax
|
|||
|
+ str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
|
|||
|
+
|
|||
|
+ mov $dir, #0 // mov \$0,%ecx
|
|||
|
+ mov x8, #0x30 // mov \$0x30,%r8d
|
|||
|
+ bl _vpaes_schedule_core
|
|||
|
+ eor x0, x0, x0
|
|||
|
+
|
|||
|
+ ldp d8,d9,[sp],#16
|
|||
|
+ ldp x29,x30,[sp],#16
|
|||
|
+ AARCH64_VALIDATE_LINK_REGISTER
|
|||
|
+ ret
|
|||
|
+.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
|
|||
|
+___
|
|||
|
+}
|
|||
|
+{
|
|||
|
+my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4));
|
|||
|
+my ($ctr, $ctr_tmp) = ("w6", "w7");
|
|||
|
+
|
|||
|
+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
|
|||
|
+# const AES_KEY *key, const uint8_t ivec[16]);
|
|||
|
+$code.=<<___;
|
|||
|
+.globl GFp_vpaes_ctr32_encrypt_blocks
|
|||
|
+.type GFp_vpaes_ctr32_encrypt_blocks,%function
|
|||
|
+.align 4
|
|||
|
+GFp_vpaes_ctr32_encrypt_blocks:
|
|||
|
+ AARCH64_SIGN_LINK_REGISTER
|
|||
|
+ stp x29,x30,[sp,#-16]!
|
|||
|
+ add x29,sp,#0
|
|||
|
+ stp d8,d9,[sp,#-16]! // ABI spec says so
|
|||
|
+ stp d10,d11,[sp,#-16]!
|
|||
|
+ stp d12,d13,[sp,#-16]!
|
|||
|
+ stp d14,d15,[sp,#-16]!
|
|||
|
+
|
|||
|
+ cbz $len, .Lctr32_done
|
|||
|
+
|
|||
|
+ // Note, unlike the other functions, $len here is measured in blocks,
|
|||
|
+ // not bytes.
|
|||
|
+ mov x17, $len
|
|||
|
+ mov x2, $key
|
|||
|
+
|
|||
|
+ // Load the IV and counter portion.
|
|||
|
+ ldr $ctr, [$ivec, #12]
|
|||
|
+ ld1 {v7.16b}, [$ivec]
|
|||
|
+
|
|||
|
+ bl _vpaes_encrypt_preheat
|
|||
|
+ tst x17, #1
|
|||
|
+ rev $ctr, $ctr // The counter is big-endian.
|
|||
|
+ b.eq .Lctr32_prep_loop
|
|||
|
+
|
|||
|
+ // Handle one block so the remaining block count is even for
|
|||
|
+ // _vpaes_encrypt_2x.
|
|||
|
+ ld1 {v6.16b}, [$inp], #16 // Load input ahead of time
|
|||
|
+ bl _vpaes_encrypt_core
|
|||
|
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
|
|||
|
+ st1 {v0.16b}, [$out], #16
|
|||
|
+ subs x17, x17, #1
|
|||
|
+ // Update the counter.
|
|||
|
+ add $ctr, $ctr, #1
|
|||
|
+ rev $ctr_tmp, $ctr
|
|||
|
+ mov v7.s[3], $ctr_tmp
|
|||
|
+ b.ls .Lctr32_done
|
|||
|
+
|
|||
|
+.Lctr32_prep_loop:
|
|||
|
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
|
|||
|
+ // uses v14 and v15.
|
|||
|
+ mov v15.16b, v7.16b
|
|||
|
+ mov v14.16b, v7.16b
|
|||
|
+ add $ctr, $ctr, #1
|
|||
|
+ rev $ctr_tmp, $ctr
|
|||
|
+ mov v15.s[3], $ctr_tmp
|
|||
|
+
|
|||
|
+.Lctr32_loop:
|
|||
|
+ ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time
|
|||
|
+ bl _vpaes_encrypt_2x
|
|||
|
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
|
|||
|
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
|
|||
|
+ st1 {v0.16b,v1.16b}, [$out], #32
|
|||
|
+ subs x17, x17, #2
|
|||
|
+ // Update the counter.
|
|||
|
+ add $ctr_tmp, $ctr, #1
|
|||
|
+ add $ctr, $ctr, #2
|
|||
|
+ rev $ctr_tmp, $ctr_tmp
|
|||
|
+ mov v14.s[3], $ctr_tmp
|
|||
|
+ rev $ctr_tmp, $ctr
|
|||
|
+ mov v15.s[3], $ctr_tmp
|
|||
|
+ b.hi .Lctr32_loop
|
|||
|
+
|
|||
|
+.Lctr32_done:
|
|||
|
+ ldp d14,d15,[sp],#16
|
|||
|
+ ldp d12,d13,[sp],#16
|
|||
|
+ ldp d10,d11,[sp],#16
|
|||
|
+ ldp d8,d9,[sp],#16
|
|||
|
+ ldp x29,x30,[sp],#16
|
|||
|
+ AARCH64_VALIDATE_LINK_REGISTER
|
|||
|
+ ret
|
|||
|
+.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
|
|||
|
+___
|
|||
|
+}
|
|||
|
+
|
|||
|
+print $code;
|
|||
|
+
|
|||
|
+close STDOUT or die "error closing STDOUT";
|
|||
|
diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
|
|||
|
new file mode 100644
|
|||
|
index 0000000..7e52ad6
|
|||
|
--- /dev/null
|
|||
|
+++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
|
|||
|
@@ -0,0 +1,294 @@
|
|||
|
+#! /usr/bin/env perl
|
|||
|
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|||
|
+#
|
|||
|
+# Licensed under the OpenSSL license (the "License"). You may not use
|
|||
|
+# this file except in compliance with the License. You can obtain a copy
|
|||
|
+# in the file LICENSE in the source distribution or at
|
|||
|
+# https://www.openssl.org/source/license.html
|
|||
|
+
|
|||
|
+# ====================================================================
|
|||
|
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|||
|
+# project. The module is, however, dual licensed under OpenSSL and
|
|||
|
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|||
|
+# details see http://www.openssl.org/~appro/cryptogams/.
|
|||
|
+# ====================================================================
|
|||
|
+
|
|||
|
+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
|
|||
|
+# implements the multiplication algorithm described in:
|
|||
|
+#
|
|||
|
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
|||
|
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
|||
|
+#
|
|||
|
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
|||
|
+#
|
|||
|
+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
|
|||
|
+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
|
|||
|
+# NEON, the low and high halves of the 128-bit register q0 are accessible as
|
|||
|
+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
|
|||
|
+# vN. Where the 32-bit version would use the upper half, this file must keep
|
|||
|
+# halves in separate registers.
|
|||
|
+#
|
|||
|
+# The other distinction is in syntax. 32-bit NEON embeds lane information in the
|
|||
|
+# instruction name, while AArch64 uses suffixes on the registers. For instance,
|
|||
|
+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
|
|||
|
+#
|
|||
|
+# vshl.i64 q0, q0, #1
|
|||
|
+#
|
|||
|
+# in 64-bit, it would be written:
|
|||
|
+#
|
|||
|
+# shl v0.2d, v0.2d, #1
|
|||
|
+#
|
|||
|
+# See Programmer's Guide for ARMv8-A, section 7 for details.
|
|||
|
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
|
|||
|
+#
|
|||
|
+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
|
|||
|
+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
|
|||
|
+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
|
|||
|
+# polynomial and is conditioned on the PMULL extension. This file emulates the
|
|||
|
+# latter with the former.
|
|||
|
+
|
|||
|
+use strict;
|
|||
|
+
|
|||
|
+my $flavour = shift;
|
|||
|
+my $output;
|
|||
|
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
|||
|
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
|||
|
+
|
|||
|
+if ($flavour && $flavour ne "void") {
|
|||
|
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/;
|
|||
|
+ my $dir = $1;
|
|||
|
+ my $xlate;
|
|||
|
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|||
|
+ ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|||
|
+ die "can't locate arm-xlate.pl";
|
|||
|
+
|
|||
|
+ open OUT,"| \"$^X\" $xlate $flavour $output";
|
|||
|
+ *STDOUT=*OUT;
|
|||
|
+} else {
|
|||
|
+ open OUT,">$output";
|
|||
|
+ *STDOUT=*OUT;
|
|||
|
+}
|
|||
|
+
|
|||
|
+my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block
|
|||
|
+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
|
|||
|
+my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
|
|||
|
+# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
|
|||
|
+# to spare.
|
|||
|
+my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
|
|||
|
+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
|
|||
|
+my ($k48_k32, $k16_k0) = map("v$_", (24..25));
|
|||
|
+
|
|||
|
+my $code = "";
|
|||
|
+
|
|||
|
+# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
|
|||
|
+# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
|
|||
|
+sub clmul64x64 {
|
|||
|
+my ($r, $a, $b) = @_;
|
|||
|
+$code .= <<___;
|
|||
|
+ ext $t0.8b, $a.8b, $a.8b, #1 // A1
|
|||
|
+ pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
|
|||
|
+ ext $r.8b, $b.8b, $b.8b, #1 // B1
|
|||
|
+ pmull $r.8h, $a.8b, $r.8b // E = A*B1
|
|||
|
+ ext $t1.8b, $a.8b, $a.8b, #2 // A2
|
|||
|
+ pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
|
|||
|
+ ext $t3.8b, $b.8b, $b.8b, #2 // B2
|
|||
|
+ pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
|
|||
|
+ ext $t2.8b, $a.8b, $a.8b, #3 // A3
|
|||
|
+ eor $t0.16b, $t0.16b, $r.16b // L = E + F
|
|||
|
+ pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
|
|||
|
+ ext $r.8b, $b.8b, $b.8b, #3 // B3
|
|||
|
+ eor $t1.16b, $t1.16b, $t3.16b // M = G + H
|
|||
|
+ pmull $r.8h, $a.8b, $r.8b // I = A*B3
|
|||
|
+
|
|||
|
+ // Here we diverge from the 32-bit version. It computes the following
|
|||
|
+ // (instructions reordered for clarity):
|
|||
|
+ //
|
|||
|
+ // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
|
|||
|
+ // vand \$t0#hi, \$t0#hi, \$k48
|
|||
|
+ // veor \$t0#lo, \$t0#lo, \$t0#hi
|
|||
|
+ //
|
|||
|
+ // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
|
|||
|
+ // vand \$t1#hi, \$t1#hi, \$k32
|
|||
|
+ // veor \$t1#lo, \$t1#lo, \$t1#hi
|
|||
|
+ //
|
|||
|
+ // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
|
|||
|
+ // vand \$t2#hi, \$t2#hi, \$k16
|
|||
|
+ // veor \$t2#lo, \$t2#lo, \$t2#hi
|
|||
|
+ //
|
|||
|
+ // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
|
|||
|
+ // vmov.i64 \$t3#hi, #0
|
|||
|
+ //
|
|||
|
+ // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
|
|||
|
+ // upper halves of SIMD registers, so we must split each half into
|
|||
|
+ // separate registers. To compensate, we pair computations up and
|
|||
|
+ // parallelize.
|
|||
|
+
|
|||
|
+ ext $t3.8b, $b.8b, $b.8b, #4 // B4
|
|||
|
+ eor $t2.16b, $t2.16b, $r.16b // N = I + J
|
|||
|
+ pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
|
|||
|
+
|
|||
|
+ // This can probably be scheduled more efficiently. For now, we just
|
|||
|
+ // pair up independent instructions.
|
|||
|
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
|
|||
|
+ zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
|
|||
|
+ zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
|
|||
|
+ zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
|
|||
|
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
|
|||
|
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
|
|||
|
+ and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
|
|||
|
+ and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
|
|||
|
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
|
|||
|
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
|
|||
|
+ zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
|
|||
|
+ zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
|
|||
|
+ zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
|
|||
|
+ zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
|
|||
|
+
|
|||
|
+ ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
|
|||
|
+ ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
|
|||
|
+ pmull $r.8h, $a.8b, $b.8b // D = A*B
|
|||
|
+ ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
|
|||
|
+ ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
|
|||
|
+ eor $t0.16b, $t0.16b, $t1.16b
|
|||
|
+ eor $t2.16b, $t2.16b, $t3.16b
|
|||
|
+ eor $r.16b, $r.16b, $t0.16b
|
|||
|
+ eor $r.16b, $r.16b, $t2.16b
|
|||
|
+___
|
|||
|
+}
|
|||
|
+
|
|||
|
+$code .= <<___;
|
|||
|
+#include <GFp/arm_arch.h>
|
|||
|
+
|
|||
|
+.text
|
|||
|
+
|
|||
|
+.global GFp_gcm_init_neon
|
|||
|
+.type GFp_gcm_init_neon,%function
|
|||
|
+.align 4
|
|||
|
+GFp_gcm_init_neon:
|
|||
|
+ AARCH64_VALID_CALL_TARGET
|
|||
|
+ // This function is adapted from gcm_init_v8. xC2 is t3.
|
|||
|
+ ld1 {$t1.2d}, [x1] // load H
|
|||
|
+ movi $t3.16b, #0xe1
|
|||
|
+ shl $t3.2d, $t3.2d, #57 // 0xc2.0
|
|||
|
+ ext $INlo.16b, $t1.16b, $t1.16b, #8
|
|||
|
+ ushr $t2.2d, $t3.2d, #63
|
|||
|
+ dup $t1.4s, $t1.s[1]
|
|||
|
+ ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
|
|||
|
+ ushr $t2.2d, $INlo.2d, #63
|
|||
|
+ sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
|
|||
|
+ and $t2.16b, $t2.16b, $t0.16b
|
|||
|
+ shl $INlo.2d, $INlo.2d, #1
|
|||
|
+ ext $t2.16b, $t2.16b, $t2.16b, #8
|
|||
|
+ and $t0.16b, $t0.16b, $t1.16b
|
|||
|
+ orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
|
|||
|
+ eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
|
|||
|
+ st1 {$Hlo.2d}, [x0] // store Htable[0]
|
|||
|
+ ret
|
|||
|
+.size GFp_gcm_init_neon,.-GFp_gcm_init_neon
|
|||
|
+
|
|||
|
+.global GFp_gcm_gmult_neon
|
|||
|
+.type GFp_gcm_gmult_neon,%function
|
|||
|
+.align 4
|
|||
|
+GFp_gcm_gmult_neon:
|
|||
|
+ AARCH64_VALID_CALL_TARGET
|
|||
|
+ ld1 {$INlo.16b}, [$Xi] // load Xi
|
|||
|
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
|
|||
|
+ ld1 {$Hhi.1d}, [$Htbl]
|
|||
|
+ adrp x9, :pg_hi21:.Lmasks // load constants
|
|||
|
+ add x9, x9, :lo12:.Lmasks
|
|||
|
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
|
|||
|
+ rev64 $INlo.16b, $INlo.16b // byteswap Xi
|
|||
|
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
|
|||
|
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
|
|||
|
+
|
|||
|
+ mov $len, #16
|
|||
|
+ b .Lgmult_neon
|
|||
|
+.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon
|
|||
|
+
|
|||
|
+.global GFp_gcm_ghash_neon
|
|||
|
+.type GFp_gcm_ghash_neon,%function
|
|||
|
+.align 4
|
|||
|
+GFp_gcm_ghash_neon:
|
|||
|
+ AARCH64_VALID_CALL_TARGET
|
|||
|
+ ld1 {$Xl.16b}, [$Xi] // load Xi
|
|||
|
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
|
|||
|
+ ld1 {$Hhi.1d}, [$Htbl]
|
|||
|
+ adrp x9, :pg_hi21:.Lmasks // load constants
|
|||
|
+ add x9, x9, :lo12:.Lmasks
|
|||
|
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
|
|||
|
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi
|
|||
|
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
|
|||
|
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
|
|||
|
+
|
|||
|
+.Loop_neon:
|
|||
|
+ ld1 {$INlo.16b}, [$inp], #16 // load inp
|
|||
|
+ rev64 $INlo.16b, $INlo.16b // byteswap inp
|
|||
|
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
|
|||
|
+ eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
|
|||
|
+
|
|||
|
+.Lgmult_neon:
|
|||
|
+ // Split the input into $INlo and $INhi. (The upper halves are unused,
|
|||
|
+ // so it is okay to leave them alone.)
|
|||
|
+ ins $INhi.d[0], $INlo.d[1]
|
|||
|
+___
|
|||
|
+&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo
|
|||
|
+$code .= <<___;
|
|||
|
+ eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
|
|||
|
+___
|
|||
|
+&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
|||
|
+&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi
|
|||
|
+$code .= <<___;
|
|||
|
+ ext $t0.16b, $Xl.16b, $Xh.16b, #8
|
|||
|
+ eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
|
|||
|
+ eor $Xm.16b, $Xm.16b, $Xh.16b
|
|||
|
+ eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
|
|||
|
+ ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
|
|||
|
+ // This is a no-op due to the ins instruction below.
|
|||
|
+ // ins $Xh.d[0], $Xm.d[1]
|
|||
|
+
|
|||
|
+ // equivalent of reduction_avx from ghash-x86_64.pl
|
|||
|
+ shl $t1.2d, $Xl.2d, #57 // 1st phase
|
|||
|
+ shl $t2.2d, $Xl.2d, #62
|
|||
|
+ eor $t2.16b, $t2.16b, $t1.16b //
|
|||
|
+ shl $t1.2d, $Xl.2d, #63
|
|||
|
+ eor $t2.16b, $t2.16b, $t1.16b //
|
|||
|
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
|
|||
|
+ eor $t2.16b, $t2.16b, $Xm.16b
|
|||
|
+ ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
|
|||
|
+ ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
|
|||
|
+
|
|||
|
+ ushr $t2.2d, $Xl.2d, #1 // 2nd phase
|
|||
|
+ eor $Xh.16b, $Xh.16b,$Xl.16b
|
|||
|
+ eor $Xl.16b, $Xl.16b,$t2.16b //
|
|||
|
+ ushr $t2.2d, $t2.2d, #6
|
|||
|
+ ushr $Xl.2d, $Xl.2d, #1 //
|
|||
|
+ eor $Xl.16b, $Xl.16b, $Xh.16b //
|
|||
|
+ eor $Xl.16b, $Xl.16b, $t2.16b //
|
|||
|
+
|
|||
|
+ subs $len, $len, #16
|
|||
|
+ bne .Loop_neon
|
|||
|
+
|
|||
|
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
|
|||
|
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
|
|||
|
+ st1 {$Xl.16b}, [$Xi]
|
|||
|
+
|
|||
|
+ ret
|
|||
|
+.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
|
|||
|
+
|
|||
|
+.section .rodata
|
|||
|
+.align 4
|
|||
|
+.Lmasks:
|
|||
|
+.quad 0x0000ffffffffffff // k48
|
|||
|
+.quad 0x00000000ffffffff // k32
|
|||
|
+.quad 0x000000000000ffff // k16
|
|||
|
+.quad 0x0000000000000000 // k0
|
|||
|
+.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
|
|||
|
+.align 2
|
|||
|
+___
|
|||
|
+
|
|||
|
+foreach (split("\n",$code)) {
|
|||
|
+ s/\`([^\`]*)\`/eval $1/geo;
|
|||
|
+
|
|||
|
+ print $_,"\n";
|
|||
|
+}
|
|||
|
+close STDOUT or die "error closing STDOUT"; # enforce flush
|
|||
|
--
|
|||
|
Efraim Flashner <efraim@flashner.co.il> רנשלפ םירפא
|
|||
|
GPG key = A28B F40C 3E55 1372 662D 14F7 41AA E7DC CA3D 8351
|
|||
|
Confidentiality cannot be guaranteed on emails sent or received unencrypted
|
|||
|
|