Bug Summary

File: jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
Warning: line 1797, column 5
Value stored to 'index' is never read
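
The message comes from clang's deadcode.DeadStores checker, which flags an assignment whose value is overwritten or goes out of scope before it is ever read. The statement flagged at line 1797 lies beyond the lines reproduced below; the following hypothetical, self-contained sketch (function and variable names invented for illustration) shows the shape of code that triggers this diagnostic:

// Hypothetical illustration of a dead store; not the code at line 1797.
static int dead_store_demo(int start) {
  int index = start * 16;   // "Value stored to 'index' is never read":
  index = start + 1;        // the first store is overwritten before any read
  return index;
}

Such stores are usually harmless leftovers, but the checker reports them because they can also hide a missing use of the computed value.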

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name macroAssembler_x86_aes.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -mthread-model posix -fno-delete-null-pointer-checks -mframe-pointer=all -relaxed-aliasing -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/libjvm/objs/precompiled -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D _GNU_SOURCE -D _REENTRANT -D LIBC=gnu -D LINUX -D VM_LITTLE_ENDIAN -D _LP64=1 -D ASSERT -D CHECK_UNHANDLED_OOPS -D TARGET_ARCH_x86 -D INCLUDE_SUFFIX_OS=_linux -D INCLUDE_SUFFIX_CPU=_x86 -D INCLUDE_SUFFIX_COMPILER=_gcc -D TARGET_COMPILER_gcc -D AMD64 -D HOTSPOT_LIB_ARCH="amd64" -D COMPILER1 -D COMPILER2 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc/adfiles -I /home/daniel/Projects/java/jdk/src/hotspot/share -I /home/daniel/Projects/java/jdk/src/hotspot/os/linux -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix -I /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86 -I /home/daniel/Projects/java/jdk/src/hotspot/os_cpu/linux_x86 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc -I /home/daniel/Projects/java/jdk/src/hotspot/share/precompiled -I /home/daniel/Projects/java/jdk/src/hotspot/share/include -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix/include -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/support/modules_include/java.base -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/support/modules_include/java.base/linux -I /home/daniel/Projects/java/jdk/src/java.base/share/native/libjimage -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc/adfiles -I /home/daniel/Projects/java/jdk/src/hotspot/share -I /home/daniel/Projects/java/jdk/src/hotspot/os/linux -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix -I /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86 -I /home/daniel/Projects/java/jdk/src/hotspot/os_cpu/linux_x86 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc -D _FORTIFY_SOURCE=2 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem 
/usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-format-zero-length -Wno-unused-parameter -Wno-unused -Wno-parentheses -Wno-comment -Wno-unknown-pragmas -Wno-address -Wno-delete-non-virtual-dtor -Wno-char-subscripts -Wno-array-bounds -Wno-int-in-bool-context -Wno-ignored-qualifiers -Wno-missing-field-initializers -Wno-implicit-fallthrough -Wno-empty-body -Wno-strict-overflow -Wno-sequence-point -Wno-maybe-uninitialized -Wno-misleading-indentation -Wno-cast-function-type -Wno-shift-negative-value -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /home/daniel/Projects/java/jdk/make/hotspot -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -stack-protector 1 -fno-rtti -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -o /home/daniel/Projects/java/scan/2021-12-21-193737-8510-1 -x c++ /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
1/*
2* Copyright (c) 2019, 2021, Intel Corporation. All rights reserved.
3*
4* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5*
6* This code is free software; you can redistribute it and/or modify it
7* under the terms of the GNU General Public License version 2 only, as
8* published by the Free Software Foundation.
9*
10* This code is distributed in the hope that it will be useful, but WITHOUT
11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13* version 2 for more details (a copy is included in the LICENSE file that
14* accompanied this code).
15*
16* You should have received a copy of the GNU General Public License version
17* 2 along with this work; if not, write to the Free Software Foundation,
18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19*
20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21* or visit www.oracle.com if you need additional information or have any
22* questions.
23*
24*/
25
26#include "precompiled.hpp"
27#include "asm/assembler.hpp"
28#include "asm/assembler.inline.hpp"
29#include "runtime/stubRoutines.hpp"
30#include "macroAssembler_x86.hpp"
31
32#ifdef _LP64
33
34void MacroAssembler::roundEnc(XMMRegister key, int rnum) {
35 for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) {
36 vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
37 }
38}
39
40void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) {
41 for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) {
42 vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
43 }
44}
45
46void MacroAssembler::roundDec(XMMRegister key, int rnum) {
47 for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) {
48 vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
49 }
50}
51
52void MacroAssembler::lastroundDec(XMMRegister key, int rnum) {
53 for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) {
54 vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
55 }
56}
57
58// Load key and shuffle operation
59void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
60 movdqu(xmmdst, Address(key, offset));
61 if (xmm_shuf_mask != NULL) {
62 pshufb(xmmdst, xmm_shuf_mask);
63 } else {
64 pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
65 }
66 evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
67}
68
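
ev_load_key above loads one 16-byte round key, byte-shuffles it with the key shuffle mask (the caller-supplied register or the stub constant), and broadcasts the shuffled 128 bits across a 512-bit register with evshufi64x2. A rough scalar sketch of that data movement, for orientation only (load_key_sketch and its parameters are illustrative names, not HotSpot code, and pshufb's zeroing of lanes whose mask byte has the high bit set is ignored):

#include <cstdint>
#include <cstring>

// Illustrative sketch of the movdqu + pshufb + evshufi64x2 sequence in
// ev_load_key: load a 16-byte round key, permute its bytes with a shuffle
// mask, then replicate the 128-bit result four times to fill 512 bits.
static void load_key_sketch(const uint8_t key[16], const uint8_t shuf_mask[16],
                            uint8_t out512[64]) {
  uint8_t shuffled[16];
  for (int i = 0; i < 16; i++) {
    shuffled[i] = key[shuf_mask[i] & 0x0F];   // pshufb-style byte permutation
  }
  for (int copy = 0; copy < 4; copy++) {      // 128-bit -> 512-bit broadcast
    std::memcpy(out512 + copy * 16, shuffled, 16);
  }
}
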
69// AES-ECB Encrypt Operation
70void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
71
72 const Register pos = rax;
73 const Register rounds = r12;
74
75 Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
76 push(r13);
77 push(r12);
78
79 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
80 // context for the registers used, where all instructions below are using 128-bit mode
81 // On EVEX without VL and BW, these instructions will all be AVX.
82 if (VM_Version::supports_avx512vlbw()) {
83 movl(rax, 0xffff);
84 kmovql(k1, rax);
85 }
86 push(len); // Save
87 push(rbx);
88
89 vzeroupper();
90
91 xorptr(pos, pos);
92
 93 // Calculate number of rounds based on key length (128, 192, 256): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
94 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
95
96 // Load Key shuf mask
97 const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
98 movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
99
100 // Load and shuffle key based on number of rounds
101 ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask);
102 ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
103 ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
104 ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask);
105 ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
106 ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
107 ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
108 ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
109 ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
110 ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
111 ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask);
112 cmpl(rounds, 52);
113 jcc(Assembler::greaterEqual, KEY_192);
114 jmp(Loop_start);
115
116 bind(KEY_192);
117 ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
118 ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
119 cmpl(rounds, 60);
120 jcc(Assembler::equal, KEY_256);
121 jmp(Loop_start);
122
123 bind(KEY_256);
124 ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
125 ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
126
127 bind(Loop_start);
128 movq(rbx, len);
129 // Divide length by 16 to convert it to number of blocks
130 shrq(len, 4);
131 shlq(rbx, 60);
132 jcc(Assembler::equal, NO_PARTS);
133 addq(len, 1);
134 // Check if number of blocks is greater than or equal to 32
135 // If true, 512 bytes are processed at a time (code marked by label LOOP)
136 // If not, 16 bytes are processed (code marked by REMAINDER label)
137 bind(NO_PARTS);
138 movq(rbx, len);
139 shrq(len, 5);
140 jcc(Assembler::equal, REMAINDER);
141 movl(r13, len);
142 // Compute number of blocks that will be processed 512 bytes at a time
143 // Subtract this from the total number of blocks which will then be processed by REMAINDER loop
144 shlq(r13, 5);
145 subq(rbx, r13);
146 //Begin processing 512 bytes
147 bind(LOOP);
148 // Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7
149 evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
150 evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
151 evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
152 evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
153 evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
154 evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
155 evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
156 evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
157 // Xor with the first round key
158 evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit);
159 evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit);
160 evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit);
161 evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit);
162 evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit);
163 evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit);
164 evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit);
165 evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit);
166 // 9 Aes encode round operations
167 roundEnc(xmm9, 7);
168 roundEnc(xmm10, 7);
169 roundEnc(xmm23, 7);
170 roundEnc(xmm12, 7);
171 roundEnc(xmm13, 7);
172 roundEnc(xmm14, 7);
173 roundEnc(xmm15, 7);
174 roundEnc(xmm16, 7);
175 roundEnc(xmm17, 7);
176 cmpl(rounds, 52);
177 jcc(Assembler::aboveEqual, AES192);
178 // Aesenclast round operation for keysize = 128
179 lastroundEnc(xmm24, 7);
180 jmp(END_LOOP);
181 //Additional 2 rounds of Aesenc operation for keysize = 192
182 bind(AES192);
183 roundEnc(xmm24, 7);
184 roundEnc(xmm19, 7);
185 cmpl(rounds, 60);
186 jcc(Assembler::aboveEqual, AES256);
187 // Aesenclast round for keysize = 192
188 lastroundEnc(xmm20, 7);
189 jmp(END_LOOP);
190 // 2 rounds of Aesenc operation and Aesenclast for keysize = 256
191 bind(AES256);
192 roundEnc(xmm20, 7);
193 roundEnc(xmm21, 7);
194 lastroundEnc(xmm22, 7);
195
196 bind(END_LOOP);
197 // Move 512 bytes of CT to destination
198 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
199 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
200 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
201 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
202 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
203 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
204 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
205 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
206
207 addq(pos, 512);
208 decq(len);
209 jcc(Assembler::notEqual, LOOP);
210
211 bind(REMAINDER);
212 vzeroupper();
213 cmpq(rbx, 0);
214 jcc(Assembler::equal, END);
215 // Process 16 bytes at a time
216 bind(LOOP2);
217 movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
218 vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit);
219 // xmm2 contains shuffled key for Aesenclast operation.
220 vmovdqu(xmm2, xmm24);
221
222 vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
223 vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
224 vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit);
225 vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
226 vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
227 vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
228 vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
229 vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
230 vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
231
232 cmpl(rounds, 52);
233 jcc(Assembler::below, LAST2);
234 vmovdqu(xmm2, xmm20);
235 vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit);
236 vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
237 cmpl(rounds, 60);
238 jcc(Assembler::below, LAST2);
239 vmovdqu(xmm2, xmm22);
240 vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
241 vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
242
243 bind(LAST2);
244 // Aesenclast round
245 vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
246 // Write 16 bytes of CT to destination
247 movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
248 addq(pos, 16);
249 decq(rbx);
250 jcc(Assembler::notEqual, LOOP2);
251
252 bind(END);
253 // Zero out the round keys
254 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
255 evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
256 evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
257 evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
258 evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
259 evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
260 evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
261 evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
262 evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
263 evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
264 evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
265 cmpl(rounds, 44);
266 jcc(Assembler::belowEqual, EXIT);
267 evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
268 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
269 cmpl(rounds, 52);
270 jcc(Assembler::belowEqual, EXIT);
271 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
272 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
273 bind(EXIT);
274 pop(rbx);
275 pop(rax); // return length
276 pop(r12);
277 pop(r13);
278}
279
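
Two pieces of arithmetic drive the control flow in aesecb_encrypt above: the expanded-key length in 32-bit words (44, 52 or 60) selects the number of AES rounds, and the byte length is split into 512-byte chunks handled under LOOP plus a 16-bytes-at-a-time remainder handled under LOOP2. A plain C++ sketch of that bookkeeping, assuming the same 44/52/60 encoding (ecb_partition_sketch is an illustrative name; it performs no encryption):

#include <cassert>

// Illustrative sketch of the length bookkeeping in aesecb_encrypt.
// key_len_ints is the expanded key length in 32-bit words: 44, 52 or 60.
static void ecb_partition_sketch(int key_len_ints, long len_bytes) {
  int aes_rounds = key_len_ints / 4 - 1;    // 44 -> 10, 52 -> 12, 60 -> 14
  long blocks    = (len_bytes + 15) / 16;   // partial block rounded up
  long bulk      = blocks / 32;             // 512-byte iterations (label LOOP)
  long remainder = blocks - bulk * 32;      // 16-byte iterations (label LOOP2)
  assert(aes_rounds == 10 || aes_rounds == 12 || aes_rounds == 14);
  (void)remainder;
}
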
280// AES-ECB Decrypt Operation
281void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) {
282
283 Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
284 const Register pos = rax;
285 const Register rounds = r12;
286 push(r13);
287 push(r12);
288
289 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
290 // context for the registers used, where all instructions below are using 128-bit mode
291 // On EVEX without VL and BW, these instructions will all be AVX.
292 if (VM_Version::supports_avx512vlbw()) {
293 movl(rax, 0xffff);
294 kmovql(k1, rax);
295 }
296
297 push(len); // Save
298 push(rbx);
299
300 vzeroupper();
301
302 xorptr(pos, pos);
 303 // Calculate number of rounds based on key length (128, 192, 256): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
304 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
305
306 // Load Key shuf mask
307 const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
308 movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
309
310 // Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption.
311 // So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
312 ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
313 ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
314 ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
315 ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
316 ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
317 ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
318 ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
319 ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
320 ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
321 ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
322 ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
323 cmpl(rounds, 52);
324 jcc(Assembler::greaterEqual, KEY_192);
325 jmp(Loop_start);
326
327 bind(KEY_192);
328 ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
329 ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
330 cmpl(rounds, 60);
331 jcc(Assembler::equal, KEY_256);
332 jmp(Loop_start);
333
334 bind(KEY_256);
335 ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
336 ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
337 bind(Loop_start);
338 movq(rbx, len);
339 // Convert input length to number of blocks
340 shrq(len, 4);
341 shlq(rbx, 60);
342 jcc(Assembler::equal, NO_PARTS);
343 addq(len, 1);
 344 // Check if number of blocks is greater than or equal to 32
 345 // If true, 512 bytes are processed at a time (code marked by label LOOP)
346 // If not, 16 bytes are processed (code marked by label REMAINDER)
347 bind(NO_PARTS);
348 movq(rbx, len);
349 shrq(len, 5);
350 jcc(Assembler::equal, REMAINDER);
351 movl(r13, len);
352 // Compute number of blocks that will be processed as 512 bytes at a time
353 // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
354 shlq(r13, 5);
355 subq(rbx, r13);
356
357 bind(LOOP);
358 // Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7
359 evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
360 evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
361 evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
362 evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
363 evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
364 evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
365 evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
366 evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
367 // Xor with the first round key
368 evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit);
369 evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit);
370 evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit);
371 evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit);
372 evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit);
373 evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit);
374 evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit);
375 evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit);
376 // 9 rounds of Aesdec
377 roundDec(xmm10, 7);
378 roundDec(xmm11, 7);
379 roundDec(xmm12, 7);
380 roundDec(xmm13, 7);
381 roundDec(xmm14, 7);
382 roundDec(xmm15, 7);
383 roundDec(xmm16, 7);
384 roundDec(xmm17, 7);
385 roundDec(xmm18, 7);
386 cmpl(rounds, 52);
387 jcc(Assembler::aboveEqual, AES192);
388 // Aesdeclast round for keysize = 128
389 lastroundDec(xmm27, 7);
390 jmp(END_LOOP);
391
392 bind(AES192);
393 // 2 Additional rounds for keysize = 192
394 roundDec(xmm19, 7);
395 roundDec(xmm20, 7);
396 cmpl(rounds, 60);
397 jcc(Assembler::aboveEqual, AES256);
398 // Aesdeclast round for keysize = 192
399 lastroundDec(xmm27, 7);
400 jmp(END_LOOP);
401 bind(AES256);
402 // 2 Additional rounds and Aesdeclast for keysize = 256
403 roundDec(xmm21, 7);
404 roundDec(xmm22, 7);
405 lastroundDec(xmm27, 7);
406
407 bind(END_LOOP);
408 // Write 512 bytes of PT to the destination
409 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
410 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
411 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
412 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
413 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
414 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
415 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
416 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
417
418 addq(pos, 512);
419 decq(len);
420 jcc(Assembler::notEqual, LOOP);
421
422 bind(REMAINDER);
423 vzeroupper();
424 cmpq(rbx, 0);
425 jcc(Assembler::equal, END);
426 // Process 16 bytes at a time
427 bind(LOOP2);
428 movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
429 vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
430 // xmm2 contains shuffled key for Aesdeclast operation.
431 vmovdqu(xmm2, xmm27);
432
433 vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
434 vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit);
435 vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
436 vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
437 vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
438 vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
439 vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
440 vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
441 vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit);
442
443 cmpl(rounds, 52);
444 jcc(Assembler::below, LAST2);
445 vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
446 vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
447 cmpl(rounds, 60);
448 jcc(Assembler::below, LAST2);
449 vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
450 vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit);
451
452 bind(LAST2);
453 // Aesdeclast round
454 vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
455 // Write 16 bytes of PT to destination
456 movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
457 addq(pos, 16);
458 decq(rbx);
459 jcc(Assembler::notEqual, LOOP2);
460
461 bind(END);
462 // Zero out the round keys
463 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
464 evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
465 evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
466 evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit);
467 evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
468 evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
469 evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
470 evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
471 evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
472 evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
473 evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
474 evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
475 cmpl(rounds, 44);
476 jcc(Assembler::belowEqual, EXIT);
477 evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
478 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
479 cmpl(rounds, 52);
480 jcc(Assembler::belowEqual, EXIT);
481 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
482 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
483 bind(EXIT);
484 pop(rbx);
485 pop(rax); // return length
486 pop(r12);
487 pop(r13);
488}
489
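
The decryption path mirrors the encryption path, with one difference called out in the comment before the key loads: the Java-expanded key is rotated one 16-byte position, so the first decryption key is read from offset 1 * 16 and the aesdeclast key from offset 0. A small sketch of the offset each round reads, indexing rounds from 0 (the initial XOR) through the final aesdeclast and assuming the same 44/52/60 key-length encoding (dec_round_key_offset is an illustrative name):

// Illustrative sketch of the rotated round-key layout used by aesecb_decrypt.
static int dec_round_key_offset(int key_len_ints, int round) {
  int rounds = key_len_ints / 4 - 1;          // 10, 12 or 14
  return (round < rounds) ? (round + 1) * 16  // initial XOR and aesdec rounds
                          : 0;                // aesdeclast key sits at offset 0
}
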
490// Multiply 128 x 128 bits, using 4 pclmulqdq operations
491void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
492 XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
493 movdqu(xmm15, Address(htbl, i * 16));
494 vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
495 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
496 vpclmulldq(tmp3, data, xmm15); // 0x00
497 vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
498 vpclmulhdq(tmp3, data, xmm15); // 0x11
499 vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
500 vpclmullqhqdq(tmp3, data, xmm15); // 0x10
501 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
502}
503
504// Multiply two 128 bit numbers resulting in a 256 bit value
505// Result of the multiplication followed by reduction stored in state
506void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
507 const XMMRegister tmp1 = xmm4;
508 const XMMRegister tmp2 = xmm5;
509 const XMMRegister tmp3 = xmm6;
510 const XMMRegister tmp4 = xmm7;
511
512 vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0)
513 vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1)
514 vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0)
515 vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1)
516
517 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
518
519 vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
520 vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
521 vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
522 vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
523 // Follows the reduction technique mentioned in
524 // Shift-XOR reduction described in Gueron-Kounavis May 2010
525 // First phase of reduction
526 //
527 vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed right shift shifting << 31
528 vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed right shift shifting << 30
529 vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit);// packed right shift shifting << 25
530 // xor the shifted versions
531 vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
532 vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
533 vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
534 vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
535 vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete
536 //
537 // Second phase of the reduction
538 //
539 vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);// packed left shifting >> 1
540 vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);// packed left shifting >> 2
541 vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);// packed left shifting >> 7
542 vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions
543 vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
544 vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
545 vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
546 vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state
547 ret(0);
548}
549
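
gfmul above is a schoolbook 128 x 128 multiply: the four vpclmul* calls produce a0*b0 (selector 0x00), a1*b1 (0x11) and the two cross terms (0x01 and 0x10); the XOR of the cross terms is split across the middle of the 256-bit product, which is then reduced with the shift-XOR technique. For readers unfamiliar with carry-less multiplication, here is a scalar sketch of the 64 x 64-bit primitive that each pclmulqdq instruction computes in hardware (clmul64_sketch is an illustrative name):

#include <cstdint>

// Scalar sketch of one carry-less (GF(2) polynomial) multiplication,
// the 64x64 -> 128-bit primitive behind each vpclmul* call above.
static void clmul64_sketch(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  uint64_t h = 0, l = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {                 // bit i of b selects the partial product a * x^i
      l ^= a << i;                      // low 64 bits of the partial product
      if (i != 0) h ^= a >> (64 - i);   // bits shifted past bit 63
    }
  }
  *hi = h;
  *lo = l;
}
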
550// This method takes the subkey after expansion as input and generates the first power of subkey H (stored at offset 1 * 16).
551// The power of H is used in reduction process for one block ghash
552void MacroAssembler::generateHtbl_one_block(Register htbl) {
553 const XMMRegister t = xmm13;
554
555 // load the original subkey hash
556 movdqu(t, Address(htbl, 0));
557 // shuffle using long swap mask
558 movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
559 vpshufb(t, t, xmm10, Assembler::AVX_128bit);
560
561 // Compute H' = GFMUL(H, 2)
562 vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
563 movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
564 vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
565 movl(rax, 0xff00);
566 movdl(xmm4, rax);
567 vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
568 movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
569 vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
570 vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
571 vpslld(xmm4, t, 1, Assembler::AVX_128bit);
572 vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
573 vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2
574
575 //Adding p(x)<<1 to xmm5 which holds the reduction polynomial
576 vpxor(t, t, xmm5, Assembler::AVX_128bit);
577 movdqu(Address(htbl, 1 * 16), t); // H * 2
578
579 ret(0);
580}
581
582// This method takes the subkey after expansion as input and generates the remaining powers of subkey H.
583// The power of H is used in reduction process for eight block ghash
584void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
585 const XMMRegister t = xmm13;
586 const XMMRegister tmp0 = xmm1;
587 Label GFMUL;
588
589 movdqu(t, Address(htbl, 1 * 16));
590 movdqu(tmp0, t);
591
592 // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
593 call(GFMUL, relocInfo::none);
594 movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2
595 call(GFMUL, relocInfo::none);
596 movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2
597 call(GFMUL, relocInfo::none);
598 movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2
599 call(GFMUL, relocInfo::none);
600 movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2
601 call(GFMUL, relocInfo::none);
602 movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2
603 call(GFMUL, relocInfo::none);
604 movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2
605 call(GFMUL, relocInfo::none);
606 movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2
607 ret(0);
608
609 bind(GFMUL);
610 gfmul(tmp0, t);
611}
612
613// Multiblock and single block GHASH computation using Shift XOR reduction technique
614void MacroAssembler::avx_ghash(Register input_state, Register htbl,
615 Register input_data, Register blocks) {
616
617 // temporary variables to hold input data and input state
618 const XMMRegister data = xmm1;
619 const XMMRegister state = xmm0;
620 // temporary variables to hold intermediate results
621 const XMMRegister tmp0 = xmm3;
622 const XMMRegister tmp1 = xmm4;
623 const XMMRegister tmp2 = xmm5;
624 const XMMRegister tmp3 = xmm6;
625 // temporary variables to hold byte and long swap masks
626 const XMMRegister bswap_mask = xmm2;
627 const XMMRegister lswap_mask = xmm14;
628
629 Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
630 ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
631
632 testptr(blocks, blocks);
633 jcc(Assembler::zero, EXIT_GHASH);
634
635 // Check if Hashtable (1*16) has been already generated
636 // For anything less than 8 blocks, we generate only the first power of H.
637 movdqu(tmp2, Address(htbl, 1 * 16));
638 ptest(tmp2, tmp2);
639 jcc(Assembler::notZero, BEGIN_PROCESS);
640 call(GENERATE_HTBL_1_BLK, relocInfo::none);
641
642 // Shuffle the input state
643 bind(BEGIN_PROCESS);
644 movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
645 movdqu(state, Address(input_state, 0));
646 vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
647
648 cmpl(blocks, 8);
649 jcc(Assembler::below, ONE_BLK_INIT);
650 // If we have 8 blocks or more data, then generate remaining powers of H
651 movdqu(tmp2, Address(htbl, 8 * 16));
652 ptest(tmp2, tmp2);
653 jcc(Assembler::notZero, PROCESS_8_BLOCKS);
654 call(GENERATE_HTBL_8_BLKS, relocInfo::none);
655
656 //Do 8 multiplies followed by a reduction processing 8 blocks of data at a time
657 //Each block = 16 bytes.
658 bind(PROCESS_8_BLOCKS);
659 subl(blocks, 8);
660 movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
661 movdqu(data, Address(input_data, 16 * 7));
662 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
 663 // Load from offset 1 * 16, since the calculated powers of H start at that location.
664 movdqu(xmm15, Address(htbl, 1 * 16));
665 //Perform carryless multiplication of (H*2, data block #7)
666 vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
667 vpclmulldq(tmp0, data, xmm15);//a0 * b0
668 vpclmulhdq(tmp1, data, xmm15);//a1 * b1
669 vpclmullqhqdq(tmp3, data, xmm15);//a1* b0
670 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)
671
672 movdqu(data, Address(input_data, 16 * 6));
673 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
674 // Perform carryless multiplication of (H^2 * 2, data block #6)
675 schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
676
677 movdqu(data, Address(input_data, 16 * 5));
678 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
679 // Perform carryless multiplication of (H^3 * 2, data block #5)
680 schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
681 movdqu(data, Address(input_data, 16 * 4));
682 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
683 // Perform carryless multiplication of (H^4 * 2, data block #4)
684 schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
685 movdqu(data, Address(input_data, 16 * 3));
686 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
687 // Perform carryless multiplication of (H^5 * 2, data block #3)
688 schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
689 movdqu(data, Address(input_data, 16 * 2));
690 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
691 // Perform carryless multiplication of (H^6 * 2, data block #2)
692 schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
693 movdqu(data, Address(input_data, 16 * 1));
694 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
695 // Perform carryless multiplication of (H^7 * 2, data block #1)
696 schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
697 movdqu(data, Address(input_data, 16 * 0));
 698 // xor data block #0 with input state before performing carry-less multiplication
699 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
700 vpxor(data, data, state, Assembler::AVX_128bit);
701 // Perform carryless multiplication of (H^8 * 2, data block #0)
702 schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
703 vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
704 vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
705 vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of
706 vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation
707
708 // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1
709 // with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0
710 // Follows the reduction technique mentioned in
711 // Shift-XOR reduction described in Gueron-Kounavis May 2010
712 bind(BLOCK8_REDUCTION);
713 // First Phase of the reduction
714 vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed right shifting << 31
715 vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed right shifting << 30
716 vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed right shifting << 25
717 // xor the shifted versions
718 vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
719 vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
720
721 vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
722 vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
723
724 vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
725 // second phase of the reduction
726 vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed left shifting >> 1
727 vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed left shifting >> 2
728 vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed left shifting >> 7
729 // xor the shifted versions
730 vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
731 vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
732 vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
733 vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
734 // Final result is in state
735 vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
736
737 lea(input_data, Address(input_data, 16 * 8));
738 cmpl(blocks, 8);
739 jcc(Assembler::below, ONE_BLK_INIT);
740 jmp(PROCESS_8_BLOCKS);
741
742 // Since this is one block operation we will only use H * 2 i.e. the first power of H
743 bind(ONE_BLK_INIT);
744 movdqu(tmp0, Address(htbl, 1 * 16));
745 movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
746
747 //Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
748 bind(PROCESS_1_BLOCK);
749 cmpl(blocks, 0);
750 jcc(Assembler::equal, SAVE_STATE);
751 subl(blocks, 1);
752 movdqu(data, Address(input_data, 0));
753 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
754 vpxor(state, state, data, Assembler::AVX_128bit);
755 // gfmul(H*2, state)
756 call(GFMUL, relocInfo::none);
757 addptr(input_data, 16);
758 jmp(PROCESS_1_BLOCK);
759
760 bind(SAVE_STATE);
761 vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
762 movdqu(Address(input_state, 0), state);
763 jmp(EXIT_GHASH);
764
765 bind(GFMUL);
766 gfmul(tmp0, state);
767
768 bind(GENERATE_HTBL_1_BLK);
769 generateHtbl_one_block(htbl);
770
771 bind(GENERATE_HTBL_8_BLKS);
772 generateHtbl_eight_blocks(htbl);
773
774 bind(EXIT_GHASH);
775 // zero out xmm registers used for Htbl storage
776 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
777 vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
778 vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
779 vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
780}
781
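
Taken together, generateHtbl_one_block and generateHtbl_eight_blocks fill the table with (H^i) * 2 for i = 1..8, and avx_ghash pairs data block j with the power H^(8-j) so that eight carry-less multiplies share one reduction per iteration. The structural sketch below reduces after every multiply instead, which gives the same field result because the reduction is linear; byte-order shuffles are omitted, and block128, xor128 and gf128_mul are illustrative stand-ins (gf128_mul representing the multiply-and-reduce performed by gfmul and schoolbookAAD):

#include <cstdint>
#include <utility>

using block128 = std::pair<uint64_t, uint64_t>;   // two 64-bit halves of a 128-bit value

static block128 xor128(block128 a, block128 b) {
  return block128(a.first ^ b.first, a.second ^ b.second);
}

// Structural sketch of avx_ghash; htbl[i] is assumed to hold (H^i)*2 as built
// by the generateHtbl_* routines, and gf128_mul stands in for gfmul's
// carry-less multiply followed by the shift-XOR reduction.
static block128 ghash_sketch(block128 state, const block128* data, int blocks,
                             const block128 htbl[9],
                             block128 (*gf128_mul)(block128, block128)) {
  while (blocks >= 8) {
    // Data block #0 is folded into the state and paired with the highest power.
    block128 acc = gf128_mul(xor128(data[0], state), htbl[8]);
    for (int j = 1; j < 8; j++) {
      acc = xor128(acc, gf128_mul(data[j], htbl[8 - j]));
    }
    state = acc;
    data += 8;
    blocks -= 8;
  }
  for (int j = 0; j < blocks; j++) {              // PROCESS_1_BLOCK path
    state = gf128_mul(xor128(state, data[j]), htbl[1]);
  }
  return state;
}
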
782// AES Counter Mode using VAES instructions
783void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
784 Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
785
786 const Register rounds = 0;
787 const Register pos = r12;
788
789 Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
790 AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
791 REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER,
792 AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
793 AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
794 EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
795
796 cmpl(len_reg, 0);
797 jcc(Assembler::belowEqual, EXIT);
798
799 movl(pos, 0);
800 // if the number of used encrypted counter bytes < 16,
801 // XOR PT with saved encrypted counter to obtain CT
802 bind(PRELOOP_START);
803 cmpl(used, 16);
804 jcc(Assembler::aboveEqual, EXIT_PRELOOP);
805 movb(rbx, Address(saved_encCounter_start, used));
806 xorb(rbx, Address(src_addr, pos));
807 movb(Address(dest_addr, pos), rbx);
808 addptr(pos, 1);
809 addptr(used, 1);
810 decrement(len_reg);
811 jmp(PRELOOP_START);
812
813 bind(EXIT_PRELOOP);
814 movl(Address(used_addr, 0), used);
815
 817 // Calculate number of rounds i.e. 10, 12, 14, based on key length (128, 192, 256).
817 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
818
819 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
820 // Move initial counter value in xmm0
821 movdqu(xmm0, Address(counter, 0));
822 // broadcast counter value to zmm8
823 evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);
824
825 // load lbswap mask
826 evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15);
827
828 //shuffle counter using lbswap_mask
829 vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
830
831 // pre-increment and propagate counter values to zmm9-zmm15 registers.
832 // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4
833 // The counter is incremented after each block i.e. 16 bytes is processed;
834 // each zmm register has 4 counter values as its MSB
835 // the counters are incremented in parallel
836 vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0
837 vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip)
838 vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
839 vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
840 vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
841 vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
842 vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
843 vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
844
845 // load linc32 mask in zmm register.linc32 increments counter by 32
846 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32
847
848 // xmm31 contains the key shuffle mask.
849 movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
850 // Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
851 // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register
852 // that holds shuffled key value.
853 ev_load_key(xmm20, key, 0, xmm31);
854 ev_load_key(xmm21, key, 1 * 16, xmm31);
855 ev_load_key(xmm22, key, 2 * 16, xmm31);
856 ev_load_key(xmm23, key, 3 * 16, xmm31);
857 ev_load_key(xmm24, key, 4 * 16, xmm31);
858 ev_load_key(xmm25, key, 5 * 16, xmm31);
859 ev_load_key(xmm26, key, 6 * 16, xmm31);
860 ev_load_key(xmm27, key, 7 * 16, xmm31);
861 ev_load_key(xmm28, key, 8 * 16, xmm31);
862 ev_load_key(xmm29, key, 9 * 16, xmm31);
863 ev_load_key(xmm30, key, 10 * 16, xmm31);
864
865 // Process 32 blocks or 512 bytes of data
866 bind(LOOP);
867 cmpl(len_reg, 512);
868 jcc(Assembler::less, REMAINDER);
869 subq(len_reg, 512);
870 //Shuffle counter and Exor it with roundkey1. Result is stored in zmm0-7
871 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
872 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
873 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
874 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
875 vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
876 evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
877 vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
878 evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
879 vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
880 evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
881 vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
882 evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
883 vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
884 evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
885 vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
886 evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
887 // Perform AES encode operations and put results in zmm0-zmm7.
888 // This is followed by incrementing counter values in zmm8-zmm15.
889 // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
890 roundEnc(xmm21, 7);
891 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
892 roundEnc(xmm22, 7);
893 vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
894 roundEnc(xmm23, 7);
895 vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
896 roundEnc(xmm24, 7);
897 vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
898 roundEnc(xmm25, 7);
899 vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
900 roundEnc(xmm26, 7);
901 vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
902 roundEnc(xmm27, 7);
903 vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
904 roundEnc(xmm28, 7);
905 vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
906 roundEnc(xmm29, 7);
907
908 cmpl(rounds, 52);
909 jcc(Assembler::aboveEqual, AES192);
910 lastroundEnc(xmm30, 7);
911 jmp(END_LOOP);
912
913 bind(AES192);
914 roundEnc(xmm30, 7);
915 ev_load_key(xmm18, key, 11 * 16, xmm31);
916 roundEnc(xmm18, 7);
917 cmpl(rounds, 60);
918 jcc(Assembler::aboveEqual, AES256);
919 ev_load_key(xmm18, key, 12 * 16, xmm31);
920 lastroundEnc(xmm18, 7);
921 jmp(END_LOOP);
922
923 bind(AES256);
924 ev_load_key(xmm18, key, 12 * 16, xmm31);
925 roundEnc(xmm18, 7);
926 ev_load_key(xmm18, key, 13 * 16, xmm31);
927 roundEnc(xmm18, 7);
928 ev_load_key(xmm18, key, 14 * 16, xmm31);
929 lastroundEnc(xmm18, 7);
930
931 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
932 // xor encrypted block cipher and input plaintext and store resultant ciphertext
933 bind(END_LOOP);
934 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
935 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
936 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
937 evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
938 evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
939 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
940 evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
941 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
942 evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
943 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
944 evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
945 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
946 evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
947 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
948 evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
949 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
950 addq(pos, 512);
951 jmp(LOOP);
952
953 // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
954 bind(REMAINDER);
955 cmpl(len_reg, 0);
956 jcc(Assembler::equal, END);
957 cmpl(len_reg, 256);
958 jcc(Assembler::aboveEqual, REMAINDER_16);
959 cmpl(len_reg, 128);
960 jcc(Assembler::aboveEqual, REMAINDER_8);
961 cmpl(len_reg, 64);
962 jcc(Assembler::aboveEqual, REMAINDER_4);
963 // At this point, we will process 16 bytes of data at a time.
964 // So load xmm19 with counter increment value as 1
965 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);
966 jmp(REMAINDER_LOOP);
967
968 // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
969 bind(REMAINDER_16);
970 subq(len_reg, 256);
971 // As we process 16 blocks at a time, load mask for incrementing the counter value by 16
972 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip)
973 // shuffle counter and XOR counter with roundkey1
974 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
975 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
976 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
977 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
978 vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
979 evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
980 vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
981 evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
982 // Increment counter values by 16
983 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
984 vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
985 // AES encode rounds
986 roundEnc(xmm21, 3);
987 roundEnc(xmm22, 3);
988 roundEnc(xmm23, 3);
989 roundEnc(xmm24, 3);
990 roundEnc(xmm25, 3);
991 roundEnc(xmm26, 3);
992 roundEnc(xmm27, 3);
993 roundEnc(xmm28, 3);
994 roundEnc(xmm29, 3);
995
996 cmpl(rounds, 52);
997 jcc(Assembler::aboveEqual, AES192_REMAINDER16);
998 lastroundEnc(xmm30, 3);
999 jmp(REMAINDER16_END_LOOP);
1000
1001 bind(AES192_REMAINDER16);
1002 roundEnc(xmm30, 3);
1003 ev_load_key(xmm18, key, 11 * 16, xmm31);
1004 roundEnc(xmm18, 3);
1005 ev_load_key(xmm5, key, 12 * 16, xmm31);
1006
1007 cmpl(rounds, 60);
1008 jcc(Assembler::aboveEqual, AES256_REMAINDER16);
1009 lastroundEnc(xmm5, 3);
1010 jmp(REMAINDER16_END_LOOP);
1011 bind(AES256_REMAINDER16);
1012 roundEnc(xmm5, 3);
1013 ev_load_key(xmm6, key, 13 * 16, xmm31);
1014 roundEnc(xmm6, 3);
1015 ev_load_key(xmm7, key, 14 * 16, xmm31);
1016 lastroundEnc(xmm7, 3);
1017
1018 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
1019 // xor 256 bytes of PT with the encrypted counters to produce CT.
1020 bind(REMAINDER16_END_LOOP);
1021 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
1022 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
1023 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1024 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
1025 evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
1026 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
1027 evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
1028 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
1029 addq(pos, 256);
1030
1031 cmpl(len_reg, 128);
1032 jcc(Assembler::aboveEqual, REMAINDER_8);
1033
1034 cmpl(len_reg, 64);
1035 jcc(Assembler::aboveEqual, REMAINDER_4);
1036 //load mask for incrementing the counter value by 1
1037 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1038 jmp(REMAINDER_LOOP);
1039
1040 // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
1041 bind(REMAINDER_8);
1042 subq(len_reg, 128);
1043 // As we process 8 blocks at a time, load mask for incrementing the counter value by 8
1044 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip)
1045 // shuffle counters and xor with roundkey1
1046 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
1047 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
1048 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
1049 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
1050 // increment counter by 8
1051 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
1052 // AES encode
1053 roundEnc(xmm21, 1);
1054 roundEnc(xmm22, 1);
1055 roundEnc(xmm23, 1);
1056 roundEnc(xmm24, 1);
1057 roundEnc(xmm25, 1);
1058 roundEnc(xmm26, 1);
1059 roundEnc(xmm27, 1);
1060 roundEnc(xmm28, 1);
1061 roundEnc(xmm29, 1);
1062
1063 cmpl(rounds, 52);
1064 jcc(Assembler::aboveEqual, AES192_REMAINDER8);
1065 lastroundEnc(xmm30, 1);
1066 jmp(REMAINDER8_END_LOOP);
1067
1068 bind(AES192_REMAINDER8);
1069 roundEnc(xmm30, 1);
1070 ev_load_key(xmm18, key, 11 * 16, xmm31);
1071 roundEnc(xmm18, 1);
1072 ev_load_key(xmm5, key, 12 * 16, xmm31);
1073 cmpl(rounds, 60);
1074 jcc(Assembler::aboveEqual, AES256_REMAINDER8);
1075 lastroundEnc(xmm5, 1);
1076 jmp(REMAINDER8_END_LOOP);
1077
1078 bind(AES256_REMAINDER8);
1079 roundEnc(xmm5, 1);
1080 ev_load_key(xmm6, key, 13 * 16, xmm31);
1081 roundEnc(xmm6, 1);
1082 ev_load_key(xmm7, key, 14 * 16, xmm31);
1083 lastroundEnc(xmm7, 1);
1084
1085 bind(REMAINDER8_END_LOOP);
1086 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
1087 // XOR PT with the encrypted counter and store as CT
1088 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1089 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
1090 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1091 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
1092 addq(pos, 128);
1093
1094 cmpl(len_reg, 64);
1095 jcc(Assembler::aboveEqual, REMAINDER_4);
1096 // load mask for incrementing the counter value by 1
1097 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1098 jmp(REMAINDER_LOOP);
1099
1100 // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
1101 bind(REMAINDER_4);
1102 subq(len_reg, 64);
1103 // As we process 4 blocks at a time, load mask for incrementing the counter value by 4
1104 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
1105 // XOR counter with first roundkey
1106 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
1107 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
1108 // Increment counter
1109 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
1110 vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
1111 vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
1112 vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
1113 vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
1114 vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
1115 vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
1116 vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
1117 vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
1118 vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
1119 cmpl(rounds, 52);
1120 jcc(Assembler::aboveEqual, AES192_REMAINDER4);
1121 vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
1122 jmp(END_REMAINDER4);
1123
1124 bind(AES192_REMAINDER4);
1125 vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
1126 ev_load_key(xmm18, key, 11 * 16, xmm31);
1127 vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
1128 ev_load_key(xmm5, key, 12 * 16, xmm31);
1129
1130 cmpl(rounds, 60);
1131 jcc(Assembler::aboveEqual, AES256_REMAINDER4);
1132 vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
1133 jmp(END_REMAINDER4);
1134
1135 bind(AES256_REMAINDER4);
1136 vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
1137 ev_load_key(xmm6, key, 13 * 16, xmm31);
1138 vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
1139 ev_load_key(xmm7, key, 14 * 16, xmm31);
1140 vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
1141 // After AES encode rounds, the encrypted block cipher lies in zmm0.
1142 // XOR encrypted block cipher with PT and store 64 bytes of ciphertext
1143 bind(END_REMAINDER4);
1144 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1145 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
1146 addq(pos, 64);
1147 // load mask for incrementing the counter value by 1
1148 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1149
1150 // For a single block, the AES rounds start here.
1151 bind(REMAINDER_LOOP);
1152 cmpl(len_reg, 0);
1153 jcc(Assembler::belowEqual, END);
1154 // XOR counter with first roundkey
1155 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
1156 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
1157 vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
1158 // Increment counter by 1
1159 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
1160 vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
1161 vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
1162 vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
1163 vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
1164 vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
1165 vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
1166 vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
1167 vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);
1168
1169 cmpl(rounds, 52);
1170 jcc(Assembler::aboveEqual, AES192_REMAINDER);
1171 vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
1172 jmp(END_REMAINDER_LOOP);
1173
1174 bind(AES192_REMAINDER);
1175 vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
1176 ev_load_key(xmm18, key, 11 * 16, xmm31);
1177 vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
1178 ev_load_key(xmm5, key, 12 * 16, xmm31);
1179 cmpl(rounds, 60);
1180 jcc(Assembler::aboveEqual, AES256_REMAINDER);
1181 vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
1182 jmp(END_REMAINDER_LOOP);
1183
1184 bind(AES256_REMAINDER);
1185 vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
1186 ev_load_key(xmm6, key, 13 * 16, xmm31);
1187 vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
1188 ev_load_key(xmm7, key, 14 * 16, xmm31);
1189 vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);
1190
1191 bind(END_REMAINDER_LOOP);
1192 // If the length register is less than the block size, i.e. 16,
1193 // then we store to the destination only as many bytes of the CT
1194 // as the length register value specifies;
1195 // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
1196 cmpl(len_reg, 16);
1197 jcc(Assembler::less, EXTRACT_TAILBYTES);
1198 subl(len_reg, 16);
1199 // After AES encode rounds, the encrypted block cipher lies in xmm0.
1200 // If the length register is equal to 16 bytes, store CT in dest after XOR operation.
1201 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
1202 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
1203 addl(pos, 16);
1204
1205 jmp(REMAINDER_LOOP);
1206
1207 bind(EXTRACT_TAILBYTES);
1208 // Save encrypted counter value in xmm0 for next invocation, before XOR operation
1209 movdqu(Address(saved_encCounter_start, 0), xmm0);
1210 // XOR encrypted block cipher in xmm0 with PT to produce CT
1211 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
1212 // extract up to 15 bytes of CT from xmm0 as specified by the length register
1213 testptr(len_reg, 8);
1214 jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
1215 pextrq(Address(dest_addr, pos), xmm0, 0);
1216 psrldq(xmm0, 8);
1217 addl(pos, 8);
1218 bind(EXTRACT_TAIL_4BYTES);
1219 testptr(len_reg, 4);
1220 jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
1221 pextrd(Address(dest_addr, pos), xmm0, 0);
1222 psrldq(xmm0, 4);
1223 addq(pos, 4);
1224 bind(EXTRACT_TAIL_2BYTES);
1225 testptr(len_reg, 2);
1226 jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
1227 pextrw(Address(dest_addr, pos), xmm0, 0);
1228 psrldq(xmm0, 2);
1229 addl(pos, 2);
1230 bind(EXTRACT_TAIL_1BYTE);
1231 testptr(len_reg, 1);
1232 jcc(Assembler::zero, END);
1233 pextrb(Address(dest_addr, pos), xmm0, 0);
1234 addl(pos, 1);
1235
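The four tests above peel off 8, 4, 2 and then 1 byte of the final partial block. A scalar sketch of the same idea follows (plain C++ for illustration only; the stub does this with pextrq/pextrd/pextrw/pextrb and psrldq on xmm0):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Store only 'len' (0..15) bytes of the ciphertext block 'ct' to 'dst',
// selecting the partial stores from the bits of the remaining length.
static void store_tail(uint8_t* dst, const uint8_t* ct, size_t len) {
  size_t pos = 0;
  if (len & 8) { std::memcpy(dst + pos, ct + pos, 8); pos += 8; }
  if (len & 4) { std::memcpy(dst + pos, ct + pos, 4); pos += 4; }
  if (len & 2) { std::memcpy(dst + pos, ct + pos, 2); pos += 2; }
  if (len & 1) { dst[pos] = ct[pos]; }
}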
1236 bind(END);
1237 // If there are no tail bytes, store counter value and exit
1238 cmpl(len_reg, 0);
1239 jcc(Assembler::equal, STORE_CTR);
1240 movl(Address(used_addr, 0), len_reg);
1241
1242 bind(STORE_CTR);
1243 //shuffle updated counter and store it
1244 vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
1245 movdqu(Address(counter, 0), xmm8);
1246 // Zero out counter and key registers
1247 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
1248 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
1249 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
1250 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
1251 evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
1252 evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
1253 evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
1254 evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
1255 evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
1256 evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
1257 evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
1258 evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
1259 cmpl(rounds, 44);
1260 jcc(Assembler::belowEqual, EXIT);
1261 evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
1262 evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
1263 cmpl(rounds, 52);
1264 jcc(Assembler::belowEqual, EXIT);
1265 evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
1266 evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
1267 bind(EXIT);
1268}
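The cmpl(rounds, 44 / 52 / 60) checks in the function above decide how many extra round keys to apply or scrub. A small sketch of the assumed mapping (an assumption drawn from these comparisons: 'rounds' appears to hold the length of the expanded-key int array, which is 4 * (Nr + 1) for AES):

// Hypothetical helper, illustration only.
static int aes_rounds_from_key_words(int key_words) {
  switch (key_words) {
    case 44: return 10;   // AES-128
    case 52: return 12;   // AES-192
    case 60: return 14;   // AES-256
    default: return -1;   // unexpected key schedule length
  }
}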
1269
1270void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) {
1271 const XMMRegister TMP1 = xmm0;
1272 const XMMRegister TMP2 = xmm1;
1273 const XMMRegister TMP3 = xmm2;
1274
1275 evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit);
1276 evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit);
1277 evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit);
1278 evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit);
1279 evpxorq(GH, GH, TMP3, Assembler::AVX_512bit);
1280 vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit);
1281 vpslldq(GH, GH, 8, Assembler::AVX_512bit);
1282 evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit);
1283 evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
1284
1285 evmovdquq(TMP3, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, r15);
1286 evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit);
1287 vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit);
1288 evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
1289 evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit);
1290 vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit);
1291 evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit);
1292 vpslldq(GH, GH, 4, Assembler::AVX_512bit);
1293 vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit);
1294}
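gfmul_avx512 computes a carry-less 128-bit product per lane with vpclmulqdq and then reduces it modulo the GHASH polynomial in two phases. For reference, a bit-serial scalar version of the same GF(2^128) multiplication (NIST SP 800-38D, Algorithm 1) is sketched below; it is illustration only and uses none of the lane-parallel tricks of the assembler code:

#include <cstdint>

// 128-bit block, bit 0 = most significant bit of 'hi' (GHASH bit order).
struct Block128 { uint64_t hi; uint64_t lo; };

static Block128 gf128_mul(Block128 X, Block128 Y) {
  Block128 Z = {0, 0};
  Block128 V = Y;
  for (int i = 0; i < 128; i++) {
    uint64_t bit = (i < 64) ? (X.hi >> (63 - i)) & 1 : (X.lo >> (127 - i)) & 1;
    if (bit) { Z.hi ^= V.hi; Z.lo ^= V.lo; }   // Z ^= V when bit i of X is set
    uint64_t lsb = V.lo & 1;
    V.lo = (V.lo >> 1) | (V.hi << 63);         // V >>= 1 across 128 bits
    V.hi >>= 1;
    if (lsb) V.hi ^= 0xE100000000000000ULL;    // reduce by x^128 + x^7 + x^2 + x + 1
  }
  return Z;
}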
1295
1296void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) {
1297 const XMMRegister HK = xmm6;
1298 const XMMRegister ZT5 = xmm4;
1299 const XMMRegister ZT7 = xmm7;
1300 const XMMRegister ZT8 = xmm8;
1301
1302 Label GFMUL_AVX512;
1303
1304 movdqu(HK, Address(htbl, 0));
1305 movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
1306 vpshufb(HK, HK, xmm10, Assembler::AVX_128bit);
1307
1308 movdqu(xmm11, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 64)); // Poly
1309 movdqu(xmm12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 80)); // Twoone
1310 // Compute H ^ 2 from the input subkeyH
1311 movdqu(xmm2, xmm6);
1312 vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit);
1313 vpsrlq(xmm2, xmm2, 63, Assembler::AVX_128bit);
1314 movdqu(xmm1, xmm2);
1315 vpslldq(xmm2, xmm2, 8, Assembler::AVX_128bit);
1316 vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit);
1317 vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
1318
1319 vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit);
1320 vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit);
1321 vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit);
1322 vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
1323 movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2
1324 // Compute the remaining three powers of H using XMM registers and all following powers using ZMM
1325 movdqu(ZT5, HK);
1326 vinserti32x4(ZT7, ZT7, HK, 3);
1327
1328 gfmul_avx512(ZT5, HK);
1329 movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2
1330 vinserti32x4(ZT7, ZT7, ZT5, 2);
1331
1332 gfmul_avx512(ZT5, HK);
1333 movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3
1334 vinserti32x4(ZT7, ZT7, ZT5, 1);
1335
1336 gfmul_avx512(ZT5, HK);
1337 movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4
1338 vinserti32x4(ZT7, ZT7, ZT5, 0);
1339
1340 evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit);
1341 evmovdquq(ZT8, ZT7, Assembler::AVX_512bit);
1342 gfmul_avx512(ZT7, ZT5);
1343 evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit);
1344 evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit);
1345 gfmul_avx512(ZT8, ZT5);
1346 evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit);
1347 gfmul_avx512(ZT7, ZT5);
1348 evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit);
1349 gfmul_avx512(ZT8, ZT5);
1350 evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit);
1351 gfmul_avx512(ZT7, ZT5);
1352 evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit);
1353 gfmul_avx512(ZT8, ZT5);
1354 evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit);
1355 gfmul_avx512(ZT7, ZT5);
1356 evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit);
1357 gfmul_avx512(ZT8, ZT5);
1358 evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit);
1359 gfmul_avx512(ZT7, ZT5);
1360 evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit);
1361 gfmul_avx512(ZT8, ZT5);
1362 evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit);
1363 gfmul_avx512(ZT7, ZT5);
1364 evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit);
1365 ret(0);
1366}
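The routine above precomputes powers of the hash subkey so that 16 GHASH blocks can each be multiplied by a different power and summed in one pass. A hedged sketch of that table build, reusing the scalar Block128/gf128_mul sketch above (the exact offsets and the H^2-based starting point used by the assembler are not reproduced here):

// Illustration only: table[47] holds the lowest power, table[0] the highest,
// so a 16-block batch can read 16 consecutive powers from one region.
static void build_power_table(Block128 h, Block128 table[48]) {
  Block128 pow = h;
  for (int i = 47; i >= 0; i--) {
    table[i] = pow;
    pow = gf128_mul(pow, h);
  }
}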
1367
1368#define vclmul_reduce(out, poly, hi128, lo128, tmp0, tmp1) \
1369evpclmulqdq(tmp0, poly, lo128, 0x01, Assembler::AVX_512bit); \
1370vpslldq(tmp0, tmp0, 8, Assembler::AVX_512bit); \
1371evpxorq(tmp0, lo128, tmp0, Assembler::AVX_512bit); \
1372evpclmulqdq(tmp1, poly, tmp0, 0x00, Assembler::AVX_512bit); \
1373vpsrldq(tmp1, tmp1, 4, Assembler::AVX_512bit); \
1374evpclmulqdq(out, poly, tmp0, 0x10, Assembler::AVX_512bit); \
1375vpslldq(out, out, 4, Assembler::AVX_512bit); \
1376vpternlogq(out, 0x96, tmp1, hi128, Assembler::AVX_512bit); \
1377
1378#define vhpxori4x128(reg, tmp) \
1379vextracti64x4(tmp, reg, 1); \
1380evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \
1381vextracti32x4(tmp, reg, 1); \
1382evpxorq(reg, reg, tmp, Assembler::AVX_128bit); \
1383
1384#define roundEncode(key, dst1, dst2, dst3, dst4) \
1385vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \
1386vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \
1387vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \
1388vaesenc(dst4, dst4, key, Assembler::AVX_512bit); \
1389
1390#define lastroundEncode(key, dst1, dst2, dst3, dst4) \
1391vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \
1392vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \
1393vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \
1394vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \
1395
1396#define storeData(dst, position, src1, src2, src3, src4) \
1397evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \
1398evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \
1399evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \
1400evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \
1401
1402#define loadData(src, position, dst1, dst2, dst3, dst4) \
1403evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \
1404evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \
1405evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \
1406evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \
1407
1408#define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey) \
1409evpclmulqdq(dst00, ghdata, hkey, 0x00, Assembler::AVX_512bit); \
1410evpclmulqdq(dst01, ghdata, hkey, 0x01, Assembler::AVX_512bit); \
1411evpclmulqdq(dst10, ghdata, hkey, 0x10, Assembler::AVX_512bit); \
1412evpclmulqdq(dst11, ghdata, hkey, 0x11, Assembler::AVX_512bit); \
1413
1414#define shuffleExorRnd1Key(dst0, dst1, dst2, dst3, shufmask, rndkey) \
1415vpshufb(dst0, dst0, shufmask, Assembler::AVX_512bit); \
1416evpxorq(dst0, dst0, rndkey, Assembler::AVX_512bit); \
1417vpshufb(dst1, dst1, shufmask, Assembler::AVX_512bit); \
1418evpxorq(dst1, dst1, rndkey, Assembler::AVX_512bit); \
1419vpshufb(dst2, dst2, shufmask, Assembler::AVX_512bit); \
1420evpxorq(dst2, dst2, rndkey, Assembler::AVX_512bit); \
1421vpshufb(dst3, dst3, shufmask, Assembler::AVX_512bit); \
1422evpxorq(dst3, dst3, rndkey, Assembler::AVX_512bit); \
1423
1424#define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
1425evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \
1426evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \
1427evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \
1428evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \
1429
1430#define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
1431vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \
1432vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \
1433vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \
1434vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \
1435
1436void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx,
1437 Register in, Register out, Register data, Register pos, bool first_time_reduction, XMMRegister addmask, bool ghash_input, Register rounds,
1438 Register ghash_pos, bool final_reduction, int i, XMMRegister counter_inc_mask) {
1439
1440 Label AES_192, AES_256, LAST_AES_RND;
1441 const XMMRegister ZTMP0 = xmm0;
1442 const XMMRegister ZTMP1 = xmm3;
1443 const XMMRegister ZTMP2 = xmm4;
1444 const XMMRegister ZTMP3 = xmm5;
1445 const XMMRegister ZTMP5 = xmm7;
1446 const XMMRegister ZTMP6 = xmm10;
1447 const XMMRegister ZTMP7 = xmm11;
1448 const XMMRegister ZTMP8 = xmm12;
1449 const XMMRegister ZTMP9 = xmm13;
1450 const XMMRegister ZTMP10 = xmm15;
1451 const XMMRegister ZTMP11 = xmm16;
1452 const XMMRegister ZTMP12 = xmm17;
1453
1454 const XMMRegister ZTMP13 = xmm19;
1455 const XMMRegister ZTMP14 = xmm20;
1456 const XMMRegister ZTMP15 = xmm21;
1457 const XMMRegister ZTMP16 = xmm30;
1458 const XMMRegister ZTMP17 = xmm31;
1459 const XMMRegister ZTMP18 = xmm1;
1460 const XMMRegister ZTMP19 = xmm2;
1461 const XMMRegister ZTMP20 = xmm8;
1462 const XMMRegister ZTMP21 = xmm22;
1463 const XMMRegister ZTMP22 = xmm23;
1464
1465 // Pre increment counters
1466 vpaddd(ZTMP0, ctr_blockx, counter_inc_mask, Assembler::AVX_512bit);
1467 vpaddd(ZTMP1, ZTMP0, counter_inc_mask, Assembler::AVX_512bit);
1468 vpaddd(ZTMP2, ZTMP1, counter_inc_mask, Assembler::AVX_512bit);
1469 vpaddd(ZTMP3, ZTMP2, counter_inc_mask, Assembler::AVX_512bit);
1470 // Save counter value
1471 evmovdquq(ctr_blockx, ZTMP3, Assembler::AVX_512bit);
1472
1473 // Reuse ZTMP17 / ZTMP18 for loading AES Keys
1474 // Pre-load AES round keys
1475 ev_load_key(ZTMP17, key, 0, xmm29);
1476 ev_load_key(ZTMP18, key, 1 * 16, xmm29);
1477
1478 // ZTMP19 & ZTMP20 used for loading hash key
1479 // Pre-load hash key
1480 evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit);
1481 evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1482 // Load data for computing ghash
1483 evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1484 vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
1485
1486 // Xor cipher block 0 with input ghash, if available
1487 if (ghash_input) {
1488 evpxorq(ZTMP21, ZTMP21, aad_hashx, Assembler::AVX_512bit);
1489 }
1490 // Load data for computing ghash
1491 evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1492 vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit);
1493
1494 // stitch AES rounds with GHASH
1495 // AES round 0, xmm24 has shuffle mask
1496 shuffleExorRnd1Key(ZTMP0, ZTMP1, ZTMP2, ZTMP3, xmm24, ZTMP17);
1497 // Reuse ZTMP17 / ZTMP18 for loading remaining AES Keys
1498 ev_load_key(ZTMP17, key, 2 * 16, xmm29);
1499 // GHASH 4 blocks
1500 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19);
1501 // Load the next hkey and Ghash data
1502 evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1503 evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
1504 vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
1505
1506 // AES round 1
1507 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1508 ev_load_key(ZTMP18, key, 3 * 16, xmm29);
1509
1510 // GHASH 4 blocks(11 to 8)
1511 carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
1512 // Load the next hkey and GDATA
1513 evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1514 evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
1515 vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit);
1516
1517 // AES round 2
1518 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1519 ev_load_key(ZTMP17, key, 4 * 16, xmm29);
1520
1521 // GHASH 4 blocks(7 to 4)
1522 carrylessMultiply(ZTMP14, ZTMP16, ZTMP15, ZTMP13, ZTMP21, ZTMP19);
1523 // AES rounds 3
1524 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1525 ev_load_key(ZTMP18, key, 5 * 16, xmm29);
1526
1527 // Gather(XOR) GHASH for 12 blocks
1528 xorGHASH(ZTMP5, ZTMP6, ZTMP8, ZTMP7, ZTMP9, ZTMP13, ZTMP10, ZTMP14, ZTMP12, ZTMP16, ZTMP11, ZTMP15);
1529
1530 // AES rounds 4
1531 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1532 ev_load_key(ZTMP17, key, 6 * 16, xmm29);
1533
1534 // load plain / cipher text(recycle registers)
1535 loadData(in, pos, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
1536
1537 // AES rounds 5
1538 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1539 ev_load_key(ZTMP18, key, 7 * 16, xmm29);
1540 // GHASH 4 blocks(3 to 0)
1541 carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
1542
1543 // AES round 6
1544 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1545 ev_load_key(ZTMP17, key, 8 * 16, xmm29);
1546
1547 // gather GHASH in ZTMP6(low) and ZTMP5(high)
1548 if (first_time_reduction) {
1549 vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit);
1550 evpxorq(xmm25, ZTMP7, ZTMP11, Assembler::AVX_512bit);
1551 evpxorq(xmm27, ZTMP5, ZTMP9, Assembler::AVX_512bit);
1552 evpxorq(xmm26, ZTMP6, ZTMP10, Assembler::AVX_512bit);
1553 }
1554 else if (!first_time_reduction && !final_reduction) {
1555 xorGHASH(ZTMP7, xmm25, xmm27, xmm26, ZTMP8, ZTMP12, ZTMP7, ZTMP11, ZTMP5, ZTMP9, ZTMP6, ZTMP10);
1556 }
1557
1558 if (final_reduction) {
1559 // Phase one: Add mid products together
1560 // Also load polynomial constant for reduction
1561 vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit);
1562 vpternlogq(ZTMP7, 0x96, xmm25, ZTMP11, Assembler::AVX_512bit);
1563 vpsrldq(ZTMP11, ZTMP7, 8, Assembler::AVX_512bit);
1564 vpslldq(ZTMP7, ZTMP7, 8, Assembler::AVX_512bit);
1565 evmovdquq(ZTMP12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx);
1566 }
1567 // AES round 7
1568 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1569 ev_load_key(ZTMP18, key, 9 * 16, xmm29);
1570 if (final_reduction) {
1571 vpternlogq(ZTMP5, 0x96, ZTMP9, ZTMP11, Assembler::AVX_512bit);
1572 evpxorq(ZTMP5, ZTMP5, xmm27, Assembler::AVX_512bit);
1573 vpternlogq(ZTMP6, 0x96, ZTMP10, ZTMP7, Assembler::AVX_512bit);
1574 evpxorq(ZTMP6, ZTMP6, xmm26, Assembler::AVX_512bit);
1575 }
1576 // AES round 8
1577 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1578 ev_load_key(ZTMP17, key, 10 * 16, xmm29);
1579
1580 // Horizontal xor of low and high 4*128
1581 if (final_reduction) {
1582 vhpxori4x128(ZTMP5, ZTMP9);
1583 vhpxori4x128(ZTMP6, ZTMP10);
1584 }
1585 // AES round 9
1586 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1587 // First phase of reduction
1588 if (final_reduction) {
1589 evpclmulqdq(ZTMP10, ZTMP12, ZTMP6, 0x01, Assembler::AVX_128bit);
1590 vpslldq(ZTMP10, ZTMP10, 8, Assembler::AVX_128bit);
1591 evpxorq(ZTMP10, ZTMP6, ZTMP10, Assembler::AVX_128bit);
1592 }
1593 cmpl(rounds, 52);
1594 jcc(Assembler::greaterEqual, AES_192);
1595 jmp(LAST_AES_RND);
1596 // AES rounds up to 11 (AES192) or 13 (AES256)
1597 bind(AES_192);
1598 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1599 ev_load_key(ZTMP18, key, 11 * 16, xmm29);
1600 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1601 ev_load_key(ZTMP17, key, 12 * 16, xmm29);
1602 cmpl(rounds, 60);
1603 jcc(Assembler::aboveEqual, AES_256);
1604 jmp(LAST_AES_RND);
1605
1606 bind(AES_256);
1607 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1608 ev_load_key(ZTMP18, key, 13 * 16, xmm29);
1609 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1610 ev_load_key(ZTMP17, key, 14 * 16, xmm29);
1611
1612 bind(LAST_AES_RND);
1613 // Second phase of reduction
1614 if (final_reduction) {
1615 evpclmulqdq(ZTMP9, ZTMP12, ZTMP10, 0x00, Assembler::AVX_128bit);
1616 vpsrldq(ZTMP9, ZTMP9, 4, Assembler::AVX_128bit); // Shift-R 1-DW to obtain 2-DWs shift-R
1617 evpclmulqdq(ZTMP11, ZTMP12, ZTMP10, 0x10, Assembler::AVX_128bit);
1618 vpslldq(ZTMP11, ZTMP11, 4, Assembler::AVX_128bit); // Shift-L 1-DW for result
1619 // ZTMP5 = ZTMP5 X ZTMP11 X ZTMP9
1620 vpternlogq(ZTMP5, 0x96, ZTMP11, ZTMP9, Assembler::AVX_128bit);
1621 }
1622 // Last AES round
1623 lastroundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1624 // XOR against plain / cipher text
1625 xorBeforeStore(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
1626 // store cipher / plain text
1627 storeData(out, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1628}
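This function is invoked in one of three roles that aesgcm_encrypt (below) cycles through for every group of 48 blocks: the first call folds the incoming hash state into the cipher text, the middle call only accumulates carry-less products in xmm25/xmm26/xmm27, and the last call performs the polynomial reduction. A schematic of that schedule (hypothetical enum, not part of the stub):

#include <array>

enum class Reduction { FirstTime, Accumulate, Final };

constexpr std::array<Reduction, 3> kPer48BlockSchedule = {
  Reduction::FirstTime,    // first_time_reduction = true,  final_reduction = false
  Reduction::Accumulate,   // first_time_reduction = false, final_reduction = false
  Reduction::Final         // first_time_reduction = false, final_reduction = true
};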
1629
1630void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
1631 Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) {
1632 Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32,
1633 AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16;
1634 const XMMRegister CTR_BLOCKx = xmm9;
1635 const XMMRegister AAD_HASHx = xmm14;
1636 const Register pos = rax;
1637 const Register rounds = r15;
1638 Register ghash_pos;
1639#ifndef _WIN64
1640 ghash_pos = r14;
1641#else
1642 ghash_pos = r11;
1643#endif // !_WIN64
1644 const XMMRegister ZTMP0 = xmm0;
1645 const XMMRegister ZTMP1 = xmm3;
1646 const XMMRegister ZTMP2 = xmm4;
1647 const XMMRegister ZTMP3 = xmm5;
1648 const XMMRegister ZTMP4 = xmm6;
1649 const XMMRegister ZTMP5 = xmm7;
1650 const XMMRegister ZTMP6 = xmm10;
1651 const XMMRegister ZTMP7 = xmm11;
1652 const XMMRegister ZTMP8 = xmm12;
1653 const XMMRegister ZTMP9 = xmm13;
1654 const XMMRegister ZTMP10 = xmm15;
1655 const XMMRegister ZTMP11 = xmm16;
1656 const XMMRegister ZTMP12 = xmm17;
1657 const XMMRegister ZTMP13 = xmm19;
1658 const XMMRegister ZTMP14 = xmm20;
1659 const XMMRegister ZTMP15 = xmm21;
1660 const XMMRegister ZTMP16 = xmm30;
1661 const XMMRegister COUNTER_INC_MASK = xmm18;
1662
1663 movl(pos, 0); // Total length processed
1664 // Min data size processed = 768 bytes
1665 cmpl(len, 768);
1666 jcc(Assembler::less, ENC_DEC_DONE);
1667
1668 // Generate 48 constants for htbl
1669 call(GENERATE_HTBL_48_BLKS, relocInfo::none);
1670 int index = 0; // Index for choosing subkeyHtbl entry
1671 movl(ghash_pos, 0); // Pointer for ghash read and store operations
1672
1673 // Move initial counter value and STATE value into variables
1674 movdqu(CTR_BLOCKx, Address(counter, 0));
1675 movdqu(AAD_HASHx, Address(state, 0));
1676 // Load lswap mask for ghash
1677 movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()), rbx);
1678 // Shuffle input state using lswap mask
1679 vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
1680
1681 // Compute #rounds for AES based on the length of the key array
1682 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1683
1684 // Broadcast counter value to 512 bit register
1685 evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit);
1686 // Load counter shuffle mask
1687 evmovdquq(xmm24, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, rbx);
1688 // Shuffle counter
1689 vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit);
1690
1691 // Load mask for incrementing counter
1692 evmovdquq(COUNTER_INC_MASK, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, rbx);
1693 // Pre-increment counter
1694 vpaddd(ZTMP5, CTR_BLOCKx, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, rbx);
1695 vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit);
1696 vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit);
1697 vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit);
1698
1699 // Begin 32 blocks of AES processing
1700 bind(AES_32_BLOCKS);
1701 // Save incremented counter before overwriting it with AES data
1702 evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit);
1703
1704 // Move 256 bytes of data
1705 loadData(in, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1706 // Load key shuffle mask
1707 movdqu(xmm29, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx);
1708 // Load 0th AES round key
1709 ev_load_key(ZTMP4, key, 0, xmm29);
1710 // AES-ROUND0, xmm24 has the shuffle mask
1711 shuffleExorRnd1Key(ZTMP5, ZTMP6, ZTMP7, ZTMP8, xmm24, ZTMP4);
1712
1713 for (int j = 1; j < 10; j++) {
1714 ev_load_key(ZTMP4, key, j * 16, xmm29);
1715 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1716 }
1717 ev_load_key(ZTMP4, key, 10 * 16, xmm29);
1718 // AES rounds up to 11 (AES192) or 13 (AES256)
1719 cmpl(rounds, 52);
1720 jcc(Assembler::greaterEqual, AES_192);
1721 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1722 jmp(STORE_CT);
1723
1724 bind(AES_192);
1725 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1726 ev_load_key(ZTMP4, key, 11 * 16, xmm29);
1727 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1728 cmpl(rounds, 60);
1729 jcc(Assembler::aboveEqual, AES_256);
1730 ev_load_key(ZTMP4, key, 12 * 16, xmm29);
1731 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1732 jmp(STORE_CT);
1733
1734 bind(AES_256);
1735 ev_load_key(ZTMP4, key, 12 * 16, xmm29);
1736 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1737 ev_load_key(ZTMP4, key, 13 * 16, xmm29);
1738 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1739 ev_load_key(ZTMP4, key, 14 * 16, xmm29);
1740 // Last AES round
1741 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1742
1743 bind(STORE_CT);
1744 // Xor the encrypted key with PT to obtain CT
1745 xorBeforeStore(ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1746 storeData(out, pos, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1747 // 16 blocks encryption completed
1748 addl(pos, 256);
1749 cmpl(pos, 512);
1750 jcc(Assembler::aboveEqual, GHASH_AES_PARALLEL);
1751 vpaddd(ZTMP5, CTR_BLOCKx, COUNTER_INC_MASK, Assembler::AVX_512bit);
1752 vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit);
1753 vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit);
1754 vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit);
1755 jmp(AES_32_BLOCKS);
1756
1757 bind(GHASH_AES_PARALLEL);
1758 // Ghash16_encrypt16_parallel is invoked with one of three reduction values, in this order:
1759 // 1) First time -> cipher xor input ghash
1760 // 2) No reduction -> accumulate multiplication values
1761 // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round
1762 // Reduction value = first time
1763 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1764 addl(pos, 256);
1765 addl(ghash_pos, 256);
1766 index += 4;
1767
1768 // At this point we have processed 768 bytes of AES and 256 bytes of GHASH.
1769 // If the remaining length is less than 768, process the remaining 512 bytes of GHASH in the GHASH_LAST_32 code
1770 subl(len, 768);
1771 cmpl(len, 768);
1772 jcc(Assembler::less, GHASH_LAST_32);
1773
1774 // AES 16 blocks and GHASH 16 blocks in parallel
1775 // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times
1776 // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations
1777 // Each call uses 4 subkeyHtbl values, so increment the index by 4.
1778 bind(GHASH_16_AES_16);
1779 // Reduction value = no reduction
1780 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1781 addl(pos, 256);
1782 addl(ghash_pos, 256);
1783 index += 4;
1784 // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash
1785 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK);
1786 addl(pos, 256);
1787 addl(ghash_pos, 256);
1788 // Calculated ghash value needs to be moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline
1789 movdqu(AAD_HASHx, ZTMP5);
1790 index = 0; // Reset subkeyHtbl index
1791
1792 // Restart the pipeline
1793 // Reduction value = first time
1794 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1795 addl(pos, 256);
1796 addl(ghash_pos, 256);
1797 index += 4;
Value stored to 'index' is never read
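A minimal standalone reproduction of the pattern behind this diagnostic (hypothetical helper name, illustration only): the C++ variable 'index' tracks subkeyHtbl offsets while instructions are being emitted, but the emitted code loops via a jump rather than via C++ control flow, so the last increment is never observed by any later statement.

#include <iostream>

static void emit_block(int index) {   // hypothetical stand-in for ghash16_encrypt16_parallel
  std::cout << "emit 16-block group using subkeyHtbl index " << index << "\n";
}

int main() {
  int index = 0;
  emit_block(index);
  index += 4;
  emit_block(index);
  index += 4;   // dead store: nothing reads 'index' after this point, which is what the analyzer reports
  return 0;
}

One possible cleanup, assuming no later use is intended, is simply to drop the final increment; this is a suggestion, not the author's fix.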
1798
1799 subl(len, 768);
1800 cmpl(len, 768);
1801 jcc(Assembler::greaterEqual, GHASH_16_AES_16);
1802
1803 // GHASH last 32 blocks processed here
1804 // GHASH products accumulated in ZMM27, ZMM25 and ZMM26 during the GHASH16-AES16 operation are used
1805 bind(GHASH_LAST_32);
1806 // Use rbx as a pointer to the htbl; For last 32 blocks of GHASH, use key# 4-11 entry in subkeyHtbl
1807 movl(rbx, 256);
1808 // Load cipher blocks
1809 evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1810 evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1811 vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
1812 vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
1813 // Load ghash keys
1814 evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1815 evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1816
1817 // Ghash blocks 0 - 3
1818 carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15);
1819 // Ghash blocks 4 - 7
1820 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP14, ZTMP16);
1821
1822 vpternlogq(ZTMP1, 0x96, ZTMP5, xmm27, Assembler::AVX_512bit); // ZTMP1 = ZTMP1 + ZTMP5 + zmm27
1823 vpternlogq(ZTMP2, 0x96, ZTMP6, xmm26, Assembler::AVX_512bit); // ZTMP2 = ZTMP2 + ZTMP6 + zmm26
1824 vpternlogq(ZTMP3, 0x96, ZTMP7, xmm25, Assembler::AVX_512bit); // ZTMP3 = ZTMP3 + ZTMP7 + zmm25
1825 evpxorq(ZTMP4, ZTMP4, ZTMP8, Assembler::AVX_512bit); // ZTMP4 = ZTMP4 + ZTMP8
1826
1827 addl(ghash_pos, 128);
1828 addl(rbx, 128);
1829
1830 // Ghash remaining blocks
1831 bind(LOOP);
1832 cmpl(ghash_pos, pos);
1833 jcc(Assembler::aboveEqual, ACCUMULATE);
1834 // Load next cipher blocks and corresponding ghash keys
1835 evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1836 evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1837 vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
1838 vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
1839 evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1840 evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1841
1842 // ghash blocks 0 - 3
1843 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15);
1844
1845 // ghash blocks 4 - 7
1846 carrylessMultiply(ZTMP10, ZTMP11, ZTMP12, ZTMP9, ZTMP14, ZTMP16);
1847
1848 // update sums
1849 // ZTMP1 = ZTMP1 + ZTMP5 + ZTMP9
1850 // ZTMP2 = ZTMP2 + ZTMP6 + ZTMP10
1851 // ZTMP3 = ZTMP3 + ZTMP7 xor ZTMP11
1852 // ZTMP4 = ZTMP4 + ZTMP8 xor ZTMP12
1853 xorGHASH(ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP9, ZTMP6, ZTMP10, ZTMP7, ZTMP11, ZTMP8, ZTMP12);
1854 addl(ghash_pos, 128);
1855 addl(rbx, 128);
1856 jmp(LOOP);
1857
1858 // Integrate ZTMP3/ZTMP4 into ZTMP1 and ZTMP2
1859 bind(ACCUMULATE);
1860 evpxorq(ZTMP3, ZTMP3, ZTMP4, Assembler::AVX_512bit);
1861 vpsrldq(ZTMP7, ZTMP3, 8, Assembler::AVX_512bit);
1862 vpslldq(ZTMP8, ZTMP3, 8, Assembler::AVX_512bit);
1863 evpxorq(ZTMP1, ZTMP1, ZTMP7, Assembler::AVX_512bit);
1864 evpxorq(ZTMP2, ZTMP2, ZTMP8, Assembler::AVX_512bit);
1865
1866 // Add ZTMP1 and ZTMP2 128 - bit words horizontally
1867 vhpxori4x128(ZTMP1, ZTMP11);
1868 vhpxori4x128(ZTMP2, ZTMP12);
1869 // Load reduction polynomial and compute final reduction
1870 evmovdquq(ZTMP15, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx);
1871 vclmul_reduce(AAD_HASHx, ZTMP15, ZTMP1, ZTMP2, ZTMP3, ZTMP4);
1872
1873 // Pre-increment counter for next operation
1874 vpaddd(CTR_BLOCKx, CTR_BLOCKx, xmm18, Assembler::AVX_128bit);
1875 // Shuffle counter and save the updated value
1876 vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit);
1877 movdqu(Address(counter, 0), CTR_BLOCKx);
1878 // Load ghash lswap mask
1879 movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
1880 // Shuffle ghash using lbswap_mask and store it
1881 vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
1882 movdqu(Address(state, 0), AAD_HASHx);
1883 jmp(ENC_DEC_DONE);
1884
1885 bind(GENERATE_HTBL_48_BLKS);
1886 generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl);
1887
1888 bind(ENC_DEC_DONE);
1889 movq(rax, pos);
1890}
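Length accounting sketch for the function above (an inference from the control flow, not a stated contract): the stub engages only for at least 768 bytes of input and consumes whole 768-byte chunks, returning the number of bytes processed in rax so the caller can handle the remainder.

// Illustration only, mirroring the 768-byte threshold and the subl(len, 768) loop above.
static int gcm_bytes_processed(int len) {
  if (len < 768) return 0;
  return (len / 768) * 768;
}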
1891
1892#endif // _LP64