| File: | jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp |
| Warning: | line 1797, column 5 Value stored to 'index' is never read |
| 1 | /* |
| 2 | * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved. |
| 3 | * |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. |
| 9 | * |
| 10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 13 | * version 2 for more details (a copy is included in the LICENSE file that |
| 14 | * accompanied this code). |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License version |
| 17 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | * |
| 20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 21 | * or visit www.oracle.com if you need additional information or have any |
| 22 | * questions. |
| 23 | * |
| 24 | */ |
| 25 | |
| 26 | #include "precompiled.hpp" |
| 27 | #include "asm/assembler.hpp" |
| 28 | #include "asm/assembler.inline.hpp" |
| 29 | #include "runtime/stubRoutines.hpp" |
| 30 | #include "macroAssembler_x86.hpp" |
| 31 | |
| 32 | #ifdef _LP64 |
| 33 | |
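| | // Helpers: each roundEnc/roundDec call applies one AES round with the given 512-bit round key |
| | // to the parallel cipher states in zmm0..zmm(rnum), i.e. (rnum + 1) * 4 blocks per call. |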
| 34 | void MacroAssembler::roundEnc(XMMRegister key, int rnum) { |
| 35 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 36 | vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 37 | } |
| 38 | } |
| 39 | |
| 40 | void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) { |
| 41 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 42 | vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | void MacroAssembler::roundDec(XMMRegister key, int rnum) { |
| 47 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 48 | vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | void MacroAssembler::lastroundDec(XMMRegister key, int rnum) { |
| 53 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 54 | vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | // Load key and shuffle operation |
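| | // The 128-bit round key at the given offset is loaded, byte-swapped with the key shuffle mask, |
| | // and then broadcast to all four 128-bit lanes of the destination register via evshufi64x2. |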
| 59 | void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
| 60 | movdqu(xmmdst, Address(key, offset)); |
| 61 | if (xmm_shuf_mask != NULL) { |
| 62 | pshufb(xmmdst, xmm_shuf_mask); |
| 63 | } else { |
| 64 | pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 65 | } |
| 66 | evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); |
| 67 | } |
| 68 | |
| 69 | // AES-ECB Encrypt Operation |
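| | // Bulk path (label LOOP) encrypts 32 blocks (512 bytes) per iteration in zmm0-zmm7; |
| | // leftover blocks are encrypted 16 bytes at a time in the REMAINDER/LOOP2 path. |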
| 70 | void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
| 71 | |
| 72 | const Register pos = rax; |
| 73 | const Register rounds = r12; |
| 74 | |
| 75 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
| 76 | push(r13); |
| 77 | push(r12); |
| 78 | |
| 79 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
| 80 | // context for the registers used, where all instructions below use 128-bit mode. |
| 81 | // On EVEX without VL and BW, these instructions will all be AVX. |
| 82 | if (VM_Version::supports_avx512vlbw()) { |
| 83 | movl(rax, 0xffff); |
| 84 | kmovql(k1, rax); |
| 85 | } |
| 86 | push(len); // Save |
| 87 | push(rbx); |
| 88 | |
| 89 | vzeroupper(); |
| 90 | |
| 91 | xorptr(pos, pos); |
| 92 | |
| 93 | // Calculate number of rounds based on key length (128, 192, 256): the value loaded is 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
| 94 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 95 | |
| 96 | // Load Key shuf mask |
| 97 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
| 98 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 99 | |
| 100 | // Load and shuffle key based on number of rounds |
| 101 | ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask); |
| 102 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
| 103 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
| 104 | ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask); |
| 105 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
| 106 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
| 107 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
| 108 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
| 109 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
| 110 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
| 111 | ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask); |
| 112 | cmpl(rounds, 52); |
| 113 | jcc(Assembler::greaterEqual, KEY_192); |
| 114 | jmp(Loop_start); |
| 115 | |
| 116 | bind(KEY_192); |
| 117 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
| 118 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
| 119 | cmpl(rounds, 60); |
| 120 | jcc(Assembler::equal, KEY_256); |
| 121 | jmp(Loop_start); |
| 122 | |
| 123 | bind(KEY_256); |
| 124 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
| 125 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
| 126 | |
| 127 | bind(Loop_start); |
| 128 | movq(rbx, len); |
| 129 | // Divide length by 16 to convert it to number of blocks |
| 130 | shrq(len, 4); |
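| | // The low 4 bits of the original length are tested below: shlq by 60 leaves zero (ZF set) |
| | // only when they are all zero; otherwise the block count in len is rounded up by one. |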
| 131 | shlq(rbx, 60); |
| 132 | jcc(Assembler::equal, NO_PARTS); |
| 133 | addq(len, 1); |
| 134 | // Check if number of blocks is greater than or equal to 32 |
| 135 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
| 136 | // If not, 16 bytes are processed (code marked by REMAINDER label) |
| 137 | bind(NO_PARTS); |
| 138 | movq(rbx, len); |
| 139 | shrq(len, 5); |
| 140 | jcc(Assembler::equal, REMAINDER); |
| 141 | movl(r13, len); |
| 142 | // Compute number of blocks that will be processed 512 bytes at a time |
| 143 | // Subtract this from the total number of blocks which will then be processed by REMAINDER loop |
| 144 | shlq(r13, 5); |
| 145 | subq(rbx, r13); |
| 146 | //Begin processing 512 bytes |
| 147 | bind(LOOP); |
| 148 | // Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7 |
| 149 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 150 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 151 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 152 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 153 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 154 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 155 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 156 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 157 | // Xor with the first round key |
| 158 | evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit); |
| 159 | evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit); |
| 160 | evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit); |
| 161 | evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit); |
| 162 | evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit); |
| 163 | evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit); |
| 164 | evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit); |
| 165 | evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit); |
| 166 | // 9 Aes encode round operations |
| 167 | roundEnc(xmm9, 7); |
| 168 | roundEnc(xmm10, 7); |
| 169 | roundEnc(xmm23, 7); |
| 170 | roundEnc(xmm12, 7); |
| 171 | roundEnc(xmm13, 7); |
| 172 | roundEnc(xmm14, 7); |
| 173 | roundEnc(xmm15, 7); |
| 174 | roundEnc(xmm16, 7); |
| 175 | roundEnc(xmm17, 7); |
| 176 | cmpl(rounds, 52); |
| 177 | jcc(Assembler::aboveEqual, AES192); |
| 178 | // Aesenclast round operation for keysize = 128 |
| 179 | lastroundEnc(xmm24, 7); |
| 180 | jmp(END_LOOP); |
| 181 | //Additional 2 rounds of Aesenc operation for keysize = 192 |
| 182 | bind(AES192); |
| 183 | roundEnc(xmm24, 7); |
| 184 | roundEnc(xmm19, 7); |
| 185 | cmpl(rounds, 60); |
| 186 | jcc(Assembler::aboveEqual, AES256); |
| 187 | // Aesenclast round for keysize = 192 |
| 188 | lastroundEnc(xmm20, 7); |
| 189 | jmp(END_LOOP); |
| 190 | // 2 rounds of Aesenc operation and Aesenclast for keysize = 256 |
| 191 | bind(AES256); |
| 192 | roundEnc(xmm20, 7); |
| 193 | roundEnc(xmm21, 7); |
| 194 | lastroundEnc(xmm22, 7); |
| 195 | |
| 196 | bind(END_LOOP); |
| 197 | // Move 512 bytes of CT to destination |
| 198 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 199 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 200 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 201 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 203 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 204 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 205 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 206 | |
| 207 | addq(pos, 512); |
| 208 | decq(len); |
| 209 | jcc(Assembler::notEqual, LOOP); |
| 210 | |
| 211 | bind(REMAINDER); |
| 212 | vzeroupper(); |
| 213 | cmpq(rbx, 0); |
| 214 | jcc(Assembler::equal, END); |
| 215 | // Process 16 bytes at a time |
| 216 | bind(LOOP2); |
| 217 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
| 218 | vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit); |
| 219 | // xmm2 contains shuffled key for Aesenclast operation. |
| 220 | vmovdqu(xmm2, xmm24); |
| 221 | |
| 222 | vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
| 223 | vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
| 224 | vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit); |
| 225 | vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
| 226 | vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
| 227 | vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
| 228 | vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
| 229 | vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
| 230 | vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
| 231 | |
| 232 | cmpl(rounds, 52); |
| 233 | jcc(Assembler::below, LAST2); |
| 234 | vmovdqu(xmm2, xmm20); |
| 235 | vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit); |
| 236 | vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
| 237 | cmpl(rounds, 60); |
| 238 | jcc(Assembler::below, LAST2); |
| 239 | vmovdqu(xmm2, xmm22); |
| 240 | vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
| 241 | vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
| 242 | |
| 243 | bind(LAST2); |
| 244 | // Aesenclast round |
| 245 | vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
| 246 | // Write 16 bytes of CT to destination |
| 247 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
| 248 | addq(pos, 16); |
| 249 | decq(rbx); |
| 250 | jcc(Assembler::notEqual, LOOP2); |
| 251 | |
| 252 | bind(END); |
| 253 | // Zero out the round keys |
| 254 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 255 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
| 256 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
| 257 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
| 258 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
| 259 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
| 260 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
| 261 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
| 262 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
| 263 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
| 264 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
| 265 | cmpl(rounds, 44); |
| 266 | jcc(Assembler::belowEqual, EXIT); |
| 267 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
| 268 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 269 | cmpl(rounds, 52); |
| 270 | jcc(Assembler::belowEqual, EXIT); |
| 271 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 272 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 273 | bind(EXIT); |
| 274 | pop(rbx); |
| 275 | pop(rax); // return length |
| 276 | pop(r12); |
| 277 | pop(r13); |
| 278 | } |
| 279 | |
| 280 | // AES-ECB Decrypt Operation |
| 281 | void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
| 282 | |
| 283 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
| 284 | const Register pos = rax; |
| 285 | const Register rounds = r12; |
| 286 | push(r13); |
| 287 | push(r12); |
| 288 | |
| 289 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
| 290 | // context for the registers used, where all instructions below use 128-bit mode. |
| 291 | // On EVEX without VL and BW, these instructions will all be AVX. |
| 292 | if (VM_Version::supports_avx512vlbw()) { |
| 293 | movl(rax, 0xffff); |
| 294 | kmovql(k1, rax); |
| 295 | } |
| 296 | |
| 297 | push(len); // Save |
| 298 | push(rbx); |
| 299 | |
| 300 | vzeroupper(); |
| 301 | |
| 302 | xorptr(pos, pos); |
| 303 | // Calculate number of rounds based on key length (128, 192, 256): the value loaded is 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
| 304 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 305 | |
| 306 | // Load Key shuf mask |
| 307 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
| 308 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 309 | |
| 310 | // Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption. |
| 311 | // So the first round key is loaded from 1*16 here and last round key is loaded from 0*16 |
| 312 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
| 313 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
| 314 | ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask); |
| 315 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
| 316 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
| 317 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
| 318 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
| 319 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
| 320 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
| 321 | ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask); |
| 322 | ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask); |
| 323 | cmpl(rounds, 52); |
| 324 | jcc(Assembler::greaterEqual, KEY_192); |
| 325 | jmp(Loop_start); |
| 326 | |
| 327 | bind(KEY_192); |
| 328 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
| 329 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
| 330 | cmpl(rounds, 60); |
| 331 | jcc(Assembler::equal, KEY_256); |
| 332 | jmp(Loop_start); |
| 333 | |
| 334 | bind(KEY_256); |
| 335 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
| 336 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
| 337 | bind(Loop_start); |
| 338 | movq(rbx, len); |
| 339 | // Convert input length to number of blocks |
| 340 | shrq(len, 4); |
| 341 | shlq(rbx, 60); |
| 342 | jcc(Assembler::equal, NO_PARTS); |
| 343 | addq(len, 1); |
| 344 | // Check if number of blocks is greater than or equal to 32 |
| 345 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
| 346 | // If not, 16 bytes are processed (code marked by label REMAINDER) |
| 347 | bind(NO_PARTS); |
| 348 | movq(rbx, len); |
| 349 | shrq(len, 5); |
| 350 | jcc(Assembler::equal, REMAINDER); |
| 351 | movl(r13, len); |
| 352 | // Compute number of blocks that will be processed as 512 bytes at a time |
| 353 | // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop. |
| 354 | shlq(r13, 5); |
| 355 | subq(rbx, r13); |
| 356 | |
| 357 | bind(LOOP); |
| 358 | // Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7 |
| 359 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 360 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 361 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 362 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 363 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 364 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 365 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 366 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 367 | // Xor with the first round key |
| 368 | evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit); |
| 369 | evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit); |
| 370 | evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit); |
| 371 | evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit); |
| 372 | evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit); |
| 373 | evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit); |
| 374 | evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit); |
| 375 | evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit); |
| 376 | // 9 rounds of Aesdec |
| 377 | roundDec(xmm10, 7); |
| 378 | roundDec(xmm11, 7); |
| 379 | roundDec(xmm12, 7); |
| 380 | roundDec(xmm13, 7); |
| 381 | roundDec(xmm14, 7); |
| 382 | roundDec(xmm15, 7); |
| 383 | roundDec(xmm16, 7); |
| 384 | roundDec(xmm17, 7); |
| 385 | roundDec(xmm18, 7); |
| 386 | cmpl(rounds, 52); |
| 387 | jcc(Assembler::aboveEqual, AES192); |
| 388 | // Aesdeclast round for keysize = 128 |
| 389 | lastroundDec(xmm27, 7); |
| 390 | jmp(END_LOOP); |
| 391 | |
| 392 | bind(AES192); |
| 393 | // 2 Additional rounds for keysize = 192 |
| 394 | roundDec(xmm19, 7); |
| 395 | roundDec(xmm20, 7); |
| 396 | cmpl(rounds, 60); |
| 397 | jcc(Assembler::aboveEqual, AES256); |
| 398 | // Aesdeclast round for keysize = 192 |
| 399 | lastroundDec(xmm27, 7); |
| 400 | jmp(END_LOOP); |
| 401 | bind(AES256); |
| 402 | // 2 Additional rounds and Aesdeclast for keysize = 256 |
| 403 | roundDec(xmm21, 7); |
| 404 | roundDec(xmm22, 7); |
| 405 | lastroundDec(xmm27, 7); |
| 406 | |
| 407 | bind(END_LOOP); |
| 408 | // Write 512 bytes of PT to the destination |
| 409 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 410 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 411 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 412 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 413 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 414 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 415 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 416 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 417 | |
| 418 | addq(pos, 512); |
| 419 | decq(len); |
| 420 | jcc(Assembler::notEqual, LOOP); |
| 421 | |
| 422 | bind(REMAINDER); |
| 423 | vzeroupper(); |
| 424 | cmpq(rbx, 0); |
| 425 | jcc(Assembler::equal, END); |
| 426 | // Process 16 bytes at a time |
| 427 | bind(LOOP2); |
| 428 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
| 429 | vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
| 430 | // xmm2 contains shuffled key for Aesdeclast operation. |
| 431 | vmovdqu(xmm2, xmm27); |
| 432 | |
| 433 | vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
| 434 | vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit); |
| 435 | vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
| 436 | vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
| 437 | vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
| 438 | vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
| 439 | vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
| 440 | vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
| 441 | vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit); |
| 442 | |
| 443 | cmpl(rounds, 52); |
| 444 | jcc(Assembler::below, LAST2); |
| 445 | vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
| 446 | vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
| 447 | cmpl(rounds, 60); |
| 448 | jcc(Assembler::below, LAST2); |
| 449 | vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
| 450 | vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit); |
| 451 | |
| 452 | bind(LAST2); |
| 453 | // Aesdeclast round |
| 454 | vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
| 455 | // Write 16 bytes of PT to destination |
| 456 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
| 457 | addq(pos, 16); |
| 458 | decq(rbx); |
| 459 | jcc(Assembler::notEqual, LOOP2); |
| 460 | |
| 461 | bind(END); |
| 462 | // Zero out the round keys |
| 463 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 464 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
| 465 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
| 466 | evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit); |
| 467 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
| 468 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
| 469 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
| 470 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
| 471 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
| 472 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
| 473 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
| 474 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
| 475 | cmpl(rounds, 44); |
| 476 | jcc(Assembler::belowEqual, EXIT); |
| 477 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
| 478 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 479 | cmpl(rounds, 52); |
| 480 | jcc(Assembler::belowEqual, EXIT); |
| 481 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 482 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 483 | bind(EXIT); |
| 484 | pop(rbx); |
| 485 | pop(rax); // return length |
| 486 | pop(r12); |
| 487 | pop(r13); |
| 488 | } |
| 489 | |
| 490 | // Multiply 128 x 128 bits, using 4 pclmulqdq operations |
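| | // The four partial products (low x low, high x high and the two cross terms) are accumulated |
| | // into tmp0 (low 128 bits), tmp1 (high 128 bits) and tmp2 (middle 128 bits) for a later reduction. |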
| 491 | void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data, |
| 492 | XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) { |
| 493 | movdqu(xmm15, Address(htbl, i * 16)); |
| 494 | vpclmulhqlqdq(tmp3, data, xmm15); // 0x01 |
| 495 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
| 496 | vpclmulldq(tmp3, data, xmm15); // 0x00 |
| 497 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); |
| 498 | vpclmulhdq(tmp3, data, xmm15); // 0x11 |
| 499 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); |
| 500 | vpclmullqhqdq(tmp3, data, xmm15); // 0x10 |
| 501 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
| 502 | } |
| 503 | |
| 504 | // Multiply two 128 bit numbers resulting in a 256 bit value |
| 505 | // Result of the multiplication followed by reduction stored in state |
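| | // Schoolbook carry-less multiply: a0*b0, a1*b1 and the two cross products are combined into a |
| | // 256-bit product held in tmp1 (low half) and tmp4 (high half) before the Shift-XOR reduction. |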
| 506 | void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) { |
| 507 | const XMMRegister tmp1 = xmm4; |
| 508 | const XMMRegister tmp2 = xmm5; |
| 509 | const XMMRegister tmp3 = xmm6; |
| 510 | const XMMRegister tmp4 = xmm7; |
| 511 | |
| 512 | vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0) |
| 513 | vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1) |
| 514 | vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0) |
| 515 | vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1) |
| 516 | |
| 517 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0) |
| 518 | |
| 519 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
| 520 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
| 521 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result |
| 522 | vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication |
| 523 | // Follows the reduction technique mentioned in |
| 524 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
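| | // The reduction folds the 256-bit product modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 |
| | // using only shifts and XORs, i.e. without any further carry-less multiplications. |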
| 525 | // First phase of reduction |
| 526 | // |
| 527 | vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed left shift by 31 |
| 528 | vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed left shift by 30 |
| 529 | vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit);// packed left shift by 25 |
| 530 | // xor the shifted versions |
| 531 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
| 532 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
| 533 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
| 534 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
| 535 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete |
| 536 | // |
| 537 | // Second phase of the reduction |
| 538 | // |
| 539 | vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);// packed right shift by 1 |
| 540 | vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);// packed right shift by 2 |
| 541 | vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);// packed right shift by 7 |
| 542 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions |
| 543 | vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit); |
| 544 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
| 545 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); |
| 546 | vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state |
| 547 | ret(0); |
| 548 | } |
| 549 | |
| 550 | // This method takes the expanded subkey as input and generates the first power of subkey H, stored at htbl offset 1 * 16. |
| 551 | // This power of H is used in the reduction step of one-block ghash. |
| 552 | void MacroAssembler::generateHtbl_one_block(Register htbl) { |
| 553 | const XMMRegister t = xmm13; |
| 554 | |
| 555 | // load the original subkey hash |
| 556 | movdqu(t, Address(htbl, 0)); |
| 557 | // shuffle using long swap mask |
| 558 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 559 | vpshufb(t, t, xmm10, Assembler::AVX_128bit); |
| 560 | |
| 561 | // Compute H' = GFMUL(H, 2) |
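| | // Multiplying H by 2 in GF(2^128) is a one-bit left shift (done per 32-bit lane below, with the carries |
| | // propagated), followed by a conditional XOR of the reduction polynomial selected via the mask in xmm4/xmm5. |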
| 562 | vpsrld(xmm3, t, 7, Assembler::AVX_128bit); |
| 563 | movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr())); |
| 564 | vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit); |
| 565 | movl(rax, 0xff00); |
| 566 | movdl(xmm4, rax); |
| 567 | vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit); |
| 568 | movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr())); |
| 569 | vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit); |
| 570 | vpsrld(xmm3, t, 31, Assembler::AVX_128bit); |
| 571 | vpslld(xmm4, t, 1, Assembler::AVX_128bit); |
| 572 | vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit); |
| 573 | vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2 |
| 574 | |
| 575 | //Adding p(x)<<1 to xmm5 which holds the reduction polynomial |
| 576 | vpxor(t, t, xmm5, Assembler::AVX_128bit); |
| 577 | movdqu(Address(htbl, 1 * 16), t); // H * 2 |
| 578 | |
| 579 | ret(0); |
| 580 | } |
| 581 | |
| 582 | // This method takes the subkey after expansion as input and generates the remaining powers of subkey H. |
| 583 | // These powers of H are used in the reduction step of eight-block ghash. |
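| | // Each call to the local GFMUL label multiplies t by the value held in tmp0 (H * 2), so the |
| | // stores at htbl offsets 2 * 16 through 8 * 16 hold the successive higher powers of H. |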
| 584 | void MacroAssembler::generateHtbl_eight_blocks(Register htbl) { |
| 585 | const XMMRegister t = xmm13; |
| 586 | const XMMRegister tmp0 = xmm1; |
| 587 | Label GFMUL; |
| 588 | |
| 589 | movdqu(t, Address(htbl, 1 * 16)); |
| 590 | movdqu(tmp0, t); |
| 591 | |
| 592 | // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H) |
| 593 | call(GFMUL, relocInfo::none); |
| 594 | movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2 |
| 595 | call(GFMUL, relocInfo::none); |
| 596 | movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2 |
| 597 | call(GFMUL, relocInfo::none); |
| 598 | movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2 |
| 599 | call(GFMUL, relocInfo::none); |
| 600 | movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2 |
| 601 | call(GFMUL, relocInfo::none); |
| 602 | movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2 |
| 603 | call(GFMUL, relocInfo::none); |
| 604 | movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2 |
| 605 | call(GFMUL, relocInfo::none); |
| 606 | movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2 |
| 607 | ret(0); |
| 608 | |
| 609 | bind(GFMUL); |
| 610 | gfmul(tmp0, t); |
| 611 | } |
| 612 | |
| 613 | // Multiblock and single block GHASH computation using Shift XOR reduction technique |
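| | // Powers of H are generated into htbl on first use; eight blocks are combined per reduction in |
| | // PROCESS_8_BLOCKS, and any remaining blocks are handled one at a time in PROCESS_1_BLOCK. |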
| 614 | void MacroAssembler::avx_ghash(Register input_state, Register htbl, |
| 615 | Register input_data, Register blocks) { |
| 616 | |
| 617 | // temporary variables to hold input data and input state |
| 618 | const XMMRegister data = xmm1; |
| 619 | const XMMRegister state = xmm0; |
| 620 | // temporary variables to hold intermediate results |
| 621 | const XMMRegister tmp0 = xmm3; |
| 622 | const XMMRegister tmp1 = xmm4; |
| 623 | const XMMRegister tmp2 = xmm5; |
| 624 | const XMMRegister tmp3 = xmm6; |
| 625 | // temporary variables to hold byte and long swap masks |
| 626 | const XMMRegister bswap_mask = xmm2; |
| 627 | const XMMRegister lswap_mask = xmm14; |
| 628 | |
| 629 | Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION, |
| 630 | ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH; |
| 631 | |
| 632 | testptr(blocks, blocks); |
| 633 | jcc(Assembler::zero, EXIT_GHASH); |
| 634 | |
| 635 | // Check if Hashtable (1*16) has been already generated |
| 636 | // For anything less than 8 blocks, we generate only the first power of H. |
| 637 | movdqu(tmp2, Address(htbl, 1 * 16)); |
| 638 | ptest(tmp2, tmp2); |
| 639 | jcc(Assembler::notZero, BEGIN_PROCESS); |
| 640 | call(GENERATE_HTBL_1_BLK, relocInfo::none); |
| 641 | |
| 642 | // Shuffle the input state |
| 643 | bind(BEGIN_PROCESS); |
| 644 | movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 645 | movdqu(state, Address(input_state, 0)); |
| 646 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
| 647 | |
| 648 | cmpl(blocks, 8); |
| 649 | jcc(Assembler::below, ONE_BLK_INIT); |
| 650 | // If we have 8 blocks or more data, then generate remaining powers of H |
| 651 | movdqu(tmp2, Address(htbl, 8 * 16)); |
| 652 | ptest(tmp2, tmp2); |
| 653 | jcc(Assembler::notZero, PROCESS_8_BLOCKS); |
| 654 | call(GENERATE_HTBL_8_BLKS, relocInfo::none); |
| 655 | |
| 656 | //Do 8 multiplies followed by a reduction processing 8 blocks of data at a time |
| 657 | //Each block = 16 bytes. |
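| | // Blocks #7 .. #0 are multiplied by H*2 .. H^8*2 respectively and the partial products are |
| | // accumulated in tmp0/tmp1/tmp2, so a single reduction covers all eight blocks. |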
| 658 | bind(PROCESS_8_BLOCKS); |
| 659 | subl(blocks, 8); |
| 660 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
| 661 | movdqu(data, Address(input_data, 16 * 7)); |
| 662 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 663 | //Loading 1*16 as calculated powers of H required starts at that location. |
| 664 | movdqu(xmm15, Address(htbl, 1 * 16)); |
| 665 | //Perform carryless multiplication of (H*2, data block #7) |
| 666 | vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1 |
| 667 | vpclmulldq(tmp0, data, xmm15);//a0 * b0 |
| 668 | vpclmulhdq(tmp1, data, xmm15);//a1 * b1 |
| 669 | vpclmullqhqdq(tmp3, data, xmm15);//a1* b0 |
| 670 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0) |
| 671 | |
| 672 | movdqu(data, Address(input_data, 16 * 6)); |
| 673 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 674 | // Perform carryless multiplication of (H^2 * 2, data block #6) |
| 675 | schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 676 | |
| 677 | movdqu(data, Address(input_data, 16 * 5)); |
| 678 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 679 | // Perform carryless multiplication of (H^3 * 2, data block #5) |
| 680 | schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 681 | movdqu(data, Address(input_data, 16 * 4)); |
| 682 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 683 | // Perform carryless multiplication of (H^4 * 2, data block #4) |
| 684 | schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 685 | movdqu(data, Address(input_data, 16 * 3)); |
| 686 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 687 | // Perform carryless multiplication of (H^5 * 2, data block #3) |
| 688 | schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 689 | movdqu(data, Address(input_data, 16 * 2)); |
| 690 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 691 | // Perform carryless multiplication of (H^6 * 2, data block #2) |
| 692 | schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 693 | movdqu(data, Address(input_data, 16 * 1)); |
| 694 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 695 | // Perform carryless multiplication of (H^7 * 2, data block #1) |
| 696 | schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 697 | movdqu(data, Address(input_data, 16 * 0)); |
| 698 | // xor data block #0 with input state before performing carry-less multiplication |
| 699 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 700 | vpxor(data, data, state, Assembler::AVX_128bit); |
| 701 | // Perform carryless multiplication of (H^8 * 2, data block #0) |
| 702 | schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 703 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
| 704 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
| 705 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of |
| 706 | vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation |
| 707 | |
| 708 | // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1 |
| 709 | // with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0 |
| 710 | // Follows the reduction technique mentioned in |
| 711 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
| 712 | bind(BLOCK8_REDUCTION); |
| 713 | // First Phase of the reduction |
| 714 | vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed left shift by 31 |
| 715 | vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed left shift by 30 |
| 716 | vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25 |
| 717 | // xor the shifted versions |
| 718 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
| 719 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
| 720 | |
| 721 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
| 722 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
| 723 | |
| 724 | vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete |
| 725 | // second phase of the reduction |
| 726 | vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed right shift by 1 |
| 727 | vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift by 2 |
| 728 | vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed right shift by 7 |
| 729 | // xor the shifted versions |
| 730 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); |
| 731 | vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit); |
| 732 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
| 733 | vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit); |
| 734 | // Final result is in state |
| 735 | vpxor(state, tmp0, tmp1, Assembler::AVX_128bit); |
| 736 | |
| 737 | lea(input_data, Address(input_data, 16 * 8)); |
| 738 | cmpl(blocks, 8); |
| 739 | jcc(Assembler::below, ONE_BLK_INIT); |
| 740 | jmp(PROCESS_8_BLOCKS); |
| 741 | |
| 742 | // Since this is a one-block operation we will only use H * 2, i.e. the first power of H |
| 743 | bind(ONE_BLK_INIT); |
| 744 | movdqu(tmp0, Address(htbl, 1 * 16)); |
| 745 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
| 746 | |
| 747 | //Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction. |
| 748 | bind(PROCESS_1_BLOCK); |
| 749 | cmpl(blocks, 0); |
| 750 | jcc(Assembler::equal, SAVE_STATE); |
| 751 | subl(blocks, 1); |
| 752 | movdqu(data, Address(input_data, 0)); |
| 753 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 754 | vpxor(state, state, data, Assembler::AVX_128bit); |
| 755 | // gfmul(H*2, state) |
| 756 | call(GFMUL, relocInfo::none); |
| 757 | addptr(input_data, 16); |
| 758 | jmp(PROCESS_1_BLOCK); |
| 759 | |
| 760 | bind(SAVE_STATE); |
| 761 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
| 762 | movdqu(Address(input_state, 0), state); |
| 763 | jmp(EXIT_GHASH); |
| 764 | |
| 765 | bind(GFMUL); |
| 766 | gfmul(tmp0, state); |
| 767 | |
| 768 | bind(GENERATE_HTBL_1_BLK); |
| 769 | generateHtbl_one_block(htbl); |
| 770 | |
| 771 | bind(GENERATE_HTBL_8_BLKS); |
| 772 | generateHtbl_eight_blocks(htbl); |
| 773 | |
| 774 | bind(EXIT_GHASH); |
| 775 | // zero out xmm registers used for Htbl storage |
| 776 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
| 777 | vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); |
| 778 | vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit); |
| 779 | vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit); |
| 780 | } |
| 781 | |
| 782 | // AES Counter Mode using VAES instructions |
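| | // Flow: a byte-wise pre-loop first consumes any leftover encrypted counter bytes, the main loop |
| | // then encrypts 32 counter blocks (512 bytes) per pass, and smaller remainder paths handle |
| | // 256/128/64 bytes and finally single 16-byte blocks and tail bytes. |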
| 783 | void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, |
| 784 | Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) { |
| 785 | |
| 786 | const Register rounds = 0; |
| 787 | const Register pos = r12; |
| 788 | |
| 789 | Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP, |
| 790 | AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16, |
| 791 | REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER, |
| 792 | AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP, |
| 793 | AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES, |
| 794 | EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR; |
| 795 | |
| 796 | cmpl(len_reg, 0); |
| 797 | jcc(Assembler::belowEqual, EXIT); |
| 798 | |
| 799 | movl(pos, 0); |
| 800 | // if the number of used encrypted counter bytes < 16, |
| 801 | // XOR PT with saved encrypted counter to obtain CT |
| 802 | bind(PRELOOP_START); |
| 803 | cmpl(used, 16); |
| 804 | jcc(Assembler::aboveEqual, EXIT_PRELOOP); |
| 805 | movb(rbx, Address(saved_encCounter_start, used)); |
| 806 | xorb(rbx, Address(src_addr, pos)); |
| 807 | movb(Address(dest_addr, pos), rbx); |
| 808 | addptr(pos, 1); |
| 809 | addptr(used, 1); |
| 810 | decrement(len_reg); |
| 811 | jmp(PRELOOP_START); |
| 812 | |
| 813 | bind(EXIT_PRELOOP); |
| 814 | movl(Address(used_addr, 0), used); |
| 815 | |
| 816 | // Determine the number of rounds (10, 12, 14) from the expanded key length (44, 52, 60 ints) for 128-, 192-, 256-bit keys. |
| 817 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 818 | |
| 819 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
| 820 | // Move initial counter value in xmm0 |
| 821 | movdqu(xmm0, Address(counter, 0)); |
| 822 | // broadcast counter value to zmm8 |
| 823 | evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit); |
| 824 | |
| 825 | // load lbswap mask |
| 826 | evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15); |
| 827 | |
| 828 | //shuffle counter using lbswap_mask |
| 829 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit); |
| 830 | |
| 831 | // pre-increment and propagate counter values to zmm9-zmm15 registers. |
| 832 | // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4 |
| 833 | // The counter is incremented after each block i.e. 16 bytes is processed; |
| 834 | // each zmm register has 4 counter values as its MSB |
| 835 | // the counters are incremented in parallel |
| 836 | vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0 |
| 837 | vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip) |
| 838 | vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 839 | vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 840 | vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 841 | vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 842 | vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 843 | vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 844 | |
| 845 | // load linc32 mask in zmm register; linc32 increments the counter by 32 |
| 846 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32 |
| 847 | |
| 848 | // xmm31 contains the key shuffle mask. |
| 849 | movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 850 | // The load key function loads a 128-bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512-bit value. |
| 851 | // For broadcasting the values to ZMM, evshufi64x2 is used instead of evbroadcasti64x2, as the source in this case is a register |
| 852 | // that holds the shuffled key value. |
| 853 | ev_load_key(xmm20, key, 0, xmm31); |
| 854 | ev_load_key(xmm21, key, 1 * 16, xmm31); |
| 855 | ev_load_key(xmm22, key, 2 * 16, xmm31); |
| 856 | ev_load_key(xmm23, key, 3 * 16, xmm31); |
| 857 | ev_load_key(xmm24, key, 4 * 16, xmm31); |
| 858 | ev_load_key(xmm25, key, 5 * 16, xmm31); |
| 859 | ev_load_key(xmm26, key, 6 * 16, xmm31); |
| 860 | ev_load_key(xmm27, key, 7 * 16, xmm31); |
| 861 | ev_load_key(xmm28, key, 8 * 16, xmm31); |
| 862 | ev_load_key(xmm29, key, 9 * 16, xmm31); |
| 863 | ev_load_key(xmm30, key, 10 * 16, xmm31); |
| 864 | |
| 865 | // Process 32 blocks or 512 bytes of data |
| 866 | bind(LOOP); |
| 867 | cmpl(len_reg, 512); |
| 868 | jcc(Assembler::less, REMAINDER); |
| 869 | subq(len_reg, 512); |
| 870 | // Shuffle counter and XOR it with roundkey1. Result is stored in zmm0-7 |
| 871 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 872 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 873 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 874 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 875 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
| 876 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
| 877 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
| 878 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
| 879 | vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit); |
| 880 | evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit); |
| 881 | vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit); |
| 882 | evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit); |
| 883 | vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit); |
| 884 | evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit); |
| 885 | vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit); |
| 886 | evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit); |
| 887 | // Perform AES encode operations and put results in zmm0-zmm7. |
| 888 | // This is followed by incrementing counter values in zmm8-zmm15. |
| 889 | // Since we will be processing 32 blocks at a time, the counter is incremented by 32. |
| 890 | roundEnc(xmm21, 7); |
| 891 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 892 | roundEnc(xmm22, 7); |
| 893 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
| 894 | roundEnc(xmm23, 7); |
| 895 | vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit); |
| 896 | roundEnc(xmm24, 7); |
| 897 | vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit); |
| 898 | roundEnc(xmm25, 7); |
| 899 | vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit); |
| 900 | roundEnc(xmm26, 7); |
| 901 | vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit); |
| 902 | roundEnc(xmm27, 7); |
| 903 | vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit); |
| 904 | roundEnc(xmm28, 7); |
| 905 | vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit); |
| 906 | roundEnc(xmm29, 7); |
| 907 | |
| 908 | cmpl(rounds, 52); |
| 909 | jcc(Assembler::aboveEqual, AES192); |
| 910 | lastroundEnc(xmm30, 7); |
| 911 | jmp(END_LOOP); |
| 912 | |
| 913 | bind(AES192); |
| 914 | roundEnc(xmm30, 7); |
| 915 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 916 | roundEnc(xmm18, 7); |
| 917 | cmpl(rounds, 60); |
| 918 | jcc(Assembler::aboveEqual, AES256); |
| 919 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
| 920 | lastroundEnc(xmm18, 7); |
| 921 | jmp(END_LOOP); |
| 922 | |
| 923 | bind(AES256); |
| 924 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
| 925 | roundEnc(xmm18, 7); |
| 926 | ev_load_key(xmm18, key, 13 * 16, xmm31); |
| 927 | roundEnc(xmm18, 7); |
| 928 | ev_load_key(xmm18, key, 14 * 16, xmm31); |
| 929 | lastroundEnc(xmm18, 7); |
| 930 | |
| 931 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7 |
| 932 | // xor encrypted block cipher and input plaintext and store resultant ciphertext |
| 933 | bind(END_LOOP); |
| 934 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 935 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 936 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 937 | evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit); |
| 938 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 939 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 940 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 941 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 942 | evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 943 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 944 | evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 945 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 946 | evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 947 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 948 | evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 949 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 950 | addq(pos, 512); |
| 951 | jmp(LOOP); |
| 952 | |
| 953 | // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes |
| 954 | bind(REMAINDER); |
| 955 | cmpl(len_reg, 0); |
| 956 | jcc(Assembler::equal, END); |
| 957 | cmpl(len_reg, 256); |
| 958 | jcc(Assembler::aboveEqual, REMAINDER_16); |
| 959 | cmpl(len_reg, 128); |
| 960 | jcc(Assembler::aboveEqual, REMAINDER_8); |
| 961 | cmpl(len_reg, 64); |
| 962 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 963 | // At this point, we will process 16 bytes of data at a time. |
| 964 | // So load xmm19 with counter increment value as 1 |
| 965 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); |
| 966 | jmp(REMAINDER_LOOP); |
| 967 | |
| 968 | // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data |
| 969 | bind(REMAINDER_16); |
| 970 | subq(len_reg, 256); |
| 971 | // As we process 16 blocks at a time, load mask for incrementing the counter value by 16 |
| 972 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip) |
| 973 | // shuffle counter and XOR counter with roundkey1 |
| 974 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 975 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 976 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 977 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 978 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
| 979 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
| 980 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
| 981 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
| 982 | // Increment counter values by 16 |
| 983 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 984 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
| 985 | // AES encode rounds |
| 986 | roundEnc(xmm21, 3); |
| 987 | roundEnc(xmm22, 3); |
| 988 | roundEnc(xmm23, 3); |
| 989 | roundEnc(xmm24, 3); |
| 990 | roundEnc(xmm25, 3); |
| 991 | roundEnc(xmm26, 3); |
| 992 | roundEnc(xmm27, 3); |
| 993 | roundEnc(xmm28, 3); |
| 994 | roundEnc(xmm29, 3); |
| 995 | |
| 996 | cmpl(rounds, 52); |
| 997 | jcc(Assembler::aboveEqual, AES192_REMAINDER16); |
| 998 | lastroundEnc(xmm30, 3); |
| 999 | jmp(REMAINDER16_END_LOOP); |
| 1000 | |
| 1001 | bind(AES192_REMAINDER16); |
| 1002 | roundEnc(xmm30, 3); |
| 1003 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1004 | roundEnc(xmm18, 3); |
| 1005 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1006 | |
| 1007 | cmpl(rounds, 60); |
| 1008 | jcc(Assembler::aboveEqual, AES256_REMAINDER16); |
| 1009 | lastroundEnc(xmm5, 3); |
| 1010 | jmp(REMAINDER16_END_LOOP); |
| 1011 | bind(AES256_REMAINDER16); |
| 1012 | roundEnc(xmm5, 3); |
| 1013 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1014 | roundEnc(xmm6, 3); |
| 1015 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1016 | lastroundEnc(xmm7, 3); |
| 1017 | |
| 1018 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3 |
| 1019 | // xor 256 bytes of PT with the encrypted counters to produce CT. |
| 1020 | bind(REMAINDER16_END_LOOP); |
| 1021 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit); |
| 1022 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 1023 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1024 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 1025 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 1026 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 1027 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 1028 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 1029 | addq(pos, 256); |
| 1030 | |
| 1031 | cmpl(len_reg, 128); |
| 1032 | jcc(Assembler::aboveEqual, REMAINDER_8); |
| 1033 | |
| 1034 | cmpl(len_reg, 64); |
| 1035 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 1036 | //load mask for incrementing the counter value by 1 |
| 1037 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1038 | jmp(REMAINDER_LOOP); |
| 1039 | |
| 1040 | // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data |
| 1041 | bind(REMAINDER_8); |
| 1042 | subq(len_reg, 128); |
| 1043 | // As we process 8 blocks at a time, load mask for incrementing the counter value by 8 |
| 1044 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip) |
| 1045 | // shuffle counters and xor with roundkey1 |
| 1046 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 1047 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 1048 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 1049 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 1050 | // increment counter by 8 |
| 1051 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 1052 | // AES encode |
| 1053 | roundEnc(xmm21, 1); |
| 1054 | roundEnc(xmm22, 1); |
| 1055 | roundEnc(xmm23, 1); |
| 1056 | roundEnc(xmm24, 1); |
| 1057 | roundEnc(xmm25, 1); |
| 1058 | roundEnc(xmm26, 1); |
| 1059 | roundEnc(xmm27, 1); |
| 1060 | roundEnc(xmm28, 1); |
| 1061 | roundEnc(xmm29, 1); |
| 1062 | |
| 1063 | cmpl(rounds, 52); |
| 1064 | jcc(Assembler::aboveEqual, AES192_REMAINDER8); |
| 1065 | lastroundEnc(xmm30, 1); |
| 1066 | jmp(REMAINDER8_END_LOOP); |
| 1067 | |
| 1068 | bind(AES192_REMAINDER8); |
| 1069 | roundEnc(xmm30, 1); |
| 1070 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1071 | roundEnc(xmm18, 1); |
| 1072 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1073 | cmpl(rounds, 60); |
| 1074 | jcc(Assembler::aboveEqual, AES256_REMAINDER8); |
| 1075 | lastroundEnc(xmm5, 1); |
| 1076 | jmp(REMAINDER8_END_LOOP); |
| 1077 | |
| 1078 | bind(AES256_REMAINDER8); |
| 1079 | roundEnc(xmm5, 1); |
| 1080 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1081 | roundEnc(xmm6, 1); |
| 1082 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1083 | lastroundEnc(xmm7, 1); |
| 1084 | |
| 1085 | bind(REMAINDER8_END_LOOP); |
| 1086 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1 |
| 1087 | // XOR PT with the encrypted counter and store as CT |
| 1088 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1089 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 1090 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1091 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 1092 | addq(pos, 128); |
| 1093 | |
| 1094 | cmpl(len_reg, 64); |
| 1095 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 1096 | // load mask for incrementing the counter value by 1 |
| 1097 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1098 | jmp(REMAINDER_LOOP); |
| 1099 | |
| 1100 | // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code |
| 1101 | bind(REMAINDER_4); |
| 1102 | subq(len_reg, 64); |
| 1103 | // As we process 4 blocks at a time, load mask for incrementing the counter value by 4 |
| 1104 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 1105 | // XOR counter with first roundkey |
| 1106 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 1107 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 1108 | // Increment counter |
| 1109 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 1110 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit); |
| 1111 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit); |
| 1112 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit); |
| 1113 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit); |
| 1114 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit); |
| 1115 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit); |
| 1116 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit); |
| 1117 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit); |
| 1118 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit); |
| 1119 | cmpl(rounds, 52); |
| 1120 | jcc(Assembler::aboveEqual, AES192_REMAINDER4); |
| 1121 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
| 1122 | jmp(END_REMAINDER4); |
| 1123 | |
| 1124 | bind(AES192_REMAINDER4); |
| 1125 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
| 1126 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1127 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit); |
| 1128 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1129 | |
| 1130 | cmpl(rounds, 60); |
| 1131 | jcc(Assembler::aboveEqual, AES256_REMAINDER4); |
| 1132 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
| 1133 | jmp(END_REMAINDER4); |
| 1134 | |
| 1135 | bind(AES256_REMAINDER4); |
| 1136 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
| 1137 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1138 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit); |
| 1139 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1140 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit); |
| 1141 | // After AES encode rounds, the encrypted block cipher lies in zmm0. |
| 1142 | // XOR encrypted block cipher with PT and store 64 bytes of ciphertext |
| 1143 | bind(END_REMAINDER4); |
| 1144 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1145 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 1146 | addq(pos, 64); |
| 1147 | // load mask for incrementing the counter value by 1 |
| 1148 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1149 | |
| 1150 | // For a single block, the AES rounds start here. |
| 1151 | bind(REMAINDER_LOOP); |
| 1152 | cmpl(len_reg, 0); |
| 1153 | jcc(Assembler::belowEqual, END); |
| 1154 | // XOR counter with first roundkey |
| 1155 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit); |
| 1156 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit); |
| 1157 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit); |
| 1158 | // Increment counter by 1 |
| 1159 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit); |
| 1160 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit); |
| 1161 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit); |
| 1162 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit); |
| 1163 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit); |
| 1164 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit); |
| 1165 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit); |
| 1166 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit); |
| 1167 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit); |
| 1168 | |
| 1169 | cmpl(rounds, 52); |
| 1170 | jcc(Assembler::aboveEqual, AES192_REMAINDER); |
| 1171 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
| 1172 | jmp(END_REMAINDER_LOOP); |
| 1173 | |
| 1174 | bind(AES192_REMAINDER); |
| 1175 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
| 1176 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1177 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit); |
| 1178 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1179 | cmpl(rounds, 60); |
| 1180 | jcc(Assembler::aboveEqual, AES256_REMAINDER); |
| 1181 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
| 1182 | jmp(END_REMAINDER_LOOP); |
| 1183 | |
| 1184 | bind(AES256_REMAINDER); |
| 1185 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
| 1186 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1187 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit); |
| 1188 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1189 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit); |
| 1190 | |
| 1191 | bind(END_REMAINDER_LOOP); |
| 1192 | // If the length register is less than the block size (16 bytes),
| 1193 | // then we store only that many bytes of the CT to the destination,
| 1194 | // as given by the length register value.
| 1195 | // Extracting the exact number of bytes is handled by EXTRACT_TAILBYTES.
| 1196 | cmpl(len_reg, 16); |
| 1197 | jcc(Assembler::less, EXTRACT_TAILBYTES); |
| 1198 | subl(len_reg, 16); |
| 1199 | // After AES encode rounds, the encrypted block cipher lies in xmm0. |
| 1200 | // If the length register is equal to 16 bytes, store CT in dest after XOR operation. |
| 1201 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
| 1202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit); |
| 1203 | addl(pos, 16); |
| 1204 | |
| 1205 | jmp(REMAINDER_LOOP); |
| 1206 | |
| 1207 | bind(EXTRACT_TAILBYTES); |
| 1208 | // Save encrypted counter value in xmm0 for next invocation, before XOR operation |
| 1209 | movdqu(Address(saved_encCounter_start, 0), xmm0); |
| 1210 | // XOR encrypted block cipher in xmm0 with PT to produce CT
| 1211 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
| 1212 | // Extract up to 15 bytes of CT from xmm0 as specified by the length register
| 1213 | testptr(len_reg, 8); |
| 1214 | jcc(Assembler::zero, EXTRACT_TAIL_4BYTES); |
| 1215 | pextrq(Address(dest_addr, pos), xmm0, 0); |
| 1216 | psrldq(xmm0, 8); |
| 1217 | addl(pos, 8); |
| 1218 | bind(EXTRACT_TAIL_4BYTES); |
| 1219 | testptr(len_reg, 4); |
| 1220 | jcc(Assembler::zero, EXTRACT_TAIL_2BYTES); |
| 1221 | pextrd(Address(dest_addr, pos), xmm0, 0); |
| 1222 | psrldq(xmm0, 4); |
| 1223 | addq(pos, 4); |
| 1224 | bind(EXTRACT_TAIL_2BYTES); |
| 1225 | testptr(len_reg, 2); |
| 1226 | jcc(Assembler::zero, EXTRACT_TAIL_1BYTE); |
| 1227 | pextrw(Address(dest_addr, pos), xmm0, 0); |
| 1228 | psrldq(xmm0, 2); |
| 1229 | addl(pos, 2); |
| 1230 | bind(EXTRACT_TAIL_1BYTE); |
| 1231 | testptr(len_reg, 1); |
| 1232 | jcc(Assembler::zero, END); |
| 1233 | pextrb(Address(dest_addr, pos), xmm0, 0); |
| 1234 | addl(pos, 1); |
| 1235 | |
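// Illustrative sketch (not emitted code): the pextrq/pextrd/pextrw/pextrb cascade
// above is the SIMD analogue of peeling a 0..15 byte tail by its 8/4/2/1 bit
// components, roughly:
//
//   int off = 0;
//   if (len & 8) { store8(dst + off, ct + off); off += 8; }   // pextrq
//   if (len & 4) { store4(dst + off, ct + off); off += 4; }   // pextrd
//   if (len & 2) { store2(dst + off, ct + off); off += 2; }   // pextrw
//   if (len & 1) { dst[off] = ct[off]; }                      // pextrb
//
// where store8/store4/store2 stand for plain 8-, 4- and 2-byte stores; these are
// hypothetical helpers named here only for illustration.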
| 1236 | bind(END); |
| 1237 | // If there are no tail bytes, store counter value and exit |
| 1238 | cmpl(len_reg, 0); |
| 1239 | jcc(Assembler::equal, STORE_CTR); |
| 1240 | movl(Address(used_addr, 0), len_reg); |
| 1241 | |
| 1242 | bind(STORE_CTR); |
| 1243 | //shuffle updated counter and store it |
| 1244 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit); |
| 1245 | movdqu(Address(counter, 0), xmm8); |
| 1246 | // Zero out counter and key registers |
| 1247 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 1248 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 1249 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 1250 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 1251 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
| 1252 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
| 1253 | evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit); |
| 1254 | evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit); |
| 1255 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
| 1256 | evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit); |
| 1257 | evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit); |
| 1258 | evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit); |
| 1259 | cmpl(rounds, 44); |
| 1260 | jcc(Assembler::belowEqual, EXIT); |
| 1261 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
| 1262 | evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit); |
| 1263 | cmpl(rounds, 52); |
| 1264 | jcc(Assembler::belowEqual, EXIT); |
| 1265 | evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit); |
| 1266 | evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit); |
| 1267 | bind(EXIT); |
| 1268 | } |
| 1269 | |
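// gfmul_avx512 below performs a carry-less multiply of GH by HK in GF(2^128),
// followed by reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1:
// the four vpclmulqdq results form the 256-bit product per 128-bit lane, and the
// two clmul/shift phases against ghash_polynomial512 fold it back to 128 bits.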
| 1270 | void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) { |
| 1271 | const XMMRegister TMP1 = xmm0; |
| 1272 | const XMMRegister TMP2 = xmm1; |
| 1273 | const XMMRegister TMP3 = xmm2; |
| 1274 | |
| 1275 | evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit); |
| 1276 | evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit); |
| 1277 | evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit); |
| 1278 | evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit); |
| 1279 | evpxorq(GH, GH, TMP3, Assembler::AVX_512bit); |
| 1280 | vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit); |
| 1281 | vpslldq(GH, GH, 8, Assembler::AVX_512bit); |
| 1282 | evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit); |
| 1283 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
| 1284 | |
| 1285 | evmovdquq(TMP3, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, r15); |
| 1286 | evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit); |
| 1287 | vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit); |
| 1288 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
| 1289 | evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit); |
| 1290 | vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit); |
| 1291 | evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit); |
| 1292 | vpslldq(GH, GH, 4, Assembler::AVX_512bit); |
| 1293 | vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit); |
| 1294 | } |
| 1295 | |
| 1296 | void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) { |
| 1297 | const XMMRegister HK = xmm6; |
| 1298 | const XMMRegister ZT5 = xmm4; |
| 1299 | const XMMRegister ZT7 = xmm7; |
| 1300 | const XMMRegister ZT8 = xmm8; |
| 1301 | |
| 1302 | Label GFMUL_AVX512; |
| 1303 | |
| 1304 | movdqu(HK, Address(htbl, 0)); |
| 1305 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 1306 | vpshufb(HK, HK, xmm10, Assembler::AVX_128bit); |
| 1307 | |
| 1308 | movdqu(xmm11, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 64)); // Poly |
| 1309 | movdqu(xmm12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 80)); // Twoone |
| 1310 | // Compute H ^ 2 from the input subkeyH |
| 1311 | movdqu(xmm2, xmm6); |
| 1312 | vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit); |
| 1313 | vpsrlq(xmm2, xmm2, 63, Assembler::AVX_128bit); |
| 1314 | movdqu(xmm1, xmm2); |
| 1315 | vpslldq(xmm2, xmm2, 8, Assembler::AVX_128bit); |
| 1316 | vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit); |
| 1317 | vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
| 1318 | |
| 1319 | vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit); |
| 1320 | vpcmpeqd(xmm2, xmm2, xmm12, Assembler::AVX_128bit);
| 1321 | vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit); |
| 1322 | vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
| 1323 | movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2 |
| 1324 | // Compute the remaining three powers of H using XMM registers and all following powers using ZMM |
| 1325 | movdqu(ZT5, HK); |
| 1326 | vinserti32x4(ZT7, ZT7, HK, 3); |
| 1327 | |
| 1328 | gfmul_avx512(ZT5, HK); |
| 1329 | movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2 |
| 1330 | vinserti32x4(ZT7, ZT7, ZT5, 2); |
| 1331 | |
| 1332 | gfmul_avx512(ZT5, HK); |
| 1333 | movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3 |
| 1334 | vinserti32x4(ZT7, ZT7, ZT5, 1); |
| 1335 | |
| 1336 | gfmul_avx512(ZT5, HK); |
| 1337 | movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4 |
| 1338 | vinserti32x4(ZT7, ZT7, ZT5, 0); |
| 1339 | |
| 1340 | evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); |
| 1341 | evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); |
| 1342 | gfmul_avx512(ZT7, ZT5); |
| 1343 | evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit); |
| 1344 | evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); |
| 1345 | gfmul_avx512(ZT8, ZT5); |
| 1346 | evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit); |
| 1347 | gfmul_avx512(ZT7, ZT5); |
| 1348 | evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit); |
| 1349 | gfmul_avx512(ZT8, ZT5); |
| 1350 | evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit); |
| 1351 | gfmul_avx512(ZT7, ZT5); |
| 1352 | evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit); |
| 1353 | gfmul_avx512(ZT8, ZT5); |
| 1354 | evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit); |
| 1355 | gfmul_avx512(ZT7, ZT5); |
| 1356 | evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit); |
| 1357 | gfmul_avx512(ZT8, ZT5); |
| 1358 | evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit); |
| 1359 | gfmul_avx512(ZT7, ZT5); |
| 1360 | evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit); |
| 1361 | gfmul_avx512(ZT8, ZT5); |
| 1362 | evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit); |
| 1363 | gfmul_avx512(ZT7, ZT5); |
| 1364 | evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit); |
| 1365 | ret(0); |
| 1366 | } |
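// The table generated above appears to hold successive powers of the hash subkey,
// four 128-bit entries per 512-bit slot, with higher powers at lower offsets, so
// that ghash16_encrypt16_parallel can fetch four consecutive powers with a single
// 512-bit load.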
| 1367 | |
| 1368 | #define vclmul_reduce(out, poly, hi128, lo128, tmp0, tmp1) \
| 1369 | evpclmulqdq(tmp0, poly, lo128, 0x01, Assembler::AVX_512bit); \ |
| 1370 | vpslldq(tmp0, tmp0, 8, Assembler::AVX_512bit); \ |
| 1371 | evpxorq(tmp0, lo128, tmp0, Assembler::AVX_512bit); \ |
| 1372 | evpclmulqdq(tmp1, poly, tmp0, 0x00, Assembler::AVX_512bit); \ |
| 1373 | vpsrldq(tmp1, tmp1, 4, Assembler::AVX_512bit); \ |
| 1374 | evpclmulqdq(out, poly, tmp0, 0x10, Assembler::AVX_512bit); \ |
| 1375 | vpslldq(out, out, 4, Assembler::AVX_512bit); \ |
| 1376 | vpternlogq(out, 0x96, tmp1, hi128, Assembler::AVX_512bit); \ |
| 1377 | |
| 1378 | #define vhpxori4x128(reg, tmp) \
| 1379 | vextracti64x4(tmp, reg, 1); \ |
| 1380 | evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \ |
| 1381 | vextracti32x4(tmp, reg, 1); \ |
| 1382 | evpxorq(reg, reg, tmp, Assembler::AVX_128bit); \ |
| 1383 | |
| 1384 | #define roundEncode(key, dst1, dst2, dst3, dst4) \
| 1385 | vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \ |
| 1386 | vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \ |
| 1387 | vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \ |
| 1388 | vaesenc(dst4, dst4, key, Assembler::AVX_512bit); \ |
| 1389 | |
| 1390 | #define lastroundEncode(key, dst1, dst2, dst3, dst4) \
| 1391 | vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \ |
| 1392 | vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \ |
| 1393 | vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \ |
| 1394 | vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \ |
| 1395 | |
| 1396 | #define storeData(dst, position, src1, src2, src3, src4) \
| 1397 | evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \ |
| 1398 | evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \ |
| 1399 | evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \ |
| 1400 | evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \ |
| 1401 | |
| 1402 | #define loadData(src, position, dst1, dst2, dst3, dst4) \
| 1403 | evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \ |
| 1404 | evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \ |
| 1405 | evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \ |
| 1406 | evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \ |
| 1407 | |
| 1408 | #define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey) \
| 1409 | evpclmulqdq(dst00, ghdata, hkey, 0x00, Assembler::AVX_512bit); \ |
| 1410 | evpclmulqdq(dst01, ghdata, hkey, 0x01, Assembler::AVX_512bit); \ |
| 1411 | evpclmulqdq(dst10, ghdata, hkey, 0x10, Assembler::AVX_512bit); \ |
| 1412 | evpclmulqdq(dst11, ghdata, hkey, 0x11, Assembler::AVX_512bit); \ |
| 1413 | |
| 1414 | #define shuffleExorRnd1Key(dst0, dst1, dst2, dst3, shufmask, rndkey) \
| 1415 | vpshufb(dst0, dst0, shufmask, Assembler::AVX_512bit); \ |
| 1416 | evpxorq(dst0, dst0, rndkey, Assembler::AVX_512bit); \ |
| 1417 | vpshufb(dst1, dst1, shufmask, Assembler::AVX_512bit); \ |
| 1418 | evpxorq(dst1, dst1, rndkey, Assembler::AVX_512bit); \ |
| 1419 | vpshufb(dst2, dst2, shufmask, Assembler::AVX_512bit); \ |
| 1420 | evpxorq(dst2, dst2, rndkey, Assembler::AVX_512bit); \ |
| 1421 | vpshufb(dst3, dst3, shufmask, Assembler::AVX_512bit); \ |
| 1422 | evpxorq(dst3, dst3, rndkey, Assembler::AVX_512bit); \ |
| 1423 | |
| 1424 | #define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
| 1425 | evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \ |
| 1426 | evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \ |
| 1427 | evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \ |
| 1428 | evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \ |
| 1429 | |
| 1430 | #define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
| 1431 | vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \ |
| 1432 | vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \ |
| 1433 | vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \ |
| 1434 | vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \ |
| 1435 | |
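// The helper macros above each expand to four parallel 512-bit operations; they are
// used below to interleave AES rounds (roundEncode / lastroundEncode) with GHASH
// carry-less multiplies (carrylessMultiply) so the two dependency chains overlap.
// Note that vpternlogq with immediate 0x96 computes a three-way XOR of its operands.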
| 1436 | void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx, |
| 1437 | Register in, Register out, Register data, Register pos, bool first_time_reduction, XMMRegister addmask, bool ghash_input, Register rounds, |
| 1438 | Register ghash_pos, bool final_reduction, int i, XMMRegister counter_inc_mask) { |
| 1439 | |
| 1440 | Label AES_192, AES_256, LAST_AES_RND; |
| 1441 | const XMMRegister ZTMP0 = xmm0; |
| 1442 | const XMMRegister ZTMP1 = xmm3; |
| 1443 | const XMMRegister ZTMP2 = xmm4; |
| 1444 | const XMMRegister ZTMP3 = xmm5; |
| 1445 | const XMMRegister ZTMP5 = xmm7; |
| 1446 | const XMMRegister ZTMP6 = xmm10; |
| 1447 | const XMMRegister ZTMP7 = xmm11; |
| 1448 | const XMMRegister ZTMP8 = xmm12; |
| 1449 | const XMMRegister ZTMP9 = xmm13; |
| 1450 | const XMMRegister ZTMP10 = xmm15; |
| 1451 | const XMMRegister ZTMP11 = xmm16; |
| 1452 | const XMMRegister ZTMP12 = xmm17; |
| 1453 | |
| 1454 | const XMMRegister ZTMP13 = xmm19; |
| 1455 | const XMMRegister ZTMP14 = xmm20; |
| 1456 | const XMMRegister ZTMP15 = xmm21; |
| 1457 | const XMMRegister ZTMP16 = xmm30; |
| 1458 | const XMMRegister ZTMP17 = xmm31; |
| 1459 | const XMMRegister ZTMP18 = xmm1; |
| 1460 | const XMMRegister ZTMP19 = xmm2; |
| 1461 | const XMMRegister ZTMP20 = xmm8; |
| 1462 | const XMMRegister ZTMP21 = xmm22; |
| 1463 | const XMMRegister ZTMP22 = xmm23; |
| 1464 | |
| 1465 | // Pre increment counters |
| 1466 | vpaddd(ZTMP0, ctr_blockx, counter_inc_mask, Assembler::AVX_512bit); |
| 1467 | vpaddd(ZTMP1, ZTMP0, counter_inc_mask, Assembler::AVX_512bit); |
| 1468 | vpaddd(ZTMP2, ZTMP1, counter_inc_mask, Assembler::AVX_512bit); |
| 1469 | vpaddd(ZTMP3, ZTMP2, counter_inc_mask, Assembler::AVX_512bit); |
| 1470 | // Save counter value |
| 1471 | evmovdquq(ctr_blockx, ZTMP3, Assembler::AVX_512bit); |
| 1472 | |
| 1473 | // Reuse ZTMP17 / ZTMP18 for loading AES Keys |
| 1474 | // Pre-load AES round keys |
| 1475 | ev_load_key(ZTMP17, key, 0, xmm29); |
| 1476 | ev_load_key(ZTMP18, key, 1 * 16, xmm29); |
| 1477 | |
| 1478 | // ZTMP19 & ZTMP20 used for loading hash key |
| 1479 | // Pre-load hash key |
| 1480 | evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit); |
| 1481 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1482 | // Load data for computing ghash |
| 1483 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1484 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
| 1485 | |
| 1486 | // Xor cipher block 0 with input ghash, if available |
| 1487 | if (ghash_input) { |
| 1488 | evpxorq(ZTMP21, ZTMP21, aad_hashx, Assembler::AVX_512bit); |
| 1489 | } |
| 1490 | // Load data for computing ghash |
| 1491 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1492 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
| 1493 | |
| 1494 | // stitch AES rounds with GHASH |
| 1495 | // AES round 0, xmm24 has shuffle mask |
| 1496 | shuffleExorRnd1Key(ZTMP0, ZTMP1, ZTMP2, ZTMP3, xmm24, ZTMP17);
| 1497 | // Reuse ZTMP17 / ZTMP18 for loading remaining AES Keys |
| 1498 | ev_load_key(ZTMP17, key, 2 * 16, xmm29); |
| 1499 | // GHASH 4 blocks |
| 1500 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19);
| 1501 | // Load the next hkey and Ghash data |
| 1502 | evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1503 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 1504 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
| 1505 | |
| 1506 | // AES round 1 |
| 1507 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1508 | ev_load_key(ZTMP18, key, 3 * 16, xmm29); |
| 1509 | |
| 1510 | // GHASH 4 blocks(11 to 8) |
| 1511 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
| 1512 | // Load the next hkey and GDATA |
| 1513 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1514 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 1515 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
| 1516 | |
| 1517 | // AES round 2 |
| 1518 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1519 | ev_load_key(ZTMP17, key, 4 * 16, xmm29); |
| 1520 | |
| 1521 | // GHASH 4 blocks(7 to 4) |
| 1522 | carrylessMultiply(ZTMP14, ZTMP16, ZTMP15, ZTMP13, ZTMP21, ZTMP19);
| 1523 | // AES round 3
| 1524 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1525 | ev_load_key(ZTMP18, key, 5 * 16, xmm29); |
| 1526 | |
| 1527 | // Gather(XOR) GHASH for 12 blocks |
| 1528 | xorGHASH(ZTMP5, ZTMP6, ZTMP8, ZTMP7, ZTMP9, ZTMP13, ZTMP10, ZTMP14, ZTMP12, ZTMP16, ZTMP11, ZTMP15);
| 1529 | |
| 1530 | // AES round 4
| 1531 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1532 | ev_load_key(ZTMP17, key, 6 * 16, xmm29); |
| 1533 | |
| 1534 | // load plain / cipher text(recycle registers) |
| 1535 | loadData(in, pos, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
| 1536 | |
| 1537 | // AES round 5
| 1538 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1539 | ev_load_key(ZTMP18, key, 7 * 16, xmm29); |
| 1540 | // GHASH 4 blocks(3 to 0) |
| 1541 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
| 1542 | |
| 1543 | // AES round 6 |
| 1544 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1545 | ev_load_key(ZTMP17, key, 8 * 16, xmm29); |
| 1546 | |
| 1547 | // gather GHASH in ZTMP6(low) and ZTMP5(high) |
| 1548 | if (first_time_reduction) { |
| 1549 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
| 1550 | evpxorq(xmm25, ZTMP7, ZTMP11, Assembler::AVX_512bit); |
| 1551 | evpxorq(xmm27, ZTMP5, ZTMP9, Assembler::AVX_512bit); |
| 1552 | evpxorq(xmm26, ZTMP6, ZTMP10, Assembler::AVX_512bit); |
| 1553 | } |
| 1554 | else if (!first_time_reduction && !final_reduction) { |
| 1555 | xorGHASH(ZTMP7, xmm25, xmm27, xmm26, ZTMP8, ZTMP12, ZTMP7, ZTMP11, ZTMP5, ZTMP9, ZTMP6, ZTMP10);
| 1556 | } |
| 1557 | |
| 1558 | if (final_reduction) { |
| 1559 | // Phase one: Add mid products together |
| 1560 | // Also load polynomial constant for reduction |
| 1561 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
| 1562 | vpternlogq(ZTMP7, 0x96, xmm25, ZTMP11, Assembler::AVX_512bit); |
| 1563 | vpsrldq(ZTMP11, ZTMP7, 8, Assembler::AVX_512bit); |
| 1564 | vpslldq(ZTMP7, ZTMP7, 8, Assembler::AVX_512bit); |
| 1565 | evmovdquq(ZTMP12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
| 1566 | } |
| 1567 | // AES round 7 |
| 1568 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1569 | ev_load_key(ZTMP18, key, 9 * 16, xmm29); |
| 1570 | if (final_reduction) { |
| 1571 | vpternlogq(ZTMP5, 0x96, ZTMP9, ZTMP11, Assembler::AVX_512bit); |
| 1572 | evpxorq(ZTMP5, ZTMP5, xmm27, Assembler::AVX_512bit); |
| 1573 | vpternlogq(ZTMP6, 0x96, ZTMP10, ZTMP7, Assembler::AVX_512bit); |
| 1574 | evpxorq(ZTMP6, ZTMP6, xmm26, Assembler::AVX_512bit); |
| 1575 | } |
| 1576 | // AES round 8 |
| 1577 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1578 | ev_load_key(ZTMP17, key, 10 * 16, xmm29); |
| 1579 | |
| 1580 | // Horizontal xor of low and high 4*128 |
| 1581 | if (final_reduction) { |
| 1582 | vhpxori4x128(ZTMP5, ZTMP9);
| 1583 | vhpxori4x128(ZTMP6, ZTMP10);
| 1584 | } |
| 1585 | // AES round 9 |
| 1586 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1587 | // First phase of reduction |
| 1588 | if (final_reduction) { |
| 1589 | evpclmulqdq(ZTMP10, ZTMP12, ZTMP6, 0x01, Assembler::AVX_128bit); |
| 1590 | vpslldq(ZTMP10, ZTMP10, 8, Assembler::AVX_128bit); |
| 1591 | evpxorq(ZTMP10, ZTMP6, ZTMP10, Assembler::AVX_128bit); |
| 1592 | } |
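// The reduction started here is completed under LAST_AES_RND below; together the
// two clmul/shift phases fold the accumulated 256-bit GHASH product back to 128
// bits using the polynomial held in ZTMP12.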
| 1593 | cmpl(rounds, 52); |
| 1594 | jcc(Assembler::greaterEqual, AES_192); |
| 1595 | jmp(LAST_AES_RND); |
| 1596 | // AES rounds up to 11 (AES192) or 13 (AES256)
| 1597 | bind(AES_192); |
| 1598 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1599 | ev_load_key(ZTMP18, key, 11 * 16, xmm29); |
| 1600 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1601 | ev_load_key(ZTMP17, key, 12 * 16, xmm29); |
| 1602 | cmpl(rounds, 60); |
| 1603 | jcc(Assembler::aboveEqual, AES_256); |
| 1604 | jmp(LAST_AES_RND); |
| 1605 | |
| 1606 | bind(AES_256); |
| 1607 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1608 | ev_load_key(ZTMP18, key, 13 * 16, xmm29); |
| 1609 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1610 | ev_load_key(ZTMP17, key, 14 * 16, xmm29); |
| 1611 | |
| 1612 | bind(LAST_AES_RND); |
| 1613 | // Second phase of reduction |
| 1614 | if (final_reduction) { |
| 1615 | evpclmulqdq(ZTMP9, ZTMP12, ZTMP10, 0x00, Assembler::AVX_128bit); |
| 1616 | vpsrldq(ZTMP9, ZTMP9, 4, Assembler::AVX_128bit); // Shift-R 1-DW to obtain 2-DWs shift-R |
| 1617 | evpclmulqdq(ZTMP11, ZTMP12, ZTMP10, 0x10, Assembler::AVX_128bit); |
| 1618 | vpslldq(ZTMP11, ZTMP11, 4, Assembler::AVX_128bit); // Shift-L 1-DW for result |
| 1619 | // ZTMP5 = ZTMP5 X ZTMP11 X ZTMP9 |
| 1620 | vpternlogq(ZTMP5, 0x96, ZTMP11, ZTMP9, Assembler::AVX_128bit); |
| 1621 | } |
| 1622 | // Last AES round |
| 1623 | lastroundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1624 | // XOR against plain / cipher text |
| 1625 | xorBeforeStore(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
| 1626 | // store cipher / plain text |
| 1627 | storeData(out, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1628 | } |
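// Each invocation of ghash16_encrypt16_parallel consumes four 64-byte entries of
// the avx512 subkey table starting at index 'i' (loaded into ZTMP19/ZTMP20 above),
// which is why the caller advances its 'index' by 4 between calls.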
| 1629 | |
| 1630 | void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, |
| 1631 | Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) { |
| 1632 | Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32, |
| 1633 | AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16; |
| 1634 | const XMMRegister CTR_BLOCKx = xmm9; |
| 1635 | const XMMRegister AAD_HASHx = xmm14; |
| 1636 | const Register pos = rax; |
| 1637 | const Register rounds = r15; |
| 1638 | Register ghash_pos; |
| 1639 | #ifndef _WIN64 |
| 1640 | ghash_pos = r14; |
| 1641 | #else |
| 1642 | ghash_pos = r11; |
| 1643 | #endif // !_WIN64 |
| 1644 | const XMMRegister ZTMP0 = xmm0; |
| 1645 | const XMMRegister ZTMP1 = xmm3; |
| 1646 | const XMMRegister ZTMP2 = xmm4; |
| 1647 | const XMMRegister ZTMP3 = xmm5; |
| 1648 | const XMMRegister ZTMP4 = xmm6; |
| 1649 | const XMMRegister ZTMP5 = xmm7; |
| 1650 | const XMMRegister ZTMP6 = xmm10; |
| 1651 | const XMMRegister ZTMP7 = xmm11; |
| 1652 | const XMMRegister ZTMP8 = xmm12; |
| 1653 | const XMMRegister ZTMP9 = xmm13; |
| 1654 | const XMMRegister ZTMP10 = xmm15; |
| 1655 | const XMMRegister ZTMP11 = xmm16; |
| 1656 | const XMMRegister ZTMP12 = xmm17; |
| 1657 | const XMMRegister ZTMP13 = xmm19; |
| 1658 | const XMMRegister ZTMP14 = xmm20; |
| 1659 | const XMMRegister ZTMP15 = xmm21; |
| 1660 | const XMMRegister ZTMP16 = xmm30; |
| 1661 | const XMMRegister COUNTER_INC_MASK = xmm18; |
| 1662 | |
| 1663 | movl(pos, 0); // Total length processed |
| 1664 | // Min data size processed = 768 bytes |
| 1665 | cmpl(len, 768); |
| 1666 | jcc(Assembler::less, ENC_DEC_DONE); |
| 1667 | |
| 1668 | // Generate 48 constants for htbl |
| 1669 | call(GENERATE_HTBL_48_BLKS, relocInfo::none); |
| 1670 | int index = 0; // Index for choosing subkeyHtbl entry |
| 1671 | movl(ghash_pos, 0); // Pointer for ghash read and store operations |
| 1672 | |
| 1673 | // Move initial counter value and STATE value into variables |
| 1674 | movdqu(CTR_BLOCKx, Address(counter, 0)); |
| 1675 | movdqu(AAD_HASHx, Address(state, 0)); |
| 1676 | // Load lswap mask for ghash |
| 1677 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()), rbx); |
| 1678 | // Shuffle input state using lswap mask |
| 1679 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
| 1680 | |
| 1681 | // Compute #rounds for AES based on the length of the key array |
| 1682 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 1683 | |
| 1684 | // Broadcast counter value to 512 bit register |
| 1685 | evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit); |
| 1686 | // Load counter shuffle mask |
| 1687 | evmovdquq(xmm24, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, rbx); |
| 1688 | // Shuffle counter |
| 1689 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
| 1690 | |
| 1691 | // Load mask for incrementing counter |
| 1692 | evmovdquq(COUNTER_INC_MASK, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, rbx); |
| 1693 | // Pre-increment counter |
| 1694 | vpaddd(ZTMP5, CTR_BLOCKx, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, rbx); |
| 1695 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1696 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1697 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
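// ZTMP5..ZTMP8 now hold 16 counter blocks (four 128-bit blocks per ZMM register),
// i.e. 256 bytes of keystream per pass through AES_32_BLOCKS below.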
| 1698 | |
| 1699 | // Begin 32 blocks of AES processing |
| 1700 | bind(AES_32_BLOCKS); |
| 1701 | // Save incremented counter before overwriting it with AES data |
| 1702 | evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit); |
| 1703 | |
| 1704 | // Move 256 bytes of data |
| 1705 | loadData(in, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1706 | // Load key shuffle mask |
| 1707 | movdqu(xmm29, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); |
| 1708 | // Load 0th AES round key |
| 1709 | ev_load_key(ZTMP4, key, 0, xmm29); |
| 1710 | // AES-ROUND0, xmm24 has the shuffle mask |
| 1711 | shuffleExorRnd1Key(ZTMP5, ZTMP6, ZTMP7, ZTMP8, xmm24, ZTMP4);
| 1712 | |
| 1713 | for (int j = 1; j < 10; j++) { |
| 1714 | ev_load_key(ZTMP4, key, j * 16, xmm29); |
| 1715 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1716 | } |
| 1717 | ev_load_key(ZTMP4, key, 10 * 16, xmm29); |
| 1718 | // AES rounds up to 11 (AES192) or 13 (AES256)
| 1719 | cmpl(rounds, 52); |
| 1720 | jcc(Assembler::greaterEqual, AES_192); |
| 1721 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1722 | jmp(STORE_CT); |
| 1723 | |
| 1724 | bind(AES_192); |
| 1725 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1726 | ev_load_key(ZTMP4, key, 11 * 16, xmm29); |
| 1727 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1728 | cmpl(rounds, 60); |
| 1729 | jcc(Assembler::aboveEqual, AES_256); |
| 1730 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
| 1731 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1732 | jmp(STORE_CT); |
| 1733 | |
| 1734 | bind(AES_256); |
| 1735 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
| 1736 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1737 | ev_load_key(ZTMP4, key, 13 * 16, xmm29); |
| 1738 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1739 | ev_load_key(ZTMP4, key, 14 * 16, xmm29); |
| 1740 | // Last AES round |
| 1741 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1742 | |
| 1743 | bind(STORE_CT); |
| 1744 | // Xor the encrypted key with PT to obtain CT |
| 1745 | xorBeforeStore(ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1746 | storeData(out, pos, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1747 | // 16 blocks encryption completed |
| 1748 | addl(pos, 256); |
| 1749 | cmpl(pos, 512); |
| 1750 | jcc(Assembler::aboveEqual, GHASH_AES_PARALLEL); |
| 1751 | vpaddd(ZTMP5, CTR_BLOCKx, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1752 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1753 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1754 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1755 | jmp(AES_32_BLOCKS); |
| 1756 | |
| 1757 | bind(GHASH_AES_PARALLEL); |
| 1758 | // ghash16_encrypt16_parallel is invoked with one of three reduction modes:
| 1759 | // 1) First time -> the first cipher block is XORed with the input ghash
| 1760 | // 2) No reduction -> carryless multiplication results are accumulated
| 1761 | // 3) Final reduction after 48 blocks -> a new ghash value is computed for the next round
| 1762 | // Reduction value = first time |
| 1763 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1764 | addl(pos, 256); |
| 1765 | addl(ghash_pos, 256); |
| 1766 | index += 4; |
| 1767 | |
| 1768 | // At this point we have processed 768 bytes of AES and 256 bytes of GHASH. |
| 1769 | // If the remaining length is less than 768, process remaining 512 bytes of ghash in GHASH_LAST_32 code |
| 1770 | subl(len, 768); |
| 1771 | cmpl(len, 768); |
| 1772 | jcc(Assembler::less, GHASH_LAST_32); |
| 1773 | |
| 1774 | // AES 16 blocks and GHASH 16 blocks in parallel |
| 1775 | // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times |
| 1776 | // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations |
| 1777 | // Each call uses 4 subkeyHtbl values, so increment the index by 4. |
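// Note that ghash_pos stays 512 bytes behind pos throughout this pipeline, which
// is why GHASH_LAST_32 below still has 512 bytes of ciphertext left to hash once
// the AES processing is finished.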
| 1778 | bind(GHASH_16_AES_16); |
| 1779 | // Reduction value = no reduction |
| 1780 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1781 | addl(pos, 256); |
| 1782 | addl(ghash_pos, 256); |
| 1783 | index += 4; |
| 1784 | // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash |
| 1785 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); |
| 1786 | addl(pos, 256); |
| 1787 | addl(ghash_pos, 256); |
| 1788 | // The calculated ghash value needs to be moved to AAD_HASHx so that we can restart the ghash16-aes16 pipeline
| 1789 | movdqu(AAD_HASHx, ZTMP5); |
| 1790 | index = 0; // Reset subkeyHtbl index |
| 1791 | |
| 1792 | // Restart the pipeline |
| 1793 | // Reduction value = first time |
| 1794 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1795 | addl(pos, 256); |
| 1796 | addl(ghash_pos, 256); |
| 1797 | index += 4; |
Value stored to 'index' is never read | |
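// Note on the analyzer finding above: 'index' is a C++ variable that is only live
// while this stub is being generated; the table offsets are baked into the emitted
// instructions. The emitted code loops back to GHASH_16_AES_16 at run time, but the
// generator does not, so the final 'index += 4' is indeed a dead store (harmless,
// though it could be dropped to silence the warning).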
| 1798 | |
| 1799 | subl(len, 768); |
| 1800 | cmpl(len, 768); |
| 1801 | jcc(Assembler::greaterEqual, GHASH_16_AES_16); |
| 1802 | |
| 1803 | // GHASH last 32 blocks processed here |
| 1804 | // GHASH products accumulated in ZMM27, ZMM25 and ZMM26 during the GHASH16-AES16 operation are used here
| 1805 | bind(GHASH_LAST_32); |
| 1806 | // Use rbx as a pointer to the htbl; For last 32 blocks of GHASH, use key# 4-11 entry in subkeyHtbl |
| 1807 | movl(rbx, 256); |
| 1808 | // Load cipher blocks |
| 1809 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1810 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1811 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
| 1812 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
| 1813 | // Load ghash keys |
| 1814 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1815 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1816 | |
| 1817 | // Ghash blocks 0 - 3 |
| 1818 | carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15);
| 1819 | // Ghash blocks 4 - 7 |
| 1820 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP14, ZTMP16);
| 1821 | |
| 1822 | vpternlogq(ZTMP1, 0x96, ZTMP5, xmm27, Assembler::AVX_512bit); // ZTMP1 = ZTMP1 + ZTMP5 + zmm27 |
| 1823 | vpternlogq(ZTMP2, 0x96, ZTMP6, xmm26, Assembler::AVX_512bit); // ZTMP2 = ZTMP2 + ZTMP6 + zmm26 |
| 1824 | vpternlogq(ZTMP3, 0x96, ZTMP7, xmm25, Assembler::AVX_512bit); // ZTMP3 = ZTMP3 + ZTMP7 + zmm25 |
| 1825 | evpxorq(ZTMP4, ZTMP4, ZTMP8, Assembler::AVX_512bit); // ZTMP4 = ZTMP4 + ZTMP8 |
| 1826 | |
| 1827 | addl(ghash_pos, 128); |
| 1828 | addl(rbx, 128); |
| 1829 | |
| 1830 | // Ghash remaining blocks |
| 1831 | bind(LOOP); |
| 1832 | cmpl(ghash_pos, pos); |
| 1833 | jcc(Assembler::aboveEqual, ACCUMULATE); |
| 1834 | // Load next cipher blocks and corresponding ghash keys |
| 1835 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1836 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1837 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
| 1838 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
| 1839 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1840 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1841 | |
| 1842 | // ghash blocks 0 - 3 |
| 1843 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15);
| 1844 | |
| 1845 | // ghash blocks 4 - 7 |
| 1846 | carrylessMultiply(ZTMP10, ZTMP11, ZTMP12, ZTMP9, ZTMP14, ZTMP16);
| 1847 | |
| 1848 | // update sums |
| 1849 | // ZTMP1 = ZTMP1 + ZTMP5 + ZTMP9 |
| 1850 | // ZTMP2 = ZTMP2 + ZTMP6 + ZTMP10 |
| 1851 | // ZTMP3 = ZTMP3 + ZTMP7 xor ZTMP11 |
| 1852 | // ZTMP4 = ZTMP4 + ZTMP8 xor ZTMP12 |
| 1853 | xorGHASH(ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP9, ZTMP6, ZTMP10, ZTMP7, ZTMP11, ZTMP8, ZTMP12);
| 1854 | addl(ghash_pos, 128); |
| 1855 | addl(rbx, 128); |
| 1856 | jmp(LOOP); |
| 1857 | |
| 1858 | // Integrate ZTMP3/ZTMP4 into ZTMP1 and ZTMP2 |
| 1859 | bind(ACCUMULATE); |
| 1860 | evpxorq(ZTMP3, ZTMP3, ZTMP4, Assembler::AVX_512bit); |
| 1861 | vpsrldq(ZTMP7, ZTMP3, 8, Assembler::AVX_512bit); |
| 1862 | vpslldq(ZTMP8, ZTMP3, 8, Assembler::AVX_512bit); |
| 1863 | evpxorq(ZTMP1, ZTMP1, ZTMP7, Assembler::AVX_512bit); |
| 1864 | evpxorq(ZTMP2, ZTMP2, ZTMP8, Assembler::AVX_512bit); |
| 1865 | |
| 1866 | // Add the 128-bit words of ZTMP1 and ZTMP2 horizontally
| 1867 | vhpxori4x128(ZTMP1, ZTMP11);
| 1868 | vhpxori4x128(ZTMP2, ZTMP12);
| 1869 | // Load reduction polynomial and compute final reduction |
| 1870 | evmovdquq(ZTMP15, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
| 1871 | vclmul_reduce(AAD_HASHx, ZTMP15, ZTMP1, ZTMP2, ZTMP3, ZTMP4);
| 1872 | |
| 1873 | // Pre-increment counter for next operation |
| 1874 | vpaddd(CTR_BLOCKx, CTR_BLOCKx, xmm18, Assembler::AVX_128bit); |
| 1875 | // Shuffle counter and save the updated value |
| 1876 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
| 1877 | movdqu(Address(counter, 0), CTR_BLOCKx); |
| 1878 | // Load ghash lswap mask |
| 1879 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 1880 | // Shuffle ghash using lbswap_mask and store it |
| 1881 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
| 1882 | movdqu(Address(state, 0), AAD_HASHx); |
| 1883 | jmp(ENC_DEC_DONE); |
| 1884 | |
| 1885 | bind(GENERATE_HTBL_48_BLKS); |
| 1886 | generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl); |
| 1887 | |
| 1888 | bind(ENC_DEC_DONE); |
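// The total number of bytes processed is returned in rax; any remaining tail
// (less than the 768-byte minimum) is presumably handled by the caller outside
// this intrinsic.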
| 1889 | movq(rax, pos); |
| 1890 | } |
| 1891 | |
| 1892 | #endif // _LP64 |