Bug Summary

File: jdk/src/hotspot/cpu/x86/assembler_x86.hpp
Warning: line 233, column 5
Called C++ object pointer is null
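
The full path for this warning is not reproduced below, but it ends inside assembler_x86.hpp, i.e. in a member function invoked on a null Register. The most plausible trigger visible in this listing is line 786, const Register rounds = 0;: in this HotSpot version Register is a pointer typedef, so a register initialized from 0 is a null pointer, and the later movl(rounds, Address(key, ...)) at line 817 calls methods on it during instruction emission. A minimal sketch of the pattern, using hypothetical stand-in types (the real declarations live in the register/assembler headers):

    // Stand-in types for illustration only, not the real HotSpot declarations.
    class RegisterImpl {
    public:
      int encoding() const { return _encoding; }  // analyzer flags a call like this on a null 'this'
    private:
      int _encoding;
    };
    typedef RegisterImpl* Register;

    void emit_movl(Register dst) {
      int enc = dst->encoding();  // null dereference when dst was initialized from 0
      (void)enc;
    }

    void repro() {
      const Register rounds = 0;  // mirrors line 786 of the listing
      emit_movl(rounds);          // "Called C++ object pointer is null"
    }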

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name macroAssembler_x86_aes.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -mthread-model posix -fno-delete-null-pointer-checks -mframe-pointer=all -relaxed-aliasing -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/libjvm/objs/precompiled -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D _GNU_SOURCE -D _REENTRANT -D LIBC=gnu -D LINUX -D VM_LITTLE_ENDIAN -D _LP64=1 -D ASSERT -D CHECK_UNHANDLED_OOPS -D TARGET_ARCH_x86 -D INCLUDE_SUFFIX_OS=_linux -D INCLUDE_SUFFIX_CPU=_x86 -D INCLUDE_SUFFIX_COMPILER=_gcc -D TARGET_COMPILER_gcc -D AMD64 -D HOTSPOT_LIB_ARCH="amd64" -D COMPILER1 -D COMPILER2 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc/adfiles -I /home/daniel/Projects/java/jdk/src/hotspot/share -I /home/daniel/Projects/java/jdk/src/hotspot/os/linux -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix -I /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86 -I /home/daniel/Projects/java/jdk/src/hotspot/os_cpu/linux_x86 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc -I /home/daniel/Projects/java/jdk/src/hotspot/share/precompiled -I /home/daniel/Projects/java/jdk/src/hotspot/share/include -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix/include -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/support/modules_include/java.base -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/support/modules_include/java.base/linux -I /home/daniel/Projects/java/jdk/src/java.base/share/native/libjimage -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc/adfiles -I /home/daniel/Projects/java/jdk/src/hotspot/share -I /home/daniel/Projects/java/jdk/src/hotspot/os/linux -I /home/daniel/Projects/java/jdk/src/hotspot/os/posix -I /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86 -I /home/daniel/Projects/java/jdk/src/hotspot/os_cpu/linux_x86 -I /home/daniel/Projects/java/jdk/build/linux-x86_64-server-fastdebug/hotspot/variant-server/gensrc -D _FORTIFY_SOURCE=2 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem 
/usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-format-zero-length -Wno-unused-parameter -Wno-unused -Wno-parentheses -Wno-comment -Wno-unknown-pragmas -Wno-address -Wno-delete-non-virtual-dtor -Wno-char-subscripts -Wno-array-bounds -Wno-int-in-bool-context -Wno-ignored-qualifiers -Wno-missing-field-initializers -Wno-implicit-fallthrough -Wno-empty-body -Wno-strict-overflow -Wno-sequence-point -Wno-maybe-uninitialized -Wno-misleading-indentation -Wno-cast-function-type -Wno-shift-negative-value -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /home/daniel/Projects/java/jdk/make/hotspot -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -stack-protector 1 -fno-rtti -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -o /home/daniel/Projects/java/scan/2021-12-21-193737-8510-1 -x c++ /home/daniel/Projects/java/jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp

/home/daniel/Projects/java/jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp

1 /*
2 * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved.
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26#include "precompiled.hpp"
27#include "asm/assembler.hpp"
28#include "asm/assembler.inline.hpp"
29#include "runtime/stubRoutines.hpp"
30#include "macroAssembler_x86.hpp"
31
32 #ifdef _LP64
33
34void MacroAssembler::roundEnc(XMMRegister key, int rnum) {
35 for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
36 vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
37 }
38}
39
40void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) {
41 for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
42 vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
43 }
44}
45
46void MacroAssembler::roundDec(XMMRegister key, int rnum) {
47 for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
48 vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
49 }
50}
51
52void MacroAssembler::lastroundDec(XMMRegister key, int rnum) {
53 for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
54 vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
55 }
56}
57
58 // Load key and shuffle operation
59 void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
60 movdqu(xmmdst, Address(key, offset));
61 if (xmm_shuf_mask != NULL) {
62 pshufb(xmmdst, xmm_shuf_mask);
63 } else {
64 pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
65 }
66 evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
67}
68
69 // AES-ECB Encrypt Operation
70void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
71
72 const Register pos = rax;
73 const Register rounds = r12;
74
75 Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
76 push(r13);
77 push(r12);
78
79 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
80 // context for the registers used, where all instructions below are using 128-bit mode
81 // On EVEX without VL and BW, these instructions will all be AVX.
82 if (VM_Version::supports_avx512vlbw()) {
83 movl(rax, 0xffff);
84 kmovql(k1, rax);
85 }
86 push(len); // Save
87 push(rbx);
88
89 vzeroupper();
90
91 xorptr(pos, pos);
92
93 // Calculate number of rounds based on key length (128, 192, 256): the expanded key holds 4 * (rounds + 1) ints, i.e. 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
94 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
95
96 // Load Key shuf mask
97 const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
98 movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
99
100 // Load and shuffle key based on number of rounds
101 ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask);
102 ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
103 ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
104 ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask);
105 ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
106 ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
107 ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
108 ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
109 ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
110 ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
111 ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask);
112 cmpl(rounds, 52);
113 jcc(Assembler::greaterEqual, KEY_192);
114 jmp(Loop_start);
115
116 bind(KEY_192);
117 ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
118 ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
119 cmpl(rounds, 60);
120 jcc(Assembler::equal, KEY_256);
121 jmp(Loop_start);
122
123 bind(KEY_256);
124 ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
125 ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
126
127 bind(Loop_start);
128 movq(rbx, len);
129 // Divide length by 16 to convert it to number of blocks
130 shrq(len, 4);
131 shlq(rbx, 60);
132 jcc(Assembler::equal, NO_PARTS);
133 addq(len, 1);
134 // Check if number of blocks is greater than or equal to 32
135 // If true, 512 bytes are processed at a time (code marked by label LOOP)
136 // If not, 16 bytes are processed (code marked by REMAINDER label)
137 bind(NO_PARTS);
138 movq(rbx, len);
139 shrq(len, 5);
140 jcc(Assembler::equal, REMAINDER);
141 movl(r13, len);
142 // Compute number of blocks that will be processed 512 bytes at a time
143 // Subtract this from the total number of blocks which will then be processed by REMAINDER loop
144 shlq(r13, 5);
145 subq(rbx, r13);
146 //Begin processing 512 bytes
147 bind(LOOP);
148 // Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7
149 evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
150 evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
151 evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
152 evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
153 evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
154 evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
155 evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
156 evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
157 // Xor with the first round key
158 evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit);
159 evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit);
160 evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit);
161 evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit);
162 evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit);
163 evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit);
164 evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit);
165 evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit);
166 // 9 Aes encode round operations
167 roundEnc(xmm9, 7);
168 roundEnc(xmm10, 7);
169 roundEnc(xmm23, 7);
170 roundEnc(xmm12, 7);
171 roundEnc(xmm13, 7);
172 roundEnc(xmm14, 7);
173 roundEnc(xmm15, 7);
174 roundEnc(xmm16, 7);
175 roundEnc(xmm17, 7);
176 cmpl(rounds, 52);
177 jcc(Assembler::aboveEqual, AES192);
178 // Aesenclast round operation for keysize = 128
179 lastroundEnc(xmm24, 7);
180 jmp(END_LOOP);
181 //Additional 2 rounds of Aesenc operation for keysize = 192
182 bind(AES192);
183 roundEnc(xmm24, 7);
184 roundEnc(xmm19, 7);
185 cmpl(rounds, 60);
186 jcc(Assembler::aboveEqual, AES256);
187 // Aesenclast round for keysize = 192
188 lastroundEnc(xmm20, 7);
189 jmp(END_LOOP);
190 // 2 rounds of Aesenc operation and Aesenclast for keysize = 256
191 bind(AES256);
192 roundEnc(xmm20, 7);
193 roundEnc(xmm21, 7);
194 lastroundEnc(xmm22, 7);
195
196 bind(END_LOOP);
197 // Move 512 bytes of CT to destination
198 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
199 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
200 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
201 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
202 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
203 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
204 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
205 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
206
207 addq(pos, 512);
208 decq(len);
209 jcc(Assembler::notEqual, LOOP);
210
211 bind(REMAINDER);
212 vzeroupper();
213 cmpq(rbx, 0);
214 jcc(Assembler::equal, END);
215 // Process 16 bytes at a time
216 bind(LOOP2);
217 movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
218 vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit);
219 // xmm2 contains shuffled key for Aesenclast operation.
220 vmovdqu(xmm2, xmm24);
221
222 vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
223 vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
224 vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit);
225 vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
226 vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
227 vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
228 vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
229 vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
230 vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
231
232 cmpl(rounds, 52);
233 jcc(Assembler::below, LAST2);
234 vmovdqu(xmm2, xmm20);
235 vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit);
236 vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
237 cmpl(rounds, 60);
238 jcc(Assembler::below, LAST2);
239 vmovdqu(xmm2, xmm22);
240 vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
241 vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
242
243 bind(LAST2);
244 // Aesenclast round
245 vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
246 // Write 16 bytes of CT to destination
247 movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
248 addq(pos, 16);
249 decq(rbx);
250 jcc(Assembler::notEqual, LOOP2);
251
252 bind(END);
253 // Zero out the round keys
254 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
255 evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
256 evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
257 evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
258 evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
259 evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
260 evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
261 evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
262 evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
263 evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
264 evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
265 cmpl(rounds, 44);
266 jcc(Assembler::belowEqual, EXIT);
267 evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
268 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
269 cmpl(rounds, 52);
270 jcc(Assembler::belowEqual, EXIT);
271 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
272 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
273 bind(EXIT);
274 pop(rbx);
275 pop(rax); // return length
276 pop(r12);
277 pop(r13);
278}
279
280 // AES-ECB Decrypt Operation
281void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) {
282
283 Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
284 const Register pos = rax;
285 const Register rounds = r12;
286 push(r13);
287 push(r12);
288
289 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
290 // context for the registers used, where all instructions below are using 128-bit mode
291 // On EVEX without VL and BW, these instructions will all be AVX.
292 if (VM_Version::supports_avx512vlbw()) {
293 movl(rax, 0xffff);
294 kmovql(k1, rax);
295 }
296
297 push(len); // Save
298 push(rbx);
299
300 vzeroupper();
301
302 xorptr(pos, pos);
303 // Calculate number of rounds based on key length (128, 192, 256): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
304 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
305
306 // Load Key shuf mask
307 const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
308 movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
309
310 // Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption.
311 // So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
312 ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
313 ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
314 ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
315 ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
316 ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
317 ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
318 ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
319 ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
320 ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
321 ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
322 ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
323 cmpl(rounds, 52);
324 jcc(Assembler::greaterEqual, KEY_192);
325 jmp(Loop_start);
326
327 bind(KEY_192);
328 ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
329 ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
330 cmpl(rounds, 60);
331 jcc(Assembler::equal, KEY_256);
332 jmp(Loop_start);
333
334 bind(KEY_256);
335 ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
336 ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
337 bind(Loop_start);
338 movq(rbx, len);
339 // Convert input length to number of blocks
340 shrq(len, 4);
341 shlq(rbx, 60);
342 jcc(Assembler::equal, NO_PARTS);
343 addq(len, 1);
344 // Check if number of blocks is greater than or equal to 32
345 // If true, 512 bytes are processed at a time (code marked by label LOOP)
346 // If not, 16 bytes are processed (code marked by label REMAINDER)
347 bind(NO_PARTS);
348 movq(rbx, len);
349 shrq(len, 5);
350 jcc(Assembler::equal, REMAINDER);
351 movl(r13, len);
352 // Compute number of blocks that will be processed 512 bytes at a time
353 // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
354 shlq(r13, 5);
355 subq(rbx, r13);
356
357 bind(LOOP);
358 // Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7
359 evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
360 evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
361 evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
362 evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
363 evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
364 evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
365 evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
366 evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
367 // Xor with the first round key
368 evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit);
369 evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit);
370 evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit);
371 evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit);
372 evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit);
373 evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit);
374 evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit);
375 evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit);
376 // 9 rounds of Aesdec
377 roundDec(xmm10, 7);
378 roundDec(xmm11, 7);
379 roundDec(xmm12, 7);
380 roundDec(xmm13, 7);
381 roundDec(xmm14, 7);
382 roundDec(xmm15, 7);
383 roundDec(xmm16, 7);
384 roundDec(xmm17, 7);
385 roundDec(xmm18, 7);
386 cmpl(rounds, 52);
387 jcc(Assembler::aboveEqual, AES192);
388 // Aesdeclast round for keysize = 128
389 lastroundDec(xmm27, 7);
390 jmp(END_LOOP);
391
392 bind(AES192);
393 // 2 Additional rounds for keysize = 192
394 roundDec(xmm19, 7);
395 roundDec(xmm20, 7);
396 cmpl(rounds, 60);
397 jcc(Assembler::aboveEqual, AES256);
398 // Aesdeclast round for keysize = 192
399 lastroundDec(xmm27, 7);
400 jmp(END_LOOP);
401 bind(AES256);
402 // 2 Additional rounds and Aesdeclast for keysize = 256
403 roundDec(xmm21, 7);
404 roundDec(xmm22, 7);
405 lastroundDec(xmm27, 7);
406
407 bind(END_LOOP);
408 // Write 512 bytes of PT to the destination
409 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
410 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
411 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
412 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
413 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
414 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
415 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
416 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
417
418 addq(pos, 512);
419 decq(len);
420 jcc(Assembler::notEqual, LOOP);
421
422 bind(REMAINDER);
423 vzeroupper();
424 cmpq(rbx, 0);
425 jcc(Assembler::equal, END);
426 // Process 16 bytes at a time
427 bind(LOOP2);
428 movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
429 vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
430 // xmm2 contains shuffled key for Aesdeclast operation.
431 vmovdqu(xmm2, xmm27);
432
433 vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
434 vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit);
435 vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
436 vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
437 vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
438 vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
439 vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
440 vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
441 vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit);
442
443 cmpl(rounds, 52);
444 jcc(Assembler::below, LAST2);
445 vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
446 vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
447 cmpl(rounds, 60);
448 jcc(Assembler::below, LAST2);
449 vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
450 vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit);
451
452 bind(LAST2);
453 // Aesdeclast round
454 vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
455 // Write 16 bytes of PT to destination
456 movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
457 addq(pos, 16);
458 decq(rbx);
459 jcc(Assembler::notEqual, LOOP2);
460
461 bind(END);
462 // Zero out the round keys
463 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
464 evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
465 evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
466 evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit);
467 evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
468 evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
469 evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
470 evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
471 evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
472 evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
473 evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
474 evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
475 cmpl(rounds, 44);
476 jcc(Assembler::belowEqual, EXIT);
477 evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
478 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
479 cmpl(rounds, 52);
480 jcc(Assembler::belowEqual, EXIT);
481 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
482 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
483 bind(EXIT);
484 pop(rbx);
485 pop(rax); // return length
486 pop(r12);
487 pop(r13);
488}
489
490 // Multiply 128 x 128 bits, using 4 pclmulqdq operations
491void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
492 XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
493 movdqu(xmm15, Address(htbl, i * 16));
494 vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
495 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
496 vpclmulldq(tmp3, data, xmm15); // 0x00
497 vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
498 vpclmulhdq(tmp3, data, xmm15); // 0x11
499 vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
500 vpclmullqhqdq(tmp3, data, xmm15); // 0x10
501 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
502}
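
As an aside, the four pclmulqdq immediates used above correspond to the textbook decomposition of a 128 x 128 carry-less product: splitting a = a1:a0 and b = b1:b0 into 64-bit halves, a * b = (a1 * b1) * x^128 XOR (a1 * b0 XOR a0 * b1) * x^64 XOR (a0 * b0). A scalar model of the accumulation performed by schoolbookAAD, with a hypothetical clmul64 helper (a sketch of the arithmetic, not HotSpot code):

    #include <cstdint>

    struct U128 { uint64_t lo, hi; };

    // Carry-less 64 x 64 multiply, the scalar analogue of one pclmulqdq.
    static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
      out[0] = out[1] = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          out[0] ^= a << i;
          if (i) out[1] ^= a >> (64 - i);
        }
      }
    }

    // Model of schoolbookAAD: XOR the four partial products into the running
    // accumulators (tmp0 = low, tmp1 = high, tmp2 = middle), as the asm does.
    static void schoolbook_accumulate(U128 data, U128 h,
                                      U128& tmp0, U128& tmp1, U128& tmp2) {
      uint64_t p[2];
      clmul64(data.lo, h.hi, p); tmp2.lo ^= p[0]; tmp2.hi ^= p[1];  // 0x01: a0 * b1
      clmul64(data.lo, h.lo, p); tmp0.lo ^= p[0]; tmp0.hi ^= p[1];  // 0x00: a0 * b0
      clmul64(data.hi, h.hi, p); tmp1.lo ^= p[0]; tmp1.hi ^= p[1];  // 0x11: a1 * b1
      clmul64(data.hi, h.lo, p); tmp2.lo ^= p[0]; tmp2.hi ^= p[1];  // 0x10: a1 * b0
    }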
503
504 // Multiply two 128 bit numbers resulting in a 256 bit value
505 // Result of the multiplication followed by reduction is stored in state
506void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
507 const XMMRegister tmp1 = xmm4;
508 const XMMRegister tmp2 = xmm5;
509 const XMMRegister tmp3 = xmm6;
510 const XMMRegister tmp4 = xmm7;
511
512 vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0)
513 vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1)
514 vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0)
515 vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1)
516
517 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
518
519 vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
520 vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
521 vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
522 vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
523 // Follows the Shift-XOR reduction technique
524 // described in Gueron-Kounavis, May 2010
525 // First phase of reduction
526 //
527 vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed left shift << 31
528 vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed left shift << 30
529 vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit);// packed left shift << 25
530 // xor the shifted versions
531 vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
532 vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
533 vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
534 vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
535 vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete
536 //
537 // Second phase of the reduction
538 //
539 vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);// packed right shift >> 1
540 vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);// packed right shift >> 2
541 vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);// packed right shift >> 7
542 vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions
543 vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
544 vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
545 vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
546 vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state
547 ret(0);
548}
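
For orientation, the two reduction phases implement standard GCM arithmetic over the GHASH field GF(2^128), defined by

    g(x) = x^{128} + x^7 + x^2 + x + 1,   so   x^{128} \equiv x^7 + x^2 + x + 1 \pmod{g(x)}

The packed shift counts appear to follow from this polynomial: 31, 30, 25 are 32 - 1, 32 - 2 and 32 - 7, and the 1, 2, 7 in the second phase mirror the exponents of the low-degree terms; the left/right split across the two phases comes from the bit-reflected representation GHASH uses.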
549
550 // This method takes the subkey after expansion as input and generates the first power of subkey H, stored at htbl offset 1 * 16.
551 // The power of H is used in the reduction process for one-block ghash
552void MacroAssembler::generateHtbl_one_block(Register htbl) {
553 const XMMRegister t = xmm13;
554
555 // load the original subkey hash
556 movdqu(t, Address(htbl, 0));
557 // shuffle using long swap mask
558 movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
559 vpshufb(t, t, xmm10, Assembler::AVX_128bit);
560
561 // Compute H' = GFMUL(H, 2)
562 vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
563 movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
564 vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
565 movl(rax, 0xff00);
566 movdl(xmm4, rax);
567 vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
568 movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
569 vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
570 vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
571 vpslld(xmm4, t, 1, Assembler::AVX_128bit);
572 vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
573 vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2
574
575 //Adding p(x)<<1 to xmm5 which holds the reduction polynomial
576 vpxor(t, t, xmm5, Assembler::AVX_128bit);
577 movdqu(Address(htbl, 1 * 16), t); // H * 2
578
579 ret(0);
580}
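
The "H * 2" computed above is multiplication by x in GF(2^128). Setting aside the byte- and bit-reflected GHASH representation that the shuffles take care of, a scalar sketch of the doubling (illustration only, not HotSpot code):

    #include <cstdint>

    // H' = H * x in GF(2^128) with g(x) = x^128 + x^7 + x^2 + x + 1.
    // h[0] holds the low 64 bits, h[1] the high 64 bits.
    static void gf128_double(uint64_t h[2]) {
      uint64_t carry = h[1] >> 63;        // coefficient that overflows into x^128
      h[1] = (h[1] << 1) | (h[0] >> 63);
      h[0] <<= 1;
      if (carry) h[0] ^= 0x87;            // fold back: x^128 = x^7 + x^2 + x + 1
    }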
581
582 // This method takes the subkey after expansion as input and generates the remaining powers of subkey H.
583 // The powers of H are used in the reduction process for eight-block ghash
584void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
585 const XMMRegister t = xmm13;
586 const XMMRegister tmp0 = xmm1;
587 Label GFMUL;
588
589 movdqu(t, Address(htbl, 1 * 16));
590 movdqu(tmp0, t);
591
592 // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
593 call(GFMUL, relocInfo::none);
594 movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2
595 call(GFMUL, relocInfo::none);
596 movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2
597 call(GFMUL, relocInfo::none);
598 movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2
599 call(GFMUL, relocInfo::none);
600 movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2
601 call(GFMUL, relocInfo::none);
602 movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2
603 call(GFMUL, relocInfo::none);
604 movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2
605 call(GFMUL, relocInfo::none);
606 movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2
607 ret(0);
608
609 bind(GFMUL);
610 gfmul(tmp0, t);
611}
612
613 // Multiblock and single block GHASH computation using Shift XOR reduction technique
614void MacroAssembler::avx_ghash(Register input_state, Register htbl,
615 Register input_data, Register blocks) {
616
617 // temporary variables to hold input data and input state
618 const XMMRegister data = xmm1;
619 const XMMRegister state = xmm0;
620 // temporary variables to hold intermediate results
621 const XMMRegister tmp0 = xmm3;
622 const XMMRegister tmp1 = xmm4;
623 const XMMRegister tmp2 = xmm5;
624 const XMMRegister tmp3 = xmm6;
625 // temporary variables to hold byte and long swap masks
626 const XMMRegister bswap_mask = xmm2;
627 const XMMRegister lswap_mask = xmm14;
628
629 Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
630 ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
631
632 testptr(blocks, blocks);
633 jcc(Assembler::zero, EXIT_GHASH);
634
635 // Check if Hashtable (1*16) has been already generated
636 // For anything less than 8 blocks, we generate only the first power of H.
637 movdqu(tmp2, Address(htbl, 1 * 16));
638 ptest(tmp2, tmp2);
639 jcc(Assembler::notZero, BEGIN_PROCESS);
640 call(GENERATE_HTBL_1_BLK, relocInfo::none);
641
642 // Shuffle the input state
643 bind(BEGIN_PROCESS);
644 movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
645 movdqu(state, Address(input_state, 0));
646 vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
647
648 cmpl(blocks, 8);
649 jcc(Assembler::below, ONE_BLK_INIT);
650 // If we have 8 blocks or more data, then generate remaining powers of H
651 movdqu(tmp2, Address(htbl, 8 * 16));
652 ptest(tmp2, tmp2);
653 jcc(Assembler::notZero, PROCESS_8_BLOCKS);
654 call(GENERATE_HTBL_8_BLKS, relocInfo::none);
655
656 //Do 8 multiplies followed by a reduction processing 8 blocks of data at a time
657 //Each block = 16 bytes.
658 bind(PROCESS_8_BLOCKS);
659 subl(blocks, 8);
660 movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
661 movdqu(data, Address(input_data, 16 * 7));
662 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
663 //Loading from offset 1 * 16, as the calculated powers of H start at that location.
664 movdqu(xmm15, Address(htbl, 1 * 16));
665 //Perform carryless multiplication of (H*2, data block #7)
666 vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
667 vpclmulldq(tmp0, data, xmm15);//a0 * b0
668 vpclmulhdq(tmp1, data, xmm15);//a1 * b1
669 vpclmullqhqdq(tmp3, data, xmm15);//a1* b0
670 vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)
671
672 movdqu(data, Address(input_data, 16 * 6));
673 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
674 // Perform carryless multiplication of (H^2 * 2, data block #6)
675 schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
676
677 movdqu(data, Address(input_data, 16 * 5));
678 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
679 // Perform carryless multiplication of (H^3 * 2, data block #5)
680 schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
681 movdqu(data, Address(input_data, 16 * 4));
682 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
683 // Perform carryless multiplication of (H^4 * 2, data block #4)
684 schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
685 movdqu(data, Address(input_data, 16 * 3));
686 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
687 // Perform carryless multiplication of (H^5 * 2, data block #3)
688 schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
689 movdqu(data, Address(input_data, 16 * 2));
690 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
691 // Perform carryless multiplication of (H^6 * 2, data block #2)
692 schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
693 movdqu(data, Address(input_data, 16 * 1));
694 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
695 // Perform carryless multiplication of (H^7 * 2, data block #1)
696 schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
697 movdqu(data, Address(input_data, 16 * 0));
698 // xor data block #0 with input state before performing carry-less multiplication
699 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
700 vpxor(data, data, state, Assembler::AVX_128bit);
701 // Perform carryless multiplication of (H^8 * 2, data block #0)
702 schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
703 vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
704 vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
705 vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contain aggregated results of
706 vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation
707
708 // we have the two 128-bit partially accumulated multiplication results in tmp0:tmp1
709 // with the higher 128 bits in tmp1 and the lower 128 bits in tmp0
710 // Follows the Shift-XOR reduction technique
711 // described in Gueron-Kounavis, May 2010
712 bind(BLOCK8_REDUCTION);
713 // First Phase of the reduction
714 vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed left shift << 31
715 vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed left shift << 30
716 vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift << 25
717 // xor the shifted versions
718 vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
719 vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
720
721 vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
722 vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
723
724 vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
725 // second phase of the reduction
726 vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed right shift >> 1
727 vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift >> 2
728 vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed right shift >> 7
729 // xor the shifted versions
730 vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
731 vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
732 vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
733 vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
734 // Final result is in state
735 vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
736
737 lea(input_data, Address(input_data, 16 * 8));
738 cmpl(blocks, 8);
739 jcc(Assembler::below, ONE_BLK_INIT);
740 jmp(PROCESS_8_BLOCKS);
741
742 // Since this is a one-block operation we will only use H * 2, i.e. the first power of H
743 bind(ONE_BLK_INIT);
744 movdqu(tmp0, Address(htbl, 1 * 16));
745 movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
746
747 //Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
748 bind(PROCESS_1_BLOCK);
749 cmpl(blocks, 0);
750 jcc(Assembler::equal, SAVE_STATE);
751 subl(blocks, 1);
752 movdqu(data, Address(input_data, 0));
753 vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
754 vpxor(state, state, data, Assembler::AVX_128bit);
755 // gfmul(H*2, state)
756 call(GFMUL, relocInfo::none);
757 addptr(input_data, 16);
758 jmp(PROCESS_1_BLOCK);
759
760 bind(SAVE_STATE);
761 vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
762 movdqu(Address(input_state, 0), state);
763 jmp(EXIT_GHASH);
764
765 bind(GFMUL);
766 gfmul(tmp0, state);
767
768 bind(GENERATE_HTBL_1_BLK);
769 generateHtbl_one_block(htbl);
770
771 bind(GENERATE_HTBL_8_BLKS);
772 generateHtbl_eight_blocks(htbl);
773
774 bind(EXIT_GHASH);
775 // zero out xmm registers used for Htbl storage
776 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
777 vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
778 vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
779 vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
780}
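
The eight-block path above is the standard aggregated-reduction form of GHASH: rather than reducing after every block, eight carry-less multiplies against decreasing powers of H are XOR-accumulated and reduced once. With B0..B7 the eight input blocks and Y the incoming state, the loop computes

    Y \leftarrow (Y \oplus B_0) H^8 \oplus B_1 H^7 \oplus \cdots \oplus B_7 H    (in GF(2^128))

which equals eight applications of Y <- (Y XOR Bi) * H. This is why block #0 is XORed with the state and paired with the highest stored power (htbl offset 8 * 16), while block #7 is paired with H itself (htbl offset 1 * 16).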
781
782 // AES Counter Mode using VAES instructions
783void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
784 Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
785
786 const Register rounds = 0;
787 const Register pos = r12;
788
789 Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
790 AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
791 REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER,
792 AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
793 AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
794 EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
795
796 cmpl(len_reg, 0);
797 jcc(Assembler::belowEqual, EXIT);
798
799 movl(pos, 0);
800 // if the number of used encrypted counter bytes < 16,
801 // XOR PT with saved encrypted counter to obtain CT
802 bind(PRELOOP_START);
803 cmpl(used, 16);
804 jcc(Assembler::aboveEqual, EXIT_PRELOOP);
805 movb(rbx, Address(saved_encCounter_start, used));
806 xorb(rbx, Address(src_addr, pos));
807 movb(Address(dest_addr, pos), rbx);
808 addptr(pos, 1);
809 addptr(used, 1);
810 decrement(len_reg);
811 jmp(PRELOOP_START);
812
813 bind(EXIT_PRELOOP);
814 movl(Address(used_addr, 0), used);
815
816 // Calculate number of rounds, i.e. 10, 12, 14, based on key length (128, 192, 256).
817 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
818
819 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
820 // Move initial counter value in xmm0
821 movdqu(xmm0, Address(counter, 0));
822 // broadcast counter value to zmm8
823 evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);
824
825 // load lbswap mask
826 evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15);
827
828 //shuffle counter using lbswap_mask
829 vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
830
831 // pre-increment and propagate counter values to zmm9-zmm15 registers.
832 // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4
833 // The counter is incremented after each block, i.e. after 16 bytes are processed;
834 // each zmm register has 4 counter values as its MSB
835 // the counters are incremented in parallel
836 vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0
837 vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip)
838 vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
839 vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
840 vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
841 vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
842 vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
843 vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
844
845 // load linc32 mask in zmm register. Linc32 increments counter by 32
846 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32
847
848 // xmm31 contains the key shuffle mask.
849 movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
850 // The load-key function loads a 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
851 // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2, as the source in this case is a ZMM register
852 // that holds the shuffled key value.
853 ev_load_key(xmm20, key, 0, xmm31);
854 ev_load_key(xmm21, key, 1 * 16, xmm31);
855 ev_load_key(xmm22, key, 2 * 16, xmm31);
856 ev_load_key(xmm23, key, 3 * 16, xmm31);
857 ev_load_key(xmm24, key, 4 * 16, xmm31);
858 ev_load_key(xmm25, key, 5 * 16, xmm31);
859 ev_load_key(xmm26, key, 6 * 16, xmm31);
860 ev_load_key(xmm27, key, 7 * 16, xmm31);
861 ev_load_key(xmm28, key, 8 * 16, xmm31);
862 ev_load_key(xmm29, key, 9 * 16, xmm31);
863 ev_load_key(xmm30, key, 10 * 16, xmm31);
864
865 // Process 32 blocks or 512 bytes of data
866 bind(LOOP);
867 cmpl(len_reg, 512);
868 jcc(Assembler::less, REMAINDER);
869 subq(len_reg, 512);
870 //Shuffle counter and XOR it with roundkey1. Result is stored in zmm0-7
871 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
872 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
873 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
874 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
875 vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
876 evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
877 vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
878 evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
879 vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
880 evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
881 vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
882 evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
883 vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
884 evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
885 vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
886 evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
887 // Perform AES encode operations and put results in zmm0-zmm7.
888 // This is followed by incrementing counter values in zmm8-zmm15.
889 // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
890 roundEnc(xmm21, 7);
891 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
892 roundEnc(xmm22, 7);
893 vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
894 roundEnc(xmm23, 7);
895 vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
896 roundEnc(xmm24, 7);
897 vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
898 roundEnc(xmm25, 7);
899 vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
900 roundEnc(xmm26, 7);
901 vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
902 roundEnc(xmm27, 7);
903 vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
904 roundEnc(xmm28, 7);
905 vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
906 roundEnc(xmm29, 7);
907
908 cmpl(rounds, 52);
909 jcc(Assembler::aboveEqual, AES192);
910 lastroundEnc(xmm30, 7);
911 jmp(END_LOOP);
912
913 bind(AES192);
914 roundEnc(xmm30, 7);
915 ev_load_key(xmm18, key, 11 * 16, xmm31);
916 roundEnc(xmm18, 7);
917 cmpl(rounds, 60);
918 jcc(Assembler::aboveEqual, AES256);
919 ev_load_key(xmm18, key, 12 * 16, xmm31);
920 lastroundEnc(xmm18, 7);
921 jmp(END_LOOP);
922
923 bind(AES256);
924 ev_load_key(xmm18, key, 12 * 16, xmm31);
925 roundEnc(xmm18, 7);
926 ev_load_key(xmm18, key, 13 * 16, xmm31);
927 roundEnc(xmm18, 7);
928 ev_load_key(xmm18, key, 14 * 16, xmm31);
929 lastroundEnc(xmm18, 7);
930
931 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
932 // xor encrypted block cipher and input plaintext and store resultant ciphertext
933 bind(END_LOOP);
934 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
935 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
936 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
937 evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
938 evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
939 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
940 evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
941 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
942 evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
943 evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
944 evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
945 evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
946 evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
947 evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
948 evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
949 evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
950 addq(pos, 512);
951 jmp(LOOP);
952
953 // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
954 bind(REMAINDER);
955 cmpl(len_reg, 0);
956 jcc(Assembler::equal, END);
957 cmpl(len_reg, 256);
958 jcc(Assembler::aboveEqual, REMAINDER_16);
959 cmpl(len_reg, 128);
960 jcc(Assembler::aboveEqual, REMAINDER_8);
961 cmpl(len_reg, 64);
962 jcc(Assembler::aboveEqual, REMAINDER_4);
963 // At this point, we will process 16 bytes of data at a time.
964 // So load xmm19 with counter increment value as 1
965 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);
966 jmp(REMAINDER_LOOP);
967
968 // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
969 bind(REMAINDER_16);
970 subq(len_reg, 256);
971 // As we process 16 blocks at a time, load mask for incrementing the counter value by 16
972 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip)
973 // shuffle counter and XOR counter with roundkey1
974 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
975 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
976 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
977 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
978 vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
979 evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
980 vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
981 evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
982 // Increment counter values by 16
983 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
984 vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
985 // AES encode rounds
986 roundEnc(xmm21, 3);
987 roundEnc(xmm22, 3);
988 roundEnc(xmm23, 3);
989 roundEnc(xmm24, 3);
990 roundEnc(xmm25, 3);
991 roundEnc(xmm26, 3);
992 roundEnc(xmm27, 3);
993 roundEnc(xmm28, 3);
994 roundEnc(xmm29, 3);
995
996 cmpl(rounds, 52);
997 jcc(Assembler::aboveEqual, AES192_REMAINDER16);
998 lastroundEnc(xmm30, 3);
999 jmp(REMAINDER16_END_LOOP);
1000
1001 bind(AES192_REMAINDER16);
1002 roundEnc(xmm30, 3);
1003 ev_load_key(xmm18, key, 11 * 16, xmm31);
1004 roundEnc(xmm18, 3);
1005 ev_load_key(xmm5, key, 12 * 16, xmm31);
1006
1007 cmpl(rounds, 60);
1008 jcc(Assembler::aboveEqual, AES256_REMAINDER16);
1009 lastroundEnc(xmm5, 3);
1010 jmp(REMAINDER16_END_LOOP);
1011 bind(AES256_REMAINDER16);
1012 roundEnc(xmm5, 3);
1013 ev_load_key(xmm6, key, 13 * 16, xmm31);
1014 roundEnc(xmm6, 3);
1015 ev_load_key(xmm7, key, 14 * 16, xmm31);
1016 lastroundEnc(xmm7, 3);
1017
1018 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
1019 // xor 256 bytes of PT with the encrypted counters to produce CT.
1020 bind(REMAINDER16_END_LOOP);
1021 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
1022 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
1023 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1024 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
1025 evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
1026 evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
1027 evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
1028 evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
1029 addq(pos, 256);
1030
1031 cmpl(len_reg, 128);
1032 jcc(Assembler::aboveEqual, REMAINDER_8);
1033
1034 cmpl(len_reg, 64);
1035 jcc(Assembler::aboveEqual, REMAINDER_4);
1036 //load mask for incrementing the counter value by 1
1037 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1038 jmp(REMAINDER_LOOP);
1039
1040 // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
1041 bind(REMAINDER_8);
1042 subq(len_reg, 128);
1043 // As we process 8 blocks at a time, load mask for incrementing the counter value by 8
1044 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip)
1045 // shuffle counters and xor with roundkey1
1046 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
1047 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
1048 vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
1049 evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
1050 // increment counter by 8
1051 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
1052 // AES encode
1053 roundEnc(xmm21, 1);
1054 roundEnc(xmm22, 1);
1055 roundEnc(xmm23, 1);
1056 roundEnc(xmm24, 1);
1057 roundEnc(xmm25, 1);
1058 roundEnc(xmm26, 1);
1059 roundEnc(xmm27, 1);
1060 roundEnc(xmm28, 1);
1061 roundEnc(xmm29, 1);
1062
1063 cmpl(rounds, 52);
1064 jcc(Assembler::aboveEqual, AES192_REMAINDER8);
1065 lastroundEnc(xmm30, 1);
1066 jmp(REMAINDER8_END_LOOP);
1067
1068 bind(AES192_REMAINDER8);
1069 roundEnc(xmm30, 1);
1070 ev_load_key(xmm18, key, 11 * 16, xmm31);
1071 roundEnc(xmm18, 1);
1072 ev_load_key(xmm5, key, 12 * 16, xmm31);
1073 cmpl(rounds, 60);
1074 jcc(Assembler::aboveEqual, AES256_REMAINDER8);
1075 lastroundEnc(xmm5, 1);
1076 jmp(REMAINDER8_END_LOOP);
1077
1078 bind(AES256_REMAINDER8);
1079 roundEnc(xmm5, 1);
1080 ev_load_key(xmm6, key, 13 * 16, xmm31);
1081 roundEnc(xmm6, 1);
1082 ev_load_key(xmm7, key, 14 * 16, xmm31);
1083 lastroundEnc(xmm7, 1);
1084
1085 bind(REMAINDER8_END_LOOP);
1086 // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
1087 // XOR PT with the encrypted counter and store as CT
1088 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1089 evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
1090 evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1091 evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
1092 addq(pos, 128);
1093
1094 cmpl(len_reg, 64);
1095 jcc(Assembler::aboveEqual, REMAINDER_4);
1096 // load mask for incrementing the counter value by 1
1097 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1098 jmp(REMAINDER_LOOP);
1099
1100 // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
1101 bind(REMAINDER_4);
1102 subq(len_reg, 64);
1103 // As we process 4 blocks at a time, load mask for incrementing the counter value by 4
1104 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
1105 // XOR counter with first roundkey
1106 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
1107 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
1108 // Increment counter
1109 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
1110 vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
1111 vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
1112 vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
1113 vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
1114 vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
1115 vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
1116 vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
1117 vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
1118 vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
1119 cmpl(rounds, 52);
1120 jcc(Assembler::aboveEqual, AES192_REMAINDER4);
1121 vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
1122 jmp(END_REMAINDER4);
1123
1124 bind(AES192_REMAINDER4);
1125 vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
1126 ev_load_key(xmm18, key, 11 * 16, xmm31);
1127 vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
1128 ev_load_key(xmm5, key, 12 * 16, xmm31);
1129
1130 cmpl(rounds, 60);
1131 jcc(Assembler::aboveEqual, AES256_REMAINDER4);
1132 vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
1133 jmp(END_REMAINDER4);
1134
1135 bind(AES256_REMAINDER4);
1136 vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
1137 ev_load_key(xmm6, key, 13 * 16, xmm31);
1138 vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
1139 ev_load_key(xmm7, key, 14 * 16, xmm31);
1140 vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
1141 // After AES encode rounds, the encrypted block cipher lies in zmm0.
1142 // XOR encrypted block cipher with PT and store 64 bytes of ciphertext
1143 bind(END_REMAINDER4);
1144 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1145 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
1146 addq(pos, 64);
1147 // load mask for incrementing the counter value by 1
1148 evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
1149
1150 // For a single block, the AES rounds start here.
1151 bind(REMAINDER_LOOP);
1152 cmpl(len_reg, 0);
1153 jcc(Assembler::belowEqual, END);
1154 // XOR counter with first roundkey
1155 vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
1156 evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
1157 vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
1158 // Increment counter by 1
1159 vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
1160 vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
1161 vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
1162 vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
1163 vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
1164 vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
1165 vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
1166 vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
1167 vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);
1168
1169 cmpl(rounds, 52);
1170 jcc(Assembler::aboveEqual, AES192_REMAINDER);
1171 vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
1172 jmp(END_REMAINDER_LOOP);
1173
1174 bind(AES192_REMAINDER);
1175 vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
1176 ev_load_key(xmm18, key, 11 * 16, xmm31);
1177 vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
1178 ev_load_key(xmm5, key, 12 * 16, xmm31);
1179 cmpl(rounds, 60);
1180 jcc(Assembler::aboveEqual, AES256_REMAINDER);
1181 vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
1182 jmp(END_REMAINDER_LOOP);
1183
1184 bind(AES256_REMAINDER);
1185 vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
1186 ev_load_key(xmm6, key, 13 * 16, xmm31);
1187 vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
1188 ev_load_key(xmm7, key, 14 * 16, xmm31);
1189 vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);
1190
1191 bind(END_REMAINDER_LOOP);
1192 // If the length register is less than the block size, i.e. 16,
1193 // then we store only that many bytes of the CT to the destination,
1194 // as given by the length register value;
1195 // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
1196 cmpl(len_reg, 16);
1197 jcc(Assembler::less, EXTRACT_TAILBYTES);
1198 subl(len_reg, 16);
1199 // After AES encode rounds, the encrypted block cipher lies in xmm0.
1200 // If the length register is equal to 16 bytes, store CT in dest after XOR operation.
1201 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
1202 evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
1203 addl(pos, 16);
1204
1205 jmp(REMAINDER_LOOP);
1206
1207 bind(EXTRACT_TAILBYTES);
1208 // Save encrypted counter value in xmm0 for next invocation, before XOR operation
1209 movdqu(Address(saved_encCounter_start, 0), xmm0);
1210 // XOR encrypted block cipher in xmm0 with PT to produce CT
1211 evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
1212 // extract up to 15 bytes of CT from xmm0, as specified by the length register
1213 testptr(len_reg, 8);
1214 jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
1215 pextrq(Address(dest_addr, pos), xmm0, 0);
1216 psrldq(xmm0, 8);
1217 addl(pos, 8);
1218 bind(EXTRACT_TAIL_4BYTES);
1219 testptr(len_reg, 4);
1220 jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
1221 pextrd(Address(dest_addr, pos), xmm0, 0);
1222 psrldq(xmm0, 4);
1223 addq(pos, 4);
1224 bind(EXTRACT_TAIL_2BYTES);
1225 testptr(len_reg, 2);
1226 jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
1227 pextrw(Address(dest_addr, pos), xmm0, 0);
1228 psrldq(xmm0, 2);
1229 addl(pos, 2);
1230 bind(EXTRACT_TAIL_1BYTE);
1231 testptr(len_reg, 1);
1232 jcc(Assembler::zero, END);
1233 pextrb(Address(dest_addr, pos), xmm0, 0);
1234 addl(pos, 1);
1235
1236 bind(END);
1237 // If there are no tail bytes, store counter value and exit
1238 cmpl(len_reg, 0);
1239 jcc(Assembler::equal, STORE_CTR);
1240 movl(Address(used_addr, 0), len_reg);
1241
1242 bind(STORE_CTR);
1243 // Shuffle updated counter and store it
1244 vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
1245 movdqu(Address(counter, 0), xmm8);
1246 // Zero out counter and key registers
1247 evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
1248 evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
1249 evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
1250 evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
1251 evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
1252 evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
1253 evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
1254 evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
1255 evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
1256 evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
1257 evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
1258 evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
1259 cmpl(rounds, 44);
1260 jcc(Assembler::belowEqual, EXIT);
1261 evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
1262 evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
1263 cmpl(rounds, 52);
1264 jcc(Assembler::belowEqual, EXIT);
1265 evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
1266 evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
1267 bind(EXIT);
1268}
1269
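The EXTRACT_TAILBYTES cascade above writes the final partial block one power-of-two chunk at a time, testing one bit of the length register per step and shifting the consumed bytes out of xmm0 with psrldq. A minimal scalar sketch of the same idea, assuming plain byte buffers (the names below are illustrative, not HotSpot APIs):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy len (0..15) tail bytes of 'block' to 'dst'. Each set bit of len
// (8, 4, 2, 1) stores one chunk and advances past it, just as pextrq /
// pextrd / pextrw / pextrb store a chunk and psrldq shifts the register.
static void copy_tail(uint8_t* dst, const uint8_t* block, size_t len) {
  size_t off = 0;
  if (len & 8) { std::memcpy(dst + off, block + off, 8); off += 8; }
  if (len & 4) { std::memcpy(dst + off, block + off, 4); off += 4; }
  if (len & 2) { std::memcpy(dst + off, block + off, 2); off += 2; }
  if (len & 1) { dst[off] = block[off]; }
}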
1270void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) {
1271 const XMMRegister TMP1 = xmm0;
1272 const XMMRegister TMP2 = xmm1;
1273 const XMMRegister TMP3 = xmm2;
1274
1275 evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit);
1276 evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit);
1277 evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit);
1278 evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit);
1279 evpxorq(GH, GH, TMP3, Assembler::AVX_512bit);
1280 vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit);
1281 vpslldq(GH, GH, 8, Assembler::AVX_512bit);
1282 evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit);
1283 evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
1284
1285 evmovdquq(TMP3, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, r15);
1286 evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit);
1287 vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit);
1288 evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
1289 evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit);
1290 vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit);
1291 evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit);
1292 vpslldq(GH, GH, 4, Assembler::AVX_512bit);
1293 vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit);
1294}
1295
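gfmul_avx512 above builds the four 64x64 partial products of a 128-bit carry-less multiply with the vpclmulqdq selectors 0x00, 0x01, 0x10 and 0x11, folds the cross terms together, and then reduces the 256-bit product with the GHASH polynomial. A minimal scalar model of the primitive, shown only to fix terminology (illustrative, not HotSpot code):

#include <cstdint>

// Carry-less (polynomial) multiply over GF(2): XOR replaces addition, so
// no carries propagate. Each vpclmulqdq computes the full product of one
// selected qword pair per lane; 0x00 = low*low, 0x11 = high*high, and
// 0x01 / 0x10 are the two cross terms.
static uint64_t clmul64_lo(uint64_t a, uint64_t b) {
  uint64_t acc = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) acc ^= a << i;  // low 64 bits of the product
  }
  return acc;
}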
1296void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) {
1297 const XMMRegister HK = xmm6;
1298 const XMMRegister ZT5 = xmm4;
1299 const XMMRegister ZT7 = xmm7;
1300 const XMMRegister ZT8 = xmm8;
1301
1302 Label GFMUL_AVX512;
1303
1304 movdqu(HK, Address(htbl, 0));
1305 movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
1306 vpshufb(HK, HK, xmm10, Assembler::AVX_128bit);
1307
1308 movdqu(xmm11, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 64)); // Poly
1309 movdqu(xmm12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 80)); // Twoone
1310 // Compute H ^ 2 from the input subkeyH
1311 movdqu(xmm2, xmm6);
1312 vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit);
1313 vpsrlq(xmm2, xmm2, 63, Assembler::AVX_128bit);
1314 movdqu(xmm1, xmm2);
1315 vpslldq(xmm2, xmm2, 8, Assembler::AVX_128bit);
1316 vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit);
1317 vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
1318
1319 vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit);
1320 vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit);
1321 vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit);
1322 vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
1323 movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2
1324 // Compute the remaining three powers of H using XMM registers and all following powers using ZMM
1325 movdqu(ZT5, HK);
1326 vinserti32x4(ZT7, ZT7, HK, 3);
1327
1328 gfmul_avx512(ZT5, HK);
1329 movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2
1330 vinserti32x4(ZT7, ZT7, ZT5, 2);
1331
1332 gfmul_avx512(ZT5, HK);
1333 movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3
1334 vinserti32x4(ZT7, ZT7, ZT5, 1);
1335
1336 gfmul_avx512(ZT5, HK);
1337 movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4
1338 vinserti32x4(ZT7, ZT7, ZT5, 0);
1339
1340 evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit);
1341 evmovdquq(ZT8, ZT7, Assembler::AVX_512bit);
1342 gfmul_avx512(ZT7, ZT5);
1343 evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit);
1344 evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit);
1345 gfmul_avx512(ZT8, ZT5);
1346 evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit);
1347 gfmul_avx512(ZT7, ZT5);
1348 evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit);
1349 gfmul_avx512(ZT8, ZT5);
1350 evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit);
1351 gfmul_avx512(ZT7, ZT5);
1352 evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit);
1353 gfmul_avx512(ZT8, ZT5);
1354 evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit);
1355 gfmul_avx512(ZT7, ZT5);
1356 evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit);
1357 gfmul_avx512(ZT8, ZT5);
1358 evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit);
1359 gfmul_avx512(ZT7, ZT5);
1360 evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit);
1361 gfmul_avx512(ZT8, ZT5);
1362 evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit);
1363 gfmul_avx512(ZT7, ZT5);
1364 evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit);
1365 ret(0);
1366}
1367
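generateHtbl_48_block_zmm above precomputes 48 hash-key powers at descending 16-byte offsets (16 * 47 down to 16 * 0) so the bulk GHASH loop can fetch four keys per 512-bit load. One plausible scalar reading of the layout, assuming a gfmul(a, b) helper (hypothetical) and taking the slot contents from the inline comments (H^2 in the highest slot, each lower slot one H^2-step further):

struct Block { uint64_t lo, hi; };   // one 128-bit GHASH operand
Block gfmul(Block a, Block b);       // GF(2^128) multiply with reduction

void fill_htbl(Block htbl[48], Block h2) {  // h2 = H^2 from the setup above
  Block p = h2;
  for (int slot = 47; slot >= 0; slot--) {
    htbl[slot] = p;                  // slot 47 - k holds H^(2 * (k + 1))
    p = gfmul(p, h2);
  }
}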
1368#define vclmul_reduce(out, poly, hi128, lo128, tmp0, tmp1) \
1369evpclmulqdq(tmp0, poly, lo128, 0x01, Assembler::AVX_512bit); \
1370vpslldq(tmp0, tmp0, 8, Assembler::AVX_512bit); \
1371evpxorq(tmp0, lo128, tmp0, Assembler::AVX_512bit); \
1372evpclmulqdq(tmp1, poly, tmp0, 0x00, Assembler::AVX_512bit); \
1373vpsrldq(tmp1, tmp1, 4, Assembler::AVX_512bit); \
1374evpclmulqdq(out, poly, tmp0, 0x10, Assembler::AVX_512bit); \
1375vpslldq(out, out, 4, Assembler::AVX_512bit); \
1376vpternlogq(out, 0x96, tmp1, hi128, Assembler::AVX_512bit); \
1377
1378#define vhpxori4x128(reg, tmp) \
1379vextracti64x4(tmp, reg, 1); \
1380evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \
1381vextracti32x4(tmp, reg, 1); \
1382evpxorq(reg, reg, tmp, Assembler::AVX_128bit); \
1383
1384#define roundEncode(key, dst1, dst2, dst3, dst4) \
1385vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \
1386vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \
1387vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \
1388vaesenc(dst4, dst4, key, Assembler::AVX_512bit); \
1389
1390#define lastroundEncode(key, dst1, dst2, dst3, dst4) \
1391vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \
1392vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \
1393vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \
1394vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \
1395
1396#define storeData(dst, position, src1, src2, src3, src4) \
1397evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \
1398evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \
1399evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \
1400evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \
1401
1402#define loadData(src, position, dst1, dst2, dst3, dst4) \
1403evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \
1404evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \
1405evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \
1406evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \
1407
1408#define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey) \
1409evpclmulqdq(dst00, ghdata, hkey, 0x00, Assembler::AVX_512bit); \
1410evpclmulqdq(dst01, ghdata, hkey, 0x01, Assembler::AVX_512bit); \
1411evpclmulqdq(dst10, ghdata, hkey, 0x10, Assembler::AVX_512bit); \
1412evpclmulqdq(dst11, ghdata, hkey, 0x11, Assembler::AVX_512bit); \
1413
1414#define shuffleExorRnd1Key(dst0, dst1, dst2, dst3, shufmask, rndkey) \
1415vpshufb(dst0, dst0, shufmask, Assembler::AVX_512bit); \
1416evpxorq(dst0, dst0, rndkey, Assembler::AVX_512bit); \
1417vpshufb(dst1, dst1, shufmask, Assembler::AVX_512bit); \
1418evpxorq(dst1, dst1, rndkey, Assembler::AVX_512bit); \
1419vpshufb(dst2, dst2, shufmask, Assembler::AVX_512bit); \
1420evpxorq(dst2, dst2, rndkey, Assembler::AVX_512bit); \
1421vpshufb(dst3, dst3, shufmask, Assembler::AVX_512bit); \
1422evpxorq(dst3, dst3, rndkey, Assembler::AVX_512bit); \
1423
1424#define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
1425evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \
1426evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \
1427evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \
1428evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \
1429
1430#define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
1431vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \
1432vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \
1433vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \
1434vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \
1435
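Several of these helpers lean on vpternlogq with immediate 0x96, which is a three-way XOR: per bit position, the instruction looks up imm8[(a << 2) | (b << 1) | c], and 0x96 = 0b10010110 is exactly the truth table of a ^ b ^ c. A scalar model (illustrative only):

#include <cstdint>

// Bitwise ternary logic: for each bit position, index the 8-bit truth
// table with the three operand bits (destination operand is the high bit).
static uint64_t ternlog(uint64_t a, uint64_t b, uint64_t c, uint8_t imm) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    int idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
    r |= uint64_t((imm >> idx) & 1) << i;
  }
  return r;
}
// ternlog(a, b, c, 0x96) == (a ^ b ^ c), which is why xorGHASH folds two
// XORs per destination register into a single instruction.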
1436void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx,
1437 Register in, Register out, Register data, Register pos, bool first_time_reduction, XMMRegister addmask, bool ghash_input, Register rounds,
1438 Register ghash_pos, bool final_reduction, int i, XMMRegister counter_inc_mask) {
1439
1440 Label AES_192, AES_256, LAST_AES_RND;
1441 const XMMRegister ZTMP0 = xmm0;
1442 const XMMRegister ZTMP1 = xmm3;
1443 const XMMRegister ZTMP2 = xmm4;
1444 const XMMRegister ZTMP3 = xmm5;
1445 const XMMRegister ZTMP5 = xmm7;
1446 const XMMRegister ZTMP6 = xmm10;
1447 const XMMRegister ZTMP7 = xmm11;
1448 const XMMRegister ZTMP8 = xmm12;
1449 const XMMRegister ZTMP9 = xmm13;
1450 const XMMRegister ZTMP10 = xmm15;
1451 const XMMRegister ZTMP11 = xmm16;
1452 const XMMRegister ZTMP12 = xmm17;
1453
1454 const XMMRegister ZTMP13 = xmm19;
1455 const XMMRegister ZTMP14 = xmm20;
1456 const XMMRegister ZTMP15 = xmm21;
1457 const XMMRegister ZTMP16 = xmm30;
1458 const XMMRegister ZTMP17 = xmm31;
1459 const XMMRegister ZTMP18 = xmm1;
1460 const XMMRegister ZTMP19 = xmm2;
1461 const XMMRegister ZTMP20 = xmm8;
1462 const XMMRegister ZTMP21 = xmm22;
1463 const XMMRegister ZTMP22 = xmm23;
1464
1465 // Pre increment counters
1466 vpaddd(ZTMP0, ctr_blockx, counter_inc_mask, Assembler::AVX_512bit);
1467 vpaddd(ZTMP1, ZTMP0, counter_inc_mask, Assembler::AVX_512bit);
1468 vpaddd(ZTMP2, ZTMP1, counter_inc_mask, Assembler::AVX_512bit);
1469 vpaddd(ZTMP3, ZTMP2, counter_inc_mask, Assembler::AVX_512bit);
1470 // Save counter value
1471 evmovdquq(ctr_blockx, ZTMP3, Assembler::AVX_512bit);
1472
1473 // Reuse ZTMP17 / ZTMP18 for loading AES Keys
1474 // Pre-load AES round keys
1475 ev_load_key(ZTMP17, key, 0, xmm29);
1476 ev_load_key(ZTMP18, key, 1 * 16, xmm29);
1477
1478 // ZTMP19 & ZTMP20 used for loading hash key
1479 // Pre-load hash key
1480 evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit);
1481 evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1482 // Load data for computing ghash
1483 evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1484 vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
1485
1486 // Xor cipher block 0 with input ghash, if available
1487 if (ghash_input) {
1488 evpxorq(ZTMP21, ZTMP21, aad_hashx, Assembler::AVX_512bit);
1489 }
1490 // Load data for computing ghash
1491 evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1492 vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit);
1493
1494 // stitch AES rounds with GHASH
1495 // AES round 0, xmm24 has shuffle mask
1496 shuffleExorRnd1Key(ZTMP0, ZTMP1, ZTMP2, ZTMP3, xmm24, ZTMP17);
1497 // Reuse ZTMP17 / ZTMP18 for loading remaining AES Keys
1498 ev_load_key(ZTMP17, key, 2 * 16, xmm29);
1499 // GHASH 4 blocks
1500 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19);
1501 // Load the next hkey and Ghash data
1502 evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1503 evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
1504 vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
1505
1506 // AES round 1
1507 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1508 ev_load_key(ZTMP18, key, 3 * 16, xmm29);
1509
1510 // GHASH 4 blocks(11 to 8)
1511 carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
1512 // Load the next hkey and GDATA
1513 evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
1514 evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
1515 vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit);
1516
1517 // AES round 2
1518 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1519 ev_load_key(ZTMP17, key, 4 * 16, xmm29);
1520
1521 // GHASH 4 blocks(7 to 4)
1522 carrylessMultiply(ZTMP14, ZTMP16, ZTMP15, ZTMP13, ZTMP21, ZTMP19);
1523 // AES rounds 3
1524 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1525 ev_load_key(ZTMP18, key, 5 * 16, xmm29);
1526
1527 // Gather(XOR) GHASH for 12 blocks
1528 xorGHASH(ZTMP5, ZTMP6, ZTMP8, ZTMP7, ZTMP9, ZTMP13, ZTMP10, ZTMP14, ZTMP12, ZTMP16, ZTMP11, ZTMP15);
1529
1530 // AES rounds 4
1531 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1532 ev_load_key(ZTMP17, key, 6 * 16, xmm29);
1533
1534 // load plain / cipher text(recycle registers)
1535 loadData(in, pos, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
1536
1537 // AES rounds 5
1538 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1539 ev_load_key(ZTMP18, key, 7 * 16, xmm29);
1540 // GHASH 4 blocks(3 to 0)
1541 carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
1542
1543 // AES round 6
1544 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1545 ev_load_key(ZTMP17, key, 8 * 16, xmm29);
1546
1547 // gather GHASH in ZTMP6(low) and ZTMP5(high)
1548 if (first_time_reduction) {
1549 vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit);
1550 evpxorq(xmm25, ZTMP7, ZTMP11, Assembler::AVX_512bit);
1551 evpxorq(xmm27, ZTMP5, ZTMP9, Assembler::AVX_512bit);
1552 evpxorq(xmm26, ZTMP6, ZTMP10, Assembler::AVX_512bit);
1553 }
1554 else if (!first_time_reduction && !final_reduction) {
1555 xorGHASH(ZTMP7, xmm25, xmm27, xmm26, ZTMP8, ZTMP12, ZTMP7, ZTMP11, ZTMP5, ZTMP9, ZTMP6, ZTMP10);
1556 }
1557
1558 if (final_reduction) {
1559 // Phase one: Add mid products together
1560 // Also load polynomial constant for reduction
1561 vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit);
1562 vpternlogq(ZTMP7, 0x96, xmm25, ZTMP11, Assembler::AVX_512bit);
1563 vpsrldq(ZTMP11, ZTMP7, 8, Assembler::AVX_512bit);
1564 vpslldq(ZTMP7, ZTMP7, 8, Assembler::AVX_512bit);
1565 evmovdquq(ZTMP12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx);
1566 }
1567 // AES round 7
1568 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1569 ev_load_key(ZTMP18, key, 9 * 16, xmm29);
1570 if (final_reduction) {
1571 vpternlogq(ZTMP5, 0x96, ZTMP9, ZTMP11, Assembler::AVX_512bit);
1572 evpxorq(ZTMP5, ZTMP5, xmm27, Assembler::AVX_512bit);
1573 vpternlogq(ZTMP6, 0x96, ZTMP10, ZTMP7, Assembler::AVX_512bit);
1574 evpxorq(ZTMP6, ZTMP6, xmm26, Assembler::AVX_512bit);
1575 }
1576 // AES round 8
1577 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1578 ev_load_key(ZTMP17, key, 10 * 16, xmm29);
1579
1580 // Horizontal xor of low and high 4*128
1581 if (final_reduction) {
1582 vhpxori4x128(ZTMP5, ZTMP9);
1583 vhpxori4x128(ZTMP6, ZTMP10);
1584 }
1585 // AES round 9
1586 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1587 // First phase of reduction
1588 if (final_reduction) {
1589 evpclmulqdq(ZTMP10, ZTMP12, ZTMP6, 0x01, Assembler::AVX_128bit);
1590 vpslldq(ZTMP10, ZTMP10, 8, Assembler::AVX_128bit);
1591 evpxorq(ZTMP10, ZTMP6, ZTMP10, Assembler::AVX_128bit);
1592 }
1593 cmpl(rounds, 52);
1594 jcc(Assembler::greaterEqual, AES_192);
1595 jmp(LAST_AES_RND);
1596 // AES rounds up to 11 (AES192) or 13 (AES256)
1597 bind(AES_192);
1598 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1599 ev_load_key(ZTMP18, key, 11 * 16, xmm29);
1600 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1601 ev_load_key(ZTMP17, key, 12 * 16, xmm29);
1602 cmpl(rounds, 60);
1603 jcc(Assembler::aboveEqual, AES_256);
1604 jmp(LAST_AES_RND);
1605
1606 bind(AES_256);
1607 roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1608 ev_load_key(ZTMP18, key, 13 * 16, xmm29);
1609 roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1610 ev_load_key(ZTMP17, key, 14 * 16, xmm29);
1611
1612 bind(LAST_AES_RND);
1613 // Second phase of reduction
1614 if (final_reduction) {
1615 evpclmulqdq(ZTMP9, ZTMP12, ZTMP10, 0x00, Assembler::AVX_128bit);
1616 vpsrldq(ZTMP9, ZTMP9, 4, Assembler::AVX_128bit); // Shift-R 1-DW to obtain 2-DWs shift-R
1617 evpclmulqdq(ZTMP11, ZTMP12, ZTMP10, 0x10, Assembler::AVX_128bit);
1618 vpslldq(ZTMP11, ZTMP11, 4, Assembler::AVX_128bit); // Shift-L 1-DW for result
1619 // ZTMP5 = ZTMP5 X ZTMP11 X ZTMP9
1620 vpternlogq(ZTMP5, 0x96, ZTMP11, ZTMP9, Assembler::AVX_128bit);
1621 }
1622 // Last AES round
1623 lastroundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1624 // XOR against plain / cipher text
1625 xorBeforeStore(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
1626 // store cipher / plain text
1627 storeData(out, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1628}
1629
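ghash16_encrypt16_parallel advances the counter with vpaddd and a per-lane increment mask, while stitching one GHASH multiply between every pair of AES rounds so the vaesenc and vpclmulqdq streams overlap. A scalar model of one lane's counter update, assuming the usual AES-CTR convention of a 32-bit big-endian block counter in the last four bytes (illustrative, not HotSpot code):

#include <cstdint>

// Increment the 32-bit big-endian block counter of a 16-byte CTR block;
// vpaddd with the increment mask performs this per 128-bit lane once the
// shuffle mask has put the counter bytes into addable order.
static void ctr_add(uint8_t block[16], uint32_t step) {
  uint32_t c = (uint32_t(block[12]) << 24) | (uint32_t(block[13]) << 16) |
               (uint32_t(block[14]) << 8)  |  uint32_t(block[15]);
  c += step;
  block[12] = uint8_t(c >> 24);
  block[13] = uint8_t(c >> 16);
  block[14] = uint8_t(c >> 8);
  block[15] = uint8_t(c);
}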
1630void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
1631 Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) {
1632 Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32,
1633 AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16;
1634 const XMMRegister CTR_BLOCKx = xmm9;
1635 const XMMRegister AAD_HASHx = xmm14;
1636 const Register pos = rax;
1. 'pos' initialized to a null pointer value
1637 const Register rounds = r15;
1638 Register ghash_pos;
1639#ifndef _WIN64
1640 ghash_pos = r14;
1641#else
1642 ghash_pos = r11;
1643#endif // !_WIN64
1644 const XMMRegister ZTMP0 = xmm0;
1645 const XMMRegister ZTMP1 = xmm3;
1646 const XMMRegister ZTMP2 = xmm4;
1647 const XMMRegister ZTMP3 = xmm5;
1648 const XMMRegister ZTMP4 = xmm6;
1649 const XMMRegister ZTMP5 = xmm7;
1650 const XMMRegister ZTMP6 = xmm10;
1651 const XMMRegister ZTMP7 = xmm11;
1652 const XMMRegister ZTMP8 = xmm12;
1653 const XMMRegister ZTMP9 = xmm13;
1654 const XMMRegister ZTMP10 = xmm15;
1655 const XMMRegister ZTMP11 = xmm16;
1656 const XMMRegister ZTMP12 = xmm17;
1657 const XMMRegister ZTMP13 = xmm19;
1658 const XMMRegister ZTMP14 = xmm20;
1659 const XMMRegister ZTMP15 = xmm21;
1660 const XMMRegister ZTMP16 = xmm30;
1661 const XMMRegister COUNTER_INC_MASK = xmm18;
1662
1663 movl(pos, 0); // Total length processed
1664 // Min data size processed = 768 bytes
1665 cmpl(len, 768);
1666 jcc(Assembler::less, ENC_DEC_DONE);
1667
1668 // Generate 48 constants for htbl
1669 call(GENERATE_HTBL_48_BLKS, relocInfo::none);
1670 int index = 0; // Index for choosing subkeyHtbl entry
1671 movl(ghash_pos, 0); // Pointer for ghash read and store operations
1672
1673 // Move initial counter value and STATE value into variables
1674 movdqu(CTR_BLOCKx, Address(counter, 0));
1675 movdqu(AAD_HASHx, Address(state, 0));
1676 // Load lswap mask for ghash
1677 movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()), rbx);
1678 // Shuffle input state using lswap mask
1679 vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
1680
1681 // Compute #rounds for AES based on the length of the key array
1682 movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1683
1684 // Broadcast counter value to 512 bit register
1685 evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit);
1686 // Load counter shuffle mask
1687 evmovdquq(xmm24, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, rbx);
1688 // Shuffle counter
1689 vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit);
1690
1691 // Load mask for incrementing counter
1692 evmovdquq(COUNTER_INC_MASK, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, rbx);
1693 // Pre-increment counter
1694 vpaddd(ZTMP5, CTR_BLOCKx, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, rbx);
1695 vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit);
1696 vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit);
1697 vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit);
1698
1699 // Begin 32 blocks of AES processing
1700 bind(AES_32_BLOCKS);
1701 // Save incremented counter before overwriting it with AES data
1702 evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit);
1703
1704 // Move 256 bytes of data
1705 loadData(in, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
2. Passing null pointer value via 2nd parameter 'index'
3. Calling constructor for 'Address'
1706 // Load key shuffle mask
1707 movdqu(xmm29, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx);
1708 // Load 0th AES round key
1709 ev_load_key(ZTMP4, key, 0, xmm29);
1710 // AES-ROUND0, xmm24 has the shuffle mask
1711 shuffleExorRnd1Key(ZTMP5, ZTMP6, ZTMP7, ZTMP8, xmm24, ZTMP4);
1712
1713 for (int j = 1; j < 10; j++) {
1714 ev_load_key(ZTMP4, key, j * 16, xmm29);
1715 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1716 }
1717 ev_load_key(ZTMP4, key, 10 * 16, xmm29);
1718 // AES rounds up to 11 (AES192) or 13 (AES256)
1719 cmpl(rounds, 52);
1720 jcc(Assembler::greaterEqual, AES_192);
1721 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1722 jmp(STORE_CT);
1723
1724 bind(AES_192);
1725 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1726 ev_load_key(ZTMP4, key, 11 * 16, xmm29);
1727 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1728 cmpl(rounds, 60);
1729 jcc(Assembler::aboveEqual, AES_256);
1730 ev_load_key(ZTMP4, key, 12 * 16, xmm29);
1731 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1732 jmp(STORE_CT);
1733
1734 bind(AES_256);
1735 ev_load_key(ZTMP4, key, 12 * 16, xmm29);
1736 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1737 ev_load_key(ZTMP4, key, 13 * 16, xmm29);
1738 roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1739 ev_load_key(ZTMP4, key, 14 * 16, xmm29);
1740 // Last AES round
1741 lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1742
1743 bind(STORE_CT);
1744 // Xor the encrypted key with PT to obtain CT
1745 xorBeforeStore(ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
1746 storeData(out, pos, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
1747 // 16 blocks encryption completed
1748 addl(pos, 256);
1749 cmpl(pos, 512);
1750 jcc(Assembler::aboveEqual, GHASH_AES_PARALLEL);
1751 vpaddd(ZTMP5, CTR_BLOCKx, COUNTER_INC_MASK, Assembler::AVX_512bit);
1752 vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit);
1753 vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit);
1754 vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit);
1755 jmp(AES_32_BLOCKS);
1756
1757 bind(GHASH_AES_PARALLEL);
1758 // Ghash16_encrypt16_parallel takes place in the order with three reduction values:
1759 // 1) First time -> cipher xor input ghash
1760 // 2) No reduction -> accumulate multiplication values
1761 // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round
1762 // Reduction value = first time
1763 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1764 addl(pos, 256);
1765 addl(ghash_pos, 256);
1766 index += 4;
1767
1768 // At this point we have processed 768 bytes of AES and 256 bytes of GHASH.
1769 // If the remaining length is less than 768, process remaining 512 bytes of ghash in GHASH_LAST_32 code
1770 subl(len, 768);
1771 cmpl(len, 768);
1772 jcc(Assembler::less, GHASH_LAST_32);
1773
1774 // AES 16 blocks and GHASH 16 blocks in parallel
1775 // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times
1776 // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations
1777 // Each call uses 4 subkeyHtbl values, so increment the index by 4.
1778 bind(GHASH_16_AES_16);
1779 // Reduction value = no reduction
1780 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1781 addl(pos, 256);
1782 addl(ghash_pos, 256);
1783 index += 4;
1784 // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash
1785 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK);
1786 addl(pos, 256);
1787 addl(ghash_pos, 256);
1788 // Calculated ghash value needs to be moved to AAD_HASHx so that we can restart the ghash16-aes16 pipeline
1789 movdqu(AAD_HASHx, ZTMP5);
1790 index = 0; // Reset subkeyHtbl index
1791
1792 // Restart the pipeline
1793 // Reduction value = first time
1794 ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
1795 addl(pos, 256);
1796 addl(ghash_pos, 256);
1797 index += 4;
1798
1799 subl(len, 768);
1800 cmpl(len, 768);
1801 jcc(Assembler::greaterEqual, GHASH_16_AES_16);
1802
1803 // GHASH last 32 blocks processed here
1804 // GHASH products accumulated in ZMM27, ZMM25 and ZMM26 during the GHASH16-AES16 operation are used
1805 bind(GHASH_LAST_32);
1806 // Use rbx as a pointer to the htbl; For last 32 blocks of GHASH, use key# 4-11 entry in subkeyHtbl
1807 movl(rbx, 256);
1808 // Load cipher blocks
1809 evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1810 evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1811 vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
1812 vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
1813 // Load ghash keys
1814 evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1815 evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1816
1817 // Ghash blocks 0 - 3
1818 carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15);
1819 // Ghash blocks 4 - 7
1820 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP14, ZTMP16);
1821
1822 vpternlogq(ZTMP1, 0x96, ZTMP5, xmm27, Assembler::AVX_512bit); // ZTMP1 = ZTMP1 + ZTMP5 + zmm27
1823 vpternlogq(ZTMP2, 0x96, ZTMP6, xmm26, Assembler::AVX_512bit); // ZTMP2 = ZTMP2 + ZTMP6 + zmm26
1824 vpternlogq(ZTMP3, 0x96, ZTMP7, xmm25, Assembler::AVX_512bit); // ZTMP3 = ZTMP3 + ZTMP7 + zmm25
1825 evpxorq(ZTMP4, ZTMP4, ZTMP8, Assembler::AVX_512bit); // ZTMP4 = ZTMP4 + ZTMP8
1826
1827 addl(ghash_pos, 128);
1828 addl(rbx, 128);
1829
1830 // Ghash remaining blocks
1831 bind(LOOP);
1832 cmpl(ghash_pos, pos);
1833 jcc(Assembler::aboveEqual, ACCUMULATE);
1834 // Load next cipher blocks and corresponding ghash keys
1835 evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1836 evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1837 vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
1838 vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
1839 evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
1840 evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
1841
1842 // ghash blocks 0 - 3
1843 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15);
1844
1845 // ghash blocks 4 - 7
1846 carrylessMultiply(ZTMP10, ZTMP11, ZTMP12, ZTMP9, ZTMP14, ZTMP16);
1847
1848 // update sums
1849 // ZTMP1 = ZTMP1 + ZTMP5 + ZTMP9
1850 // ZTMP2 = ZTMP2 + ZTMP6 + ZTMP10
1851 // ZTMP3 = ZTMP3 + ZTMP7 xor ZTMP11
1852 // ZTMP4 = ZTMP4 + ZTMP8 xor ZTMP12
1853 xorGHASH(ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP9, ZTMP6, ZTMP10, ZTMP7, ZTMP11, ZTMP8, ZTMP12);
1854 addl(ghash_pos, 128);
1855 addl(rbx, 128);
1856 jmp(LOOP);
1857
1858 // Integrate ZTMP3/ZTMP4 into ZTMP1 and ZTMP2
1859 bind(ACCUMULATE);
1860 evpxorq(ZTMP3, ZTMP3, ZTMP4, Assembler::AVX_512bit);
1861 vpsrldq(ZTMP7, ZTMP3, 8, Assembler::AVX_512bit);
1862 vpslldq(ZTMP8, ZTMP3, 8, Assembler::AVX_512bit);
1863 evpxorq(ZTMP1, ZTMP1, ZTMP7, Assembler::AVX_512bit);
1864 evpxorq(ZTMP2, ZTMP2, ZTMP8, Assembler::AVX_512bit);
1865
1866 // Add ZTMP1 and ZTMP2 128-bit words horizontally
1867 vhpxori4x128(ZTMP1, ZTMP11);
1868 vhpxori4x128(ZTMP2, ZTMP12);
1869 // Load reduction polynomial and compute final reduction
1870 evmovdquq(ZTMP15, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx);
1871 vclmul_reduce(AAD_HASHx, ZTMP15, ZTMP1, ZTMP2, ZTMP3, ZTMP4);
1872
1873 // Pre-increment counter for next operation
1874 vpaddd(CTR_BLOCKx, CTR_BLOCKx, xmm18, Assembler::AVX_128bit);
1875 // Shuffle counter and save the updated value
1876 vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit);
1877 movdqu(Address(counter, 0), CTR_BLOCKx);
1878 // Load ghash lswap mask
1879 movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
1880 // Shuffle ghash using lbswap_mask and store it
1881 vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
1882 movdqu(Address(state, 0), AAD_HASHx);
1883 jmp(ENC_DEC_DONE);
1884
1885 bind(GENERATE_HTBL_48_BLKS);
1886 generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl);
1887
1888 bind(ENC_DEC_DONE);
1889 movq(rax, pos);
1890}
1891
1892#endif // _LP64
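The three numbered analyzer steps above trace the reported defect: 'pos' is the register constant rax, the analyzer treats its value as a null pointer, and the Address constructor invoked by loadData then appears to call a method on that null pointer (the warning reported at line 233 of assembler_x86.hpp). A minimal sketch of why the trace is plausible, assuming HotSpot's usual register encoding (the declarations below are illustrative, not copied from the source):

class RegisterImpl;               // methods use 'this' as a small integer
typedef RegisterImpl* Register;   // value and never dereference it

const Register rax = (Register)0; // register #0 encoded as a pointer

// To the analyzer, 'const Register pos = rax' initializes pos to a null
// pointer; passing it as the 'index' parameter of Address and calling any
// member function on it is then reported as "Called C++ object pointer is
// null", even though such calls never load through 'this'.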

/home/daniel/Projects/java/jdk/src/hotspot/cpu/x86/assembler_x86.hpp

1/*
2 * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#ifndef CPU_X86_ASSEMBLER_X86_HPP
26#define CPU_X86_ASSEMBLER_X86_HPP
27
28#include "asm/register.hpp"
29#include "utilities/powerOfTwo.hpp"
30
31// Contains all the definitions needed for x86 assembly code generation.
32
33// Calling convention
34class Argument {
35 public:
36 enum {
37#ifdef _LP64
38#ifdef _WIN64
39 n_int_register_parameters_c = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
40 n_float_register_parameters_c = 4, // xmm0 - xmm3 (c_farg0, c_farg1, ... )
41 n_int_register_returns_c = 1, // rax
42 n_float_register_returns_c = 1, // xmm0
43#else
44 n_int_register_parameters_c = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
45 n_float_register_parameters_c = 8, // xmm0 - xmm7 (c_farg0, c_farg1, ... )
46 n_int_register_returns_c = 2, // rax, rdx
47 n_float_register_returns_c = 2, // xmm0, xmm1
48#endif // _WIN64
49 n_int_register_parameters_j = 6, // j_rarg0, j_rarg1, ...
50 n_float_register_parameters_j = 8 // j_farg0, j_farg1, ...
51#else
52 n_register_parameters = 0 // 0 registers used to pass arguments
53#endif // _LP64
54 };
55};
56
57
58#ifdef _LP64
59// Symbolically name the register arguments used by the c calling convention.
60// Windows is different from linux/solaris. So much for standards...
61
62#ifdef _WIN64
63
64REGISTER_DECLARATION(Register, c_rarg0, rcx);
65REGISTER_DECLARATION(Register, c_rarg1, rdx);
66REGISTER_DECLARATION(Register, c_rarg2, r8);
67REGISTER_DECLARATION(Register, c_rarg3, r9);
68
69REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
70REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
71REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
72REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
73
74#else
75
76REGISTER_DECLARATION(Register, c_rarg0, rdi);
77REGISTER_DECLARATION(Register, c_rarg1, rsi);
78REGISTER_DECLARATION(Register, c_rarg2, rdx);
79REGISTER_DECLARATION(Register, c_rarg3, rcx);
80REGISTER_DECLARATION(Register, c_rarg4, r8);
81REGISTER_DECLARATION(Register, c_rarg5, r9);
82
83REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
84REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
85REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
86REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
87REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
88REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
89REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
90REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);
91
92#endif // _WIN64
93
94// Symbolically name the register arguments used by the Java calling convention.
95// We have control over the convention for java so we can do what we please.
96// What pleases us is to offset the java calling convention so that when
97// we call a suitable jni method the arguments are lined up and we don't
98// have to do little shuffling. A suitable jni method is non-static and
99// takes a small number of arguments (two fewer args on windows)
100//
101// |-------------------------------------------------------|
102// | c_rarg0 c_rarg1 c_rarg2 c_rarg3 c_rarg4 c_rarg5 |
103// |-------------------------------------------------------|
104// | rcx rdx r8 r9 rdi* rsi* | windows (* not a c_rarg)
105// | rdi rsi rdx rcx r8 r9 | solaris/linux
106// |-------------------------------------------------------|
107// | j_rarg5 j_rarg0 j_rarg1 j_rarg2 j_rarg3 j_rarg4 |
108// |-------------------------------------------------------|
109
110REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
111REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
112REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
113// Windows runs out of register args here
114#ifdef _WIN64
115REGISTER_DECLARATION(Register, j_rarg3, rdi);
116REGISTER_DECLARATION(Register, j_rarg4, rsi);
117#else
118REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
119REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
120#endif /* _WIN64 */
121REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);
122
123REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
124REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
125REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
126REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
127REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
128REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
129REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
130REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);
131
132REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
133REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile
134
135REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
136REGISTER_DECLARATION(Register, r15_thread, r15);   // callee-saved
137
138#else
139// rscratch1 will appear in 32bit code that is dead but of course must compile
140// Using noreg ensures if the dead code is incorrectly live and executed it
141// will cause an assertion failure
142#define rscratch1 noreg
143#define rscratch2 noreg
144
145#endif // _LP64
146
147// JSR 292
148// On x86, the SP does not have to be saved when invoking method handle intrinsics
149// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg.
150REGISTER_DECLARATION(Register, rbp_mh_SP_save, noreg);
151
152// Address is an abstraction used to represent a memory location
153// using any of the amd64 addressing modes with one object.
154//
155// Note: A register location is represented via a Register, not
156// via an address for efficiency & simplicity reasons.
157
158class ArrayAddress;
159
160class Address {
161 public:
162 enum ScaleFactor {
163 no_scale = -1,
164 times_1 = 0,
165 times_2 = 1,
166 times_4 = 2,
167 times_8 = 3,
168 times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
169 };
170 static ScaleFactor times(int size) {
171 assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
172 if (size == 8) return times_8;
173 if (size == 4) return times_4;
174 if (size == 2) return times_2;
175 return times_1;
176 }
177 static int scale_size(ScaleFactor scale) {
178    assert(scale != no_scale, "");
179    assert(((1 << (int)times_1) == 1 &&
180            (1 << (int)times_2) == 2 &&
181            (1 << (int)times_4) == 4 &&
182            (1 << (int)times_8) == 8), "");
183 return (1 << (int)scale);
184 }
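A quick sanity sketch of how times() and scale_size() relate (assuming a 4-byte jint, as elsewhere in HotSpot):

    Address::ScaleFactor s = Address::times(sizeof(jint));  // times_4
    int bytes = Address::scale_size(s);                     // back to 4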
185
186 private:
187 Register _base;
188 Register _index;
189 XMMRegister _xmmindex;
190 ScaleFactor _scale;
191 int _disp;
192 bool _isxmmindex;
193 RelocationHolder _rspec;
194
195 // Easily misused constructors make them private
196 // %%% can we make these go away?
197 NOT_LP64(Address(address loc, RelocationHolder spec);)
198 Address(int disp, address loc, relocInfo::relocType rtype);
199 Address(int disp, address loc, RelocationHolder spec);
200
201 public:
202
203 int disp() { return _disp; }
204 // creation
205 Address()
206 : _base(noreg),
207 _index(noreg),
208 _xmmindex(xnoreg),
209 _scale(no_scale),
210 _disp(0),
211 _isxmmindex(false){
212 }
213
214 // No default displacement otherwise Register can be implicitly
215 // converted to 0(Register) which is quite a different animal.
216
217 Address(Register base, int disp)
218 : _base(base),
219 _index(noreg),
220 _xmmindex(xnoreg),
221 _scale(no_scale),
222 _disp(disp),
223 _isxmmindex(false){
224 }
225
226 Address(Register base, Register index, ScaleFactor scale, int disp = 0)
227 : _base (base),
228 _index(index),
229 _xmmindex(xnoreg),
230 _scale(scale),
231 _disp (disp),
232 _isxmmindex(false) {
233     assert(!index->is_valid() == (scale == Address::no_scale),
  4: Called C++ object pointer is null
234            "inconsistent address");
235 }
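A note on the analyzer event above: along the modeled path, the Register handle passed in as index is null by the time this constructor runs, so index->is_valid() becomes a method call through a null object pointer. Register is a pointer-typed handle in this code, which is why the checker phrases it as a called C++ object pointer being null rather than an ordinary null dereference; a caller-side guard (or asserting index != NULL before the call) would likely silence it.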
236
237 Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
238 : _base (base),
239 _index(index.register_or_noreg()),
240 _xmmindex(xnoreg),
241 _scale(scale),
242 _disp (disp + (index.constant_or_zero() * scale_size(scale))),
243 _isxmmindex(false){
244 if (!index.is_register()) scale = Address::no_scale;
245     assert(!_index->is_valid() == (scale == Address::no_scale),
246            "inconsistent address");
247 }
248
249 Address(Register base, XMMRegister index, ScaleFactor scale, int disp = 0)
250 : _base (base),
251 _index(noreg),
252 _xmmindex(index),
253 _scale(scale),
254 _disp(disp),
255 _isxmmindex(true) {
256     assert(!index->is_valid() == (scale == Address::no_scale),
257            "inconsistent address");
258 }
259
260 // The following overloads are used in connection with the
261 // ByteSize type (see sizes.hpp). They simplify the use of
262 // ByteSize'd arguments in assembly code.
263
264 Address(Register base, ByteSize disp)
265 : Address(base, in_bytes(disp)) {}
266
267 Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
268 : Address(base, index, scale, in_bytes(disp)) {}
269
270 Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
271 : Address(base, index, scale, in_bytes(disp)) {}
272
273 Address plus_disp(int disp) const {
274 Address a = (*this);
275 a._disp += disp;
276 return a;
277 }
278 Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
279 Address a = (*this);
280 a._disp += disp.constant_or_zero() * scale_size(scale);
281 if (disp.is_register()) {
282       assert(!a.index()->is_valid(), "competing indexes");
283 a._index = disp.as_register();
284 a._scale = scale;
285 }
286 return a;
287 }
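A minimal usage sketch for plus_disp (names are placeholders):

    Address slot(rbx, 0);
    Address next = slot.plus_disp(16);   // same base/index/scale, displacement now 16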
288 bool is_same_address(Address a) const {
289 // disregard _rspec
290 return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
291 }
292
293 // accessors
294 bool uses(Register reg) const { return _base == reg || _index == reg; }
295 Register base() const { return _base; }
296 Register index() const { return _index; }
297 XMMRegister xmmindex() const { return _xmmindex; }
298 ScaleFactor scale() const { return _scale; }
299 int disp() const { return _disp; }
300 bool isxmmindex() const { return _isxmmindex; }
301
302 // Convert the raw encoding form into the form expected by the constructor for
303 // Address. An index of 4 (rsp) corresponds to having no index, so convert
304 // that to noreg for the Address constructor.
305 static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);
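For illustration, a hedged sketch of that convention (raw field values as a decoder might supply them; relocInfo::none assumed for a plain displacement):

    // index encoding 4 (rsp) means "no index" in the SIB byte
    Address a = Address::make_raw(rbx->encoding(), 4, 0, 16, relocInfo::none);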
306
307 static Address make_array(ArrayAddress);
308
309 private:
310 bool base_needs_rex() const {
311 return _base->is_valid() && _base->encoding() >= 8;
312 }
313
314 bool index_needs_rex() const {
315    return _index->is_valid() && _index->encoding() >= 8;
316 }
317
318 bool xmmindex_needs_rex() const {
319 return _xmmindex->is_valid() && _xmmindex->encoding() >= 8;
320 }
321
322 relocInfo::relocType reloc() const { return _rspec.type(); }
323
324 friend class Assembler;
325 friend class MacroAssembler;
326 friend class LIR_Assembler; // base/index/scale/disp
327};
328
329//
330// AddressLiteral has been split out from Address because operands of this type
331// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out,
332// the few instructions that need to deal with address literals are unique, and the
333// MacroAssembler does not have to implement every instruction in the Assembler
334// in order to search for address literals that may need special handling depending
335// on the instruction and the platform. A small step on the way to merging the
336// i486/amd64 directories.
337//
338class AddressLiteral {
339 friend class ArrayAddress;
340 RelocationHolder _rspec;
341  // Typically when we use AddressLiterals we want to use their rval.
342  // However, in some situations we want the lval (effective address) of the item.
343 // We provide a special factory for making those lvals.
344 bool _is_lval;
345
346 // If the target is far we'll need to load the ea of this to
347 // a register to reach it. Otherwise if near we can do rip
348 // relative addressing.
349
350 address _target;
351
352 protected:
353 // creation
354 AddressLiteral()
355 : _is_lval(false),
356    _target(NULL)
357 {}
358
359 public:
360
361
362 AddressLiteral(address target, relocInfo::relocType rtype);
363
364 AddressLiteral(address target, RelocationHolder const& rspec)
365 : _rspec(rspec),
366 _is_lval(false),
367 _target(target)
368 {}
369
370 AddressLiteral addr() {
371 AddressLiteral ret = *this;
372 ret._is_lval = true;
373 return ret;
374 }
375
376
377 private:
378
379 address target() { return _target; }
380 bool is_lval() { return _is_lval; }
381
382 relocInfo::relocType reloc() const { return _rspec.type(); }
383 const RelocationHolder& rspec() const { return _rspec; }
384
385 friend class Assembler;
386 friend class MacroAssembler;
387 friend class Address;
388 friend class LIR_Assembler;
389};
390
391// Convenience classes
392class RuntimeAddress: public AddressLiteral {
393
394 public:
395
396 RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}
397
398};
399
400class ExternalAddress: public AddressLiteral {
401 private:
402 static relocInfo::relocType reloc_for_target(address target) {
403 // Sometimes ExternalAddress is used for values which aren't
404 // exactly addresses, like the card table base.
405 // external_word_type can't be used for values in the first page
406 // so just skip the reloc in that case.
407 return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
408 }
409
410 public:
411
412 ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}
413
414};
415
416class InternalAddress: public AddressLiteral {
417
418 public:
419
420 InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}
421
422};
423
424// x86 can do array addressing as a single operation since disp can be an absolute
425// address; amd64 can't. We create a class that expresses the concept but does extra
426// magic on amd64 to get the final result.
427
428class ArrayAddress {
429 private:
430
431 AddressLiteral _base;
432 Address _index;
433
434 public:
435
436 ArrayAddress() {};
437 ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
438 AddressLiteral base() { return _base; }
439 Address index() { return _index; }
440
441};
442
443class InstructionAttr;
444
445// 64-bit: reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
446// See fxsave and xsave(EVEX enabled) documentation for layout
447const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
448
449// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
450// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
451// is what you get. The Assembler is generating code into a CodeBuffer.
452
453class Assembler : public AbstractAssembler {
454 friend class AbstractAssembler; // for the non-virtual hack
455 friend class LIR_Assembler; // as_Address()
456 friend class StubGenerator;
457
458 public:
459 enum Condition { // The x86 condition codes used for conditional jumps/moves.
460 zero = 0x4,
461 notZero = 0x5,
462 equal = 0x4,
463 notEqual = 0x5,
464 less = 0xc,
465 lessEqual = 0xe,
466 greater = 0xf,
467 greaterEqual = 0xd,
468 below = 0x2,
469 belowEqual = 0x6,
470 above = 0x7,
471 aboveEqual = 0x3,
472 overflow = 0x0,
473 noOverflow = 0x1,
474 carrySet = 0x2,
475 carryClear = 0x3,
476 negative = 0x8,
477 positive = 0x9,
478 parity = 0xa,
479 noParity = 0xb
480 };
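For illustration, a minimal sketch of how these pair with the jcc family declared further down (Assembler/MacroAssembler-level code; L_eq/L_neg/L_small are placeholder labels bound elsewhere):

    cmpl(rax, rbx);
    jcc(Assembler::equal, L_eq);      // ZF == 1
    jcc(Assembler::less,  L_neg);     // signed less-than
    jcc(Assembler::below, L_small);   // unsigned less-than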
481
482 enum Prefix {
483 // segment overrides
484 CS_segment = 0x2e,
485 SS_segment = 0x36,
486 DS_segment = 0x3e,
487 ES_segment = 0x26,
488 FS_segment = 0x64,
489 GS_segment = 0x65,
490
491 REX = 0x40,
492
493 REX_B = 0x41,
494 REX_X = 0x42,
495 REX_XB = 0x43,
496 REX_R = 0x44,
497 REX_RB = 0x45,
498 REX_RX = 0x46,
499 REX_RXB = 0x47,
500
501 REX_W = 0x48,
502
503 REX_WB = 0x49,
504 REX_WX = 0x4A,
505 REX_WXB = 0x4B,
506 REX_WR = 0x4C,
507 REX_WRB = 0x4D,
508 REX_WRX = 0x4E,
509 REX_WRXB = 0x4F,
510
511 VEX_3bytes = 0xC4,
512 VEX_2bytes = 0xC5,
513 EVEX_4bytes = 0x62,
514 Prefix_EMPTY = 0x0
515 };
516
517 enum VexPrefix {
518 VEX_B = 0x20,
519 VEX_X = 0x40,
520 VEX_R = 0x80,
521 VEX_W = 0x80
522 };
523
524 enum ExexPrefix {
525 EVEX_F = 0x04,
526 EVEX_V = 0x08,
527 EVEX_Rb = 0x10,
528 EVEX_X = 0x40,
529 EVEX_Z = 0x80
530 };
531
532 enum VexSimdPrefix {
533 VEX_SIMD_NONE = 0x0,
534 VEX_SIMD_66 = 0x1,
535 VEX_SIMD_F3 = 0x2,
536 VEX_SIMD_F2 = 0x3
537 };
538
539 enum VexOpcode {
540 VEX_OPCODE_NONE = 0x0,
541 VEX_OPCODE_0F = 0x1,
542 VEX_OPCODE_0F_38 = 0x2,
543 VEX_OPCODE_0F_3A = 0x3,
544 VEX_OPCODE_MASK = 0x1F
545 };
546
547 enum AvxVectorLen {
548 AVX_128bit = 0x0,
549 AVX_256bit = 0x1,
550 AVX_512bit = 0x2,
551 AVX_NoVec = 0x4
552 };
553
554 enum EvexTupleType {
555 EVEX_FV = 0,
556 EVEX_HV = 4,
557 EVEX_FVM = 6,
558 EVEX_T1S = 7,
559 EVEX_T1F = 11,
560 EVEX_T2 = 13,
561 EVEX_T4 = 15,
562 EVEX_T8 = 17,
563 EVEX_HVM = 18,
564 EVEX_QVM = 19,
565 EVEX_OVM = 20,
566 EVEX_M128 = 21,
567 EVEX_DUP = 22,
568 EVEX_ETUP = 23
569 };
570
571 enum EvexInputSizeInBits {
572 EVEX_8bit = 0,
573 EVEX_16bit = 1,
574 EVEX_32bit = 2,
575 EVEX_64bit = 3,
576 EVEX_NObit = 4
577 };
578
579 enum WhichOperand {
580 // input to locate_operand, and format code for relocations
581 imm_operand = 0, // embedded 32-bit|64-bit immediate operand
582 disp32_operand = 1, // embedded 32-bit displacement or address
583 call32_operand = 2, // embedded 32-bit self-relative displacement
584#ifndef _LP64
585 _WhichOperand_limit = 3
586#else
587 narrow_oop_operand = 3, // embedded 32-bit immediate narrow oop
588 _WhichOperand_limit = 4
589#endif
590 };
591
592 // Comparison predicates for integral types & FP types when using SSE
593 enum ComparisonPredicate {
594 eq = 0,
595 lt = 1,
596 le = 2,
597 _false = 3,
598 neq = 4,
599 nlt = 5,
600 nle = 6,
601 _true = 7
602 };
603
604 // Comparison predicates for FP types when using AVX
605 // O means ordered. U is unordered. When using ordered, any NaN comparison is false. Otherwise, it is true.
606 // S means signaling. Q means non-signaling. When signaling is true, instruction signals #IA on NaN.
607 enum ComparisonPredicateFP {
608 EQ_OQ = 0,
609 LT_OS = 1,
610 LE_OS = 2,
611 UNORD_Q = 3,
612 NEQ_UQ = 4,
613 NLT_US = 5,
614 NLE_US = 6,
615 ORD_Q = 7,
616 EQ_UQ = 8,
617 NGE_US = 9,
618 NGT_US = 0xA,
619    FALSE_OQ = 0xB,
620 NEQ_OQ = 0xC,
621 GE_OS = 0xD,
622 GT_OS = 0xE,
623 TRUE_UQ = 0xF,
624 EQ_OS = 0x10,
625 LT_OQ = 0x11,
626 LE_OQ = 0x12,
627 UNORD_S = 0x13,
628 NEQ_US = 0x14,
629 NLT_UQ = 0x15,
630 NLE_UQ = 0x16,
631 ORD_S = 0x17,
632 EQ_US = 0x18,
633 NGE_UQ = 0x19,
634 NGT_UQ = 0x1A,
635 FALSE_OS = 0x1B,
636 NEQ_OS = 0x1C,
637 GE_OQ = 0x1D,
638 GT_OQ = 0x1E,
639    TRUE_US = 0x1F
640 };
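Reading the suffixes with the key above, for example: EQ_OQ is ordered and quiet (a NaN operand compares false and does not signal #IA), EQ_UQ is unordered and quiet (a NaN operand compares true), and LT_OS is ordered and signaling (false on NaN, and signals #IA).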
641
642 enum Width {
643 B = 0,
644 W = 1,
645 D = 2,
646 Q = 3
647 };
648
649 //---< calculate length of instruction >---
650 // As instruction size can't be found out easily on x86/x64,
651 // we just use '4' for len and maxlen.
652 // instruction must start at passed address
653 static unsigned int instr_len(unsigned char *instr) { return 4; }
654
655 //---< longest instructions >---
656 // Max instruction length is not specified in architecture documentation.
657 // We could use a "safe enough" estimate (15), but just default to
658  // the instruction length guess from above.
659 static unsigned int instr_maxlen() { return 4; }
660
661  // NOTE: The general philosophy of the declarations here is that 64bit versions
662  // of instructions are freely declared without the need for wrapping them in an ifdef.
663  // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
664 // In the .cpp file the implementations are wrapped so that they are dropped out
665 // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
666 // to the size it was prior to merging up the 32bit and 64bit assemblers.
667 //
668 // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
669 // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
670
671private:
672
673 bool _legacy_mode_bw;
674 bool _legacy_mode_dq;
675 bool _legacy_mode_vl;
676 bool _legacy_mode_vlbw;
677 NOT_LP64(bool _is_managed;)
678
679 class InstructionAttr *_attributes;
680
681 // 64bit prefixes
682 void prefix(Register reg);
683 void prefix(Register dst, Register src, Prefix p);
684 void prefix(Register dst, Address adr, Prefix p);
685
686 void prefix(Address adr);
687 void prefix(Address adr, Register reg, bool byteinst = false);
688 void prefix(Address adr, XMMRegister reg);
689
690 int prefix_and_encode(int reg_enc, bool byteinst = false);
691 int prefix_and_encode(int dst_enc, int src_enc) {
692 return prefix_and_encode(dst_enc, false, src_enc, false);
693 }
694 int prefix_and_encode(int dst_enc, bool dst_is_byte, int src_enc, bool src_is_byte);
695
696 // Some prefixq variants always emit exactly one prefix byte, so besides a
697 // prefix-emitting method we provide a method to get the prefix byte to emit,
698 // which can then be folded into a byte stream.
699 int8_t get_prefixq(Address adr);
700 int8_t get_prefixq(Address adr, Register reg);
701
702 void prefixq(Address adr);
703 void prefixq(Address adr, Register reg);
704 void prefixq(Address adr, XMMRegister reg);
705
706 int prefixq_and_encode(int reg_enc);
707 int prefixq_and_encode(int dst_enc, int src_enc);
708
709 void rex_prefix(Address adr, XMMRegister xreg,
710 VexSimdPrefix pre, VexOpcode opc, bool rex_w);
711 int rex_prefix_and_encode(int dst_enc, int src_enc,
712 VexSimdPrefix pre, VexOpcode opc, bool rex_w);
713
714 void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);
715
716 void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, bool evex_v,
717 int nds_enc, VexSimdPrefix pre, VexOpcode opc);
718
719 void vex_prefix(Address adr, int nds_enc, int xreg_enc,
720 VexSimdPrefix pre, VexOpcode opc,
721 InstructionAttr *attributes);
722
723 int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
724 VexSimdPrefix pre, VexOpcode opc,
725 InstructionAttr *attributes);
726
727 void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
728 VexOpcode opc, InstructionAttr *attributes);
729
730 int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
731 VexOpcode opc, InstructionAttr *attributes);
732
733 // Helper functions for groups of instructions
734 void emit_arith_b(int op1, int op2, Register dst, int imm8);
735
736 void emit_arith(int op1, int op2, Register dst, int32_t imm32);
737 // Force generation of a 4 byte immediate value even if it fits into 8bit
738 void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
739 void emit_arith(int op1, int op2, Register dst, Register src);
740
741 bool emit_compressed_disp_byte(int &disp);
742
743 void emit_modrm(int mod, int dst_enc, int src_enc);
744 void emit_modrm_disp8(int mod, int dst_enc, int src_enc,
745 int disp);
746 void emit_modrm_sib(int mod, int dst_enc, int src_enc,
747 Address::ScaleFactor scale, int index_enc, int base_enc);
748 void emit_modrm_sib_disp8(int mod, int dst_enc, int src_enc,
749 Address::ScaleFactor scale, int index_enc, int base_enc,
750 int disp);
751
752 void emit_operand_helper(int reg_enc,
753 int base_enc, int index_enc, Address::ScaleFactor scale,
754 int disp,
755 RelocationHolder const& rspec,
756 int rip_relative_correction = 0);
757
758 void emit_operand(Register reg,
759 Register base, Register index, Address::ScaleFactor scale,
760 int disp,
761 RelocationHolder const& rspec,
762 int rip_relative_correction = 0);
763
764 void emit_operand(Register reg,
765 Register base, XMMRegister index, Address::ScaleFactor scale,
766 int disp,
767 RelocationHolder const& rspec);
768
769 void emit_operand(XMMRegister xreg,
770 Register base, XMMRegister xindex, Address::ScaleFactor scale,
771 int disp,
772 RelocationHolder const& rspec);
773
774 void emit_operand(Register reg, Address adr,
775 int rip_relative_correction = 0);
776
777 void emit_operand(XMMRegister reg,
778 Register base, Register index, Address::ScaleFactor scale,
779 int disp,
780 RelocationHolder const& rspec);
781
782 void emit_operand(XMMRegister reg, Address adr);
783
784 // Immediate-to-memory forms
785 void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);
786
787 protected:
788 #ifdef ASSERT
789 void check_relocation(RelocationHolder const& rspec, int format);
790 #endif
791
792 void emit_data(jint data, relocInfo::relocType rtype, int format);
793 void emit_data(jint data, RelocationHolder const& rspec, int format);
794 void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
795 void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
796
797 bool reachable(AddressLiteral adr) NOT_LP64({ return true;});
798
799 // These are all easily abused and hence protected
800
801 // 32BIT ONLY SECTION
802#ifndef _LP64
803 // Make these disappear in 64bit mode since they would never be correct
804 void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
805 void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
806
807 void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
808 void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
809
810 void push_literal32(int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
811#else
812 // 64BIT ONLY SECTION
813 void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec); // 64BIT ONLY
814
815 void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
816 void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);
817
818 void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
819 void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
820#endif // _LP64
821
822  // These are unique in that we are assured by the caller that the 32bit
823 // relative in these instructions will always be able to reach the potentially
824 // 64bit address described by entry. Since they can take a 64bit address they
825 // don't have the 32 suffix like the other instructions in this class.
826
827 void call_literal(address entry, RelocationHolder const& rspec);
828 void jmp_literal(address entry, RelocationHolder const& rspec);
829
830 // Avoid using directly section
831 // Instructions in this section are actually usable by anyone without danger
832  // of failure but have performance issues that are addressed by enhanced
833  // instructions which will do the proper thing based on the particular cpu.
834 // We protect them because we don't trust you...
835
836 // Don't use next inc() and dec() methods directly. INC & DEC instructions
837 // could cause a partial flag stall since they don't set CF flag.
838 // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
839 // which call inc() & dec() or add() & sub() in accordance with
840 // the product flag UseIncDec value.
841
842 void decl(Register dst);
843 void decl(Address dst);
844 void decq(Address dst);
845
846 void incl(Register dst);
847 void incl(Address dst);
848 void incq(Register dst);
849 void incq(Address dst);
850
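Per the comment above, a hedged sketch of the preferred MacroAssembler-level pattern:

    // instead of raw incl(rbx) / decl(rbx):
    increment(rbx);   // emits inc or add, per the UseIncDec flag
    decrement(rbx);   // emits dec or sub, likewise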
851 // New cpus require use of movsd and movss to avoid partial register stall
852 // when loading from memory. But for old Opteron use movlpd instead of movsd.
853 // The selection is done in MacroAssembler::movdbl() and movflt().
854
855 // Move Scalar Single-Precision Floating-Point Values
856 void movss(XMMRegister dst, Address src);
857 void movss(XMMRegister dst, XMMRegister src);
858 void movss(Address dst, XMMRegister src);
859
860 // Move Scalar Double-Precision Floating-Point Values
861 void movsd(XMMRegister dst, Address src);
862 void movsd(XMMRegister dst, XMMRegister src);
863 void movsd(Address dst, XMMRegister src);
864 void movlpd(XMMRegister dst, Address src);
865
866 // New cpus require use of movaps and movapd to avoid partial register stall
867 // when moving between registers.
868 void movaps(XMMRegister dst, XMMRegister src);
869 void movapd(XMMRegister dst, XMMRegister src);
870
871 // End avoid using directly
872
873
874 // Instruction prefixes
875 void prefix(Prefix p);
876
877 public:
878
879 // Creation
880 Assembler(CodeBuffer* code) : AbstractAssembler(code) {
881 init_attributes();
882 }
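A minimal creation sketch (code_start/code_size are placeholders for a code area allocated elsewhere):

    CodeBuffer cb(code_start, code_size);
    Assembler  assem(&cb);
    assem.movl(rax, 42);   // bytes are emitted into cb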
883
884 // Decoding
885 static address locate_operand(address inst, WhichOperand which);
886 static address locate_next_instruction(address inst);
887
888 // Utilities
889 static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
890 int cur_tuple_type, int in_size_in_bits, int cur_encoding);
891
892 // Generic instructions
893 // Does 32bit or 64bit as needed for the platform. In some sense these
894 // belong in macro assembler but there is no need for both varieties to exist
895
896 void init_attributes(void);
897
898 void set_attributes(InstructionAttr *attributes) { _attributes = attributes; }
899  void clear_attributes(void) { _attributes = NULL; }
900
901 void set_managed(void) { NOT_LP64(_is_managed = true;) }
902 void clear_managed(void) { NOT_LP64(_is_managed = false;) }
903 bool is_managed(void) {
904 NOT_LP64(return _is_managed;)
905    LP64_ONLY(return false;) }
906
907 void lea(Register dst, Address src);
908
909 void mov(Register dst, Register src);
910
911#ifdef _LP64
912 // support caching the result of some routines
913
914 // must be called before pusha(), popa(), vzeroupper() - checked with asserts
915 static void precompute_instructions();
916
917 void pusha_uncached();
918 void popa_uncached();
919#endif
920 void vzeroupper_uncached();
921 void decq(Register dst);
922
923 void pusha();
924 void popa();
925
926 void pushf();
927 void popf();
928
929 void push(int32_t imm32);
930
931 void push(Register src);
932
933 void pop(Register dst);
934
935 // These are dummies to prevent surprise implicit conversions to Register
936 void push(void* v);
937 void pop(void* v);
938
939 // These do register sized moves/scans
940 void rep_mov();
941 void rep_stos();
942 void rep_stosb();
943 void repne_scan();
944#ifdef _LP64
945 void repne_scanl();
946#endif
947
948 // Vanilla instructions in lexical order
949
950 void adcl(Address dst, int32_t imm32);
951 void adcl(Address dst, Register src);
952 void adcl(Register dst, int32_t imm32);
953 void adcl(Register dst, Address src);
954 void adcl(Register dst, Register src);
955
956 void adcq(Register dst, int32_t imm32);
957 void adcq(Register dst, Address src);
958 void adcq(Register dst, Register src);
959
960 void addb(Address dst, int imm8);
961 void addw(Register dst, Register src);
962 void addw(Address dst, int imm16);
963
964 void addl(Address dst, int32_t imm32);
965 void addl(Address dst, Register src);
966 void addl(Register dst, int32_t imm32);
967 void addl(Register dst, Address src);
968 void addl(Register dst, Register src);
969
970 void addq(Address dst, int32_t imm32);
971 void addq(Address dst, Register src);
972 void addq(Register dst, int32_t imm32);
973 void addq(Register dst, Address src);
974 void addq(Register dst, Register src);
975
976#ifdef _LP64
977 //Add Unsigned Integers with Carry Flag
978 void adcxq(Register dst, Register src);
979
980 //Add Unsigned Integers with Overflow Flag
981 void adoxq(Register dst, Register src);
982#endif
983
984 void addr_nop_4();
985 void addr_nop_5();
986 void addr_nop_7();
987 void addr_nop_8();
988
989 // Add Scalar Double-Precision Floating-Point Values
990 void addsd(XMMRegister dst, Address src);
991 void addsd(XMMRegister dst, XMMRegister src);
992
993 // Add Scalar Single-Precision Floating-Point Values
994 void addss(XMMRegister dst, Address src);
995 void addss(XMMRegister dst, XMMRegister src);
996
997 // AES instructions
998 void aesdec(XMMRegister dst, Address src);
999 void aesdec(XMMRegister dst, XMMRegister src);
1000 void aesdeclast(XMMRegister dst, Address src);
1001 void aesdeclast(XMMRegister dst, XMMRegister src);
1002 void aesenc(XMMRegister dst, Address src);
1003 void aesenc(XMMRegister dst, XMMRegister src);
1004 void aesenclast(XMMRegister dst, Address src);
1005 void aesenclast(XMMRegister dst, XMMRegister src);
1006 // Vector AES instructions
1007 void vaesenc(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1008 void vaesenclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1009 void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1010 void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1011
1012 void andw(Register dst, Register src);
1013 void andb(Address dst, Register src);
1014
1015 void andl(Address dst, int32_t imm32);
1016 void andl(Register dst, int32_t imm32);
1017 void andl(Register dst, Address src);
1018 void andl(Register dst, Register src);
1019 void andl(Address dst, Register src);
1020
1021 void andq(Address dst, int32_t imm32);
1022 void andq(Register dst, int32_t imm32);
1023 void andq(Register dst, Address src);
1024 void andq(Register dst, Register src);
1025 void andq(Address dst, Register src);
1026
1027 // BMI instructions
1028 void andnl(Register dst, Register src1, Register src2);
1029 void andnl(Register dst, Register src1, Address src2);
1030 void andnq(Register dst, Register src1, Register src2);
1031 void andnq(Register dst, Register src1, Address src2);
1032
1033 void blsil(Register dst, Register src);
1034 void blsil(Register dst, Address src);
1035 void blsiq(Register dst, Register src);
1036 void blsiq(Register dst, Address src);
1037
1038 void blsmskl(Register dst, Register src);
1039 void blsmskl(Register dst, Address src);
1040 void blsmskq(Register dst, Register src);
1041 void blsmskq(Register dst, Address src);
1042
1043 void blsrl(Register dst, Register src);
1044 void blsrl(Register dst, Address src);
1045 void blsrq(Register dst, Register src);
1046 void blsrq(Register dst, Address src);
1047
1048 void bsfl(Register dst, Register src);
1049 void bsrl(Register dst, Register src);
1050
1051#ifdef _LP64
1052 void bsfq(Register dst, Register src);
1053 void bsrq(Register dst, Register src);
1054#endif
1055
1056 void bswapl(Register reg);
1057
1058 void bswapq(Register reg);
1059
1060 void call(Label& L, relocInfo::relocType rtype);
1061 void call(Register reg); // push pc; pc <- reg
1062 void call(Address adr); // push pc; pc <- adr
1063
1064 void cdql();
1065
1066 void cdqq();
1067
1068 void cld();
1069
1070 void clflush(Address adr);
1071 void clflushopt(Address adr);
1072 void clwb(Address adr);
1073
1074 void cmovl(Condition cc, Register dst, Register src);
1075 void cmovl(Condition cc, Register dst, Address src);
1076
1077 void cmovq(Condition cc, Register dst, Register src);
1078 void cmovq(Condition cc, Register dst, Address src);
1079
1080
1081 void cmpb(Address dst, int imm8);
1082
1083 void cmpl(Address dst, int32_t imm32);
1084
1085 void cmp(Register dst, int32_t imm32);
1086 void cmpl(Register dst, int32_t imm32);
1087 void cmpl(Register dst, Register src);
1088 void cmpl(Register dst, Address src);
1089
1090 void cmpq(Address dst, int32_t imm32);
1091 void cmpq(Address dst, Register src);
1092
1093 void cmpq(Register dst, int32_t imm32);
1094 void cmpq(Register dst, Register src);
1095 void cmpq(Register dst, Address src);
1096
1097  // these are dummies used to catch attempts to convert NULL to Register
1098 void cmpl(Register dst, void* junk); // dummy
1099 void cmpq(Register dst, void* junk); // dummy
1100
1101 void cmpw(Address dst, int imm16);
1102
1103 void cmpxchg8 (Address adr);
1104
1105 void cmpxchgb(Register reg, Address adr);
1106 void cmpxchgl(Register reg, Address adr);
1107
1108 void cmpxchgq(Register reg, Address adr);
1109 void cmpxchgw(Register reg, Address adr);
1110
1111 // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
1112 void comisd(XMMRegister dst, Address src);
1113 void comisd(XMMRegister dst, XMMRegister src);
1114
1115 // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
1116 void comiss(XMMRegister dst, Address src);
1117 void comiss(XMMRegister dst, XMMRegister src);
1118
1119 // Identify processor type and features
1120 void cpuid();
1121
1122 // CRC32C
1123 void crc32(Register crc, Register v, int8_t sizeInBytes);
1124 void crc32(Register crc, Address adr, int8_t sizeInBytes);
1125
1126 // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
1127 void cvtsd2ss(XMMRegister dst, XMMRegister src);
1128 void cvtsd2ss(XMMRegister dst, Address src);
1129
1130 // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
1131 void cvtsi2sdl(XMMRegister dst, Register src);
1132 void cvtsi2sdl(XMMRegister dst, Address src);
1133 void cvtsi2sdq(XMMRegister dst, Register src);
1134 void cvtsi2sdq(XMMRegister dst, Address src);
1135
1136 // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
1137 void cvtsi2ssl(XMMRegister dst, Register src);
1138 void cvtsi2ssl(XMMRegister dst, Address src);
1139 void cvtsi2ssq(XMMRegister dst, Register src);
1140 void cvtsi2ssq(XMMRegister dst, Address src);
1141
1142 // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
1143 void cvtdq2pd(XMMRegister dst, XMMRegister src);
1144 void vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len);
1145
1146 // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
1147 void cvtdq2ps(XMMRegister dst, XMMRegister src);
1148 void vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len);
1149
1150 // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
1151 void cvtss2sd(XMMRegister dst, XMMRegister src);
1152 void cvtss2sd(XMMRegister dst, Address src);
1153
1154 // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
1155 void cvttsd2sil(Register dst, Address src);
1156 void cvttsd2sil(Register dst, XMMRegister src);
1157 void cvttsd2siq(Register dst, Address src);
1158 void cvttsd2siq(Register dst, XMMRegister src);
1159
1160 // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
1161 void cvttss2sil(Register dst, XMMRegister src);
1162 void cvttss2siq(Register dst, XMMRegister src);
1163
1164 // Convert vector double to int
1165 void cvttpd2dq(XMMRegister dst, XMMRegister src);
1166
1167 // Convert vector float and double
1168 void vcvtps2pd(XMMRegister dst, XMMRegister src, int vector_len);
1169 void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len);
1170
1171 // Convert vector float and int
1172 void vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len);
1173
1174 // Convert vector long to vector FP
1175 void evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len);
1176 void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len);
1177
1178 // Convert vector double to long
1179 void evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len);
1180
1181 // Evex casts with truncation
1182 void evpmovwb(XMMRegister dst, XMMRegister src, int vector_len);
1183 void evpmovdw(XMMRegister dst, XMMRegister src, int vector_len);
1184 void evpmovdb(XMMRegister dst, XMMRegister src, int vector_len);
1185 void evpmovqd(XMMRegister dst, XMMRegister src, int vector_len);
1186 void evpmovqb(XMMRegister dst, XMMRegister src, int vector_len);
1187 void evpmovqw(XMMRegister dst, XMMRegister src, int vector_len);
1188
1189 //Abs of packed Integer values
1190 void pabsb(XMMRegister dst, XMMRegister src);
1191 void pabsw(XMMRegister dst, XMMRegister src);
1192 void pabsd(XMMRegister dst, XMMRegister src);
1193 void vpabsb(XMMRegister dst, XMMRegister src, int vector_len);
1194 void vpabsw(XMMRegister dst, XMMRegister src, int vector_len);
1195 void vpabsd(XMMRegister dst, XMMRegister src, int vector_len);
1196 void evpabsq(XMMRegister dst, XMMRegister src, int vector_len);
1197
1198 // Divide Scalar Double-Precision Floating-Point Values
1199 void divsd(XMMRegister dst, Address src);
1200 void divsd(XMMRegister dst, XMMRegister src);
1201
1202 // Divide Scalar Single-Precision Floating-Point Values
1203 void divss(XMMRegister dst, Address src);
1204 void divss(XMMRegister dst, XMMRegister src);
1205
1206
1207#ifndef _LP64
1208 private:
1209
1210 void emit_farith(int b1, int b2, int i);
1211
1212 public:
1213 void emms();
1214
1215 void fabs();
1216
1217 void fadd(int i);
1218
1219 void fadd_d(Address src);
1220 void fadd_s(Address src);
1221
1222 // "Alternate" versions of x87 instructions place result down in FPU
1223 // stack instead of on TOS
1224
1225 void fadda(int i); // "alternate" fadd
1226 void faddp(int i = 1);
1227
1228 void fchs();
1229
1230 void fcom(int i);
1231
1232 void fcomp(int i = 1);
1233 void fcomp_d(Address src);
1234 void fcomp_s(Address src);
1235
1236 void fcompp();
1237
1238 void fcos();
1239
1240 void fdecstp();
1241
1242 void fdiv(int i);
1243 void fdiv_d(Address src);
1244 void fdivr_s(Address src);
1245 void fdiva(int i); // "alternate" fdiv
1246 void fdivp(int i = 1);
1247
1248 void fdivr(int i);
1249 void fdivr_d(Address src);
1250 void fdiv_s(Address src);
1251
1252 void fdivra(int i); // "alternate" reversed fdiv
1253
1254 void fdivrp(int i = 1);
1255
1256 void ffree(int i = 0);
1257
1258 void fild_d(Address adr);
1259 void fild_s(Address adr);
1260
1261 void fincstp();
1262
1263 void finit();
1264
1265 void fist_s (Address adr);
1266 void fistp_d(Address adr);
1267 void fistp_s(Address adr);
1268
1269 void fld1();
1270
1271 void fld_d(Address adr);
1272 void fld_s(Address adr);
1273 void fld_s(int index);
1274
1275 void fldcw(Address src);
1276
1277 void fldenv(Address src);
1278
1279 void fldlg2();
1280
1281 void fldln2();
1282
1283 void fldz();
1284
1285 void flog();
1286 void flog10();
1287
1288 void fmul(int i);
1289
1290 void fmul_d(Address src);
1291 void fmul_s(Address src);
1292
1293 void fmula(int i); // "alternate" fmul
1294
1295 void fmulp(int i = 1);
1296
1297 void fnsave(Address dst);
1298
1299 void fnstcw(Address src);
1300
1301 void fnstsw_ax();
1302
1303 void fprem();
1304 void fprem1();
1305
1306 void frstor(Address src);
1307
1308 void fsin();
1309
1310 void fsqrt();
1311
1312 void fst_d(Address adr);
1313 void fst_s(Address adr);
1314
1315 void fstp_d(Address adr);
1316 void fstp_d(int index);
1317 void fstp_s(Address adr);
1318
1319 void fsub(int i);
1320 void fsub_d(Address src);
1321 void fsub_s(Address src);
1322
1323 void fsuba(int i); // "alternate" fsub
1324
1325 void fsubp(int i = 1);
1326
1327 void fsubr(int i);
1328 void fsubr_d(Address src);
1329 void fsubr_s(Address src);
1330
1331 void fsubra(int i); // "alternate" reversed fsub
1332
1333 void fsubrp(int i = 1);
1334
1335 void ftan();
1336
1337 void ftst();
1338
1339 void fucomi(int i = 1);
1340 void fucomip(int i = 1);
1341
1342 void fwait();
1343
1344 void fxch(int i = 1);
1345
1346 void fyl2x();
1347 void frndint();
1348 void f2xm1();
1349 void fldl2e();
1350#endif // !_LP64
1351
1352 // operands that only take the original 32bit registers
1353 void emit_operand32(Register reg, Address adr);
1354
1355 void fld_x(Address adr); // extended-precision (80-bit) format
1356 void fstp_x(Address adr); // extended-precision (80-bit) format
1357 void fxrstor(Address src);
1358 void xrstor(Address src);
1359
1360 void fxsave(Address dst);
1361 void xsave(Address dst);
1362
1363 void hlt();
1364
1365 void idivl(Register src);
1366 void divl(Register src); // Unsigned division
1367
1368#ifdef _LP64
1369 void idivq(Register src);
1370#endif
1371
1372 void imull(Register src);
1373 void imull(Register dst, Register src);
1374 void imull(Register dst, Register src, int value);
1375 void imull(Register dst, Address src, int value);
1376 void imull(Register dst, Address src);
1377
1378#ifdef _LP64
1379 void imulq(Register dst, Register src);
1380 void imulq(Register dst, Register src, int value);
1381 void imulq(Register dst, Address src, int value);
1382 void imulq(Register dst, Address src);
1383 void imulq(Register dst);
1384#endif
1385
1386  // jcc is the generic conditional branch generator to run-
1387  // time routines; jcc is used for branches to labels. jcc
1388 // takes a branch opcode (cc) and a label (L) and generates
1389 // either a backward branch or a forward branch and links it
1390 // to the label fixup chain. Usage:
1391 //
1392 // Label L; // unbound label
1393 // jcc(cc, L); // forward branch to unbound label
1394 // bind(L); // bind label to the current pc
1395 // jcc(cc, L); // backward branch to bound label
1396 // bind(L); // illegal: a label may be bound only once
1397 //
1398 // Note: The same Label can be used for forward and backward branches
1399 // but it may be bound only once.
1400
1401 void jcc(Condition cc, Label& L, bool maybe_short = true);
1402
1403 // Conditional jump to a 8-bit offset to L.
1404 // WARNING: be very careful using this for forward jumps. If the label is
1405 // not bound within an 8-bit offset of this instruction, a run-time error
1406 // will occur.
1407
1408 // Use macro to record file and line number.
1409 #define jccb(cc, L) jccb_0(cc, L, __FILE__, __LINE__)
1410
1411 void jccb_0(Condition cc, Label& L, const char* file, int line);
1412
1413 void jmp(Address entry); // pc <- entry
1414
1415 // Label operations & relative jumps (PPUM Appendix D)
1416 void jmp(Label& L, bool maybe_short = true); // unconditional jump to L
1417
1418 void jmp(Register entry); // pc <- entry
1419
1420 // Unconditional 8-bit offset jump to L.
1421 // WARNING: be very careful using this for forward jumps. If the label is
1422 // not bound within an 8-bit offset of this instruction, a run-time error
1423 // will occur.
1424
1425 // Use macro to record file and line number.
1426 #define jmpb(L) jmpb_0(L, __FILE__, __LINE__)
1427
1428 void jmpb_0(Label& L, const char* file, int line);
1429
1430 void ldmxcsr( Address src );
1431
1432 void leal(Register dst, Address src);
1433
1434 void leaq(Register dst, Address src);
1435
1436 void lfence();
1437
1438 void lock();
1439 void size_prefix();
1440
1441 void lzcntl(Register dst, Register src);
1442
1443#ifdef _LP64
1444 void lzcntq(Register dst, Register src);
1445#endif
1446
1447 enum Membar_mask_bits {
1448 StoreStore = 1 << 3,
1449 LoadStore = 1 << 2,
1450 StoreLoad = 1 << 1,
1451 LoadLoad = 1 << 0
1452 };
1453
1454 // Serializes memory and blows flags
1455 void membar(Membar_mask_bits order_constraint);
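A hedged usage sketch; the mask bits above can be OR'ed together and passed back through the enum type:

    // request a full four-way fence
    membar(Assembler::Membar_mask_bits(Assembler::LoadLoad  | Assembler::LoadStore |
                                       Assembler::StoreLoad | Assembler::StoreStore));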
1456
1457 void mfence();
1458 void sfence();
1459
1460 // Moves
1461
1462 void mov64(Register dst, int64_t imm64);
1463 void mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format);
1464
1465 void movb(Address dst, Register src);
1466 void movb(Address dst, int imm8);
1467 void movb(Register dst, Address src);
1468
1469 void movddup(XMMRegister dst, XMMRegister src);
1470
1471 void kandbl(KRegister dst, KRegister src1, KRegister src2);
1472 void kandwl(KRegister dst, KRegister src1, KRegister src2);
1473 void kanddl(KRegister dst, KRegister src1, KRegister src2);
1474 void kandql(KRegister dst, KRegister src1, KRegister src2);
1475
1476 void korbl(KRegister dst, KRegister src1, KRegister src2);
1477 void korwl(KRegister dst, KRegister src1, KRegister src2);
1478 void kordl(KRegister dst, KRegister src1, KRegister src2);
1479 void korql(KRegister dst, KRegister src1, KRegister src2);
1480
1481 void kxorbl(KRegister dst, KRegister src1, KRegister src2);
1482 void kxorwl(KRegister dst, KRegister src1, KRegister src2);
1483 void kxordl(KRegister dst, KRegister src1, KRegister src2);
1484 void kxorql(KRegister dst, KRegister src1, KRegister src2);
1485 void kmovbl(KRegister dst, Register src);
1486 void kmovbl(Register dst, KRegister src);
1487 void kmovbl(KRegister dst, KRegister src);
1488 void kmovwl(KRegister dst, Register src);
1489 void kmovwl(KRegister dst, Address src);
1490 void kmovwl(Register dst, KRegister src);
1491 void kmovwl(Address dst, KRegister src);
1492 void kmovwl(KRegister dst, KRegister src);
1493 void kmovdl(KRegister dst, Register src);
1494 void kmovdl(Register dst, KRegister src);
1495 void kmovql(KRegister dst, KRegister src);
1496 void kmovql(Address dst, KRegister src);
1497 void kmovql(KRegister dst, Address src);
1498 void kmovql(KRegister dst, Register src);
1499 void kmovql(Register dst, KRegister src);
1500
1501 void knotbl(KRegister dst, KRegister src);
1502 void knotwl(KRegister dst, KRegister src);
1503 void knotdl(KRegister dst, KRegister src);
1504 void knotql(KRegister dst, KRegister src);
1505
1506 void kortestbl(KRegister dst, KRegister src);
1507 void kortestwl(KRegister dst, KRegister src);
1508 void kortestdl(KRegister dst, KRegister src);
1509 void kortestql(KRegister dst, KRegister src);
1510
1511 void kxnorbl(KRegister dst, KRegister src1, KRegister src2);
1512 void kshiftlbl(KRegister dst, KRegister src, int imm8);
1513 void kshiftrbl(KRegister dst, KRegister src, int imm8);
1514 void kshiftrwl(KRegister dst, KRegister src, int imm8);
1515 void kshiftrdl(KRegister dst, KRegister src, int imm8);
1516 void kshiftrql(KRegister dst, KRegister src, int imm8);
1517 void ktestq(KRegister src1, KRegister src2);
1518 void ktestd(KRegister src1, KRegister src2);
1519
1520 void ktestql(KRegister dst, KRegister src);
1521 void ktestdl(KRegister dst, KRegister src);
1522 void ktestwl(KRegister dst, KRegister src);
1523 void ktestbl(KRegister dst, KRegister src);
1524
1525 void movdl(XMMRegister dst, Register src);
1526 void movdl(Register dst, XMMRegister src);
1527 void movdl(XMMRegister dst, Address src);
1528 void movdl(Address dst, XMMRegister src);
1529
1530 // Move Double Quadword
1531 void movdq(XMMRegister dst, Register src);
1532 void movdq(Register dst, XMMRegister src);
1533
1534 // Move Aligned Double Quadword
1535 void movdqa(XMMRegister dst, XMMRegister src);
1536 void movdqa(XMMRegister dst, Address src);
1537
1538 // Move Unaligned Double Quadword
1539 void movdqu(Address dst, XMMRegister src);
1540 void movdqu(XMMRegister dst, Address src);
1541 void movdqu(XMMRegister dst, XMMRegister src);
1542
1543 // Move Unaligned 256bit Vector
1544 void vmovdqu(Address dst, XMMRegister src);
1545 void vmovdqu(XMMRegister dst, Address src);
1546 void vmovdqu(XMMRegister dst, XMMRegister src);
1547
1548 // Move Unaligned 512bit Vector
1549 void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len);
1550 void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
1551 void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
1552 void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1553 void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1554 void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
1555 void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1556 void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
1557 void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1558 void evmovdqul(Address dst, XMMRegister src, int vector_len);
1559 void evmovdqul(XMMRegister dst, Address src, int vector_len);
1560 void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
1561 void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1562 void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1563 void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1564 void evmovdquq(Address dst, XMMRegister src, int vector_len);
1565 void evmovdquq(XMMRegister dst, Address src, int vector_len);
1566 void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
1567 void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1568 void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1569 void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1570
1571 // Move lower 64bit to high 64bit in 128bit register
1572 void movlhps(XMMRegister dst, XMMRegister src);
1573
1574 void movl(Register dst, int32_t imm32);
1575 void movl(Address dst, int32_t imm32);
1576 void movl(Register dst, Register src);
1577 void movl(Register dst, Address src);
1578 void movl(Address dst, Register src);
1579
1580 // These dummies prevent using movl from converting a zero (like NULL) into Register
1581 // by giving the compiler two choices it can't resolve
1582
1583 void movl(Address dst, void* junk);
1584 void movl(Register dst, void* junk);
1585
1586#ifdef _LP64
1587 void movq(Register dst, Register src);
1588 void movq(Register dst, Address src);
1589 void movq(Address dst, Register src);
1590 void movq(Address dst, int32_t imm32);
1591 void movq(Register dst, int32_t imm32);
1592
1593 // These dummies prevent using movq from converting a zero (like NULL) into Register
1594 // by giving the compiler two choices it can't resolve
1595
1596 void movq(Address dst, void* dummy);
1597 void movq(Register dst, void* dummy);
1598#endif
1599
1600 // Move Quadword
1601 void movq(Address dst, XMMRegister src);
1602 void movq(XMMRegister dst, Address src);
1603 void movq(XMMRegister dst, XMMRegister src);
1604 void movq(Register dst, XMMRegister src);
1605 void movq(XMMRegister dst, Register src);
1606
1607 void movsbl(Register dst, Address src);
1608 void movsbl(Register dst, Register src);
1609
1610#ifdef _LP64
1611 void movsbq(Register dst, Address src);
1612 void movsbq(Register dst, Register src);
1613
1614 // Move signed 32bit immediate to 64bit extending sign
1615 void movslq(Address dst, int32_t imm64);
1616 void movslq(Register dst, int32_t imm64);
1617
1618 void movslq(Register dst, Address src);
1619 void movslq(Register dst, Register src);
1620 void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
1621#endif
1622
1623 void movswl(Register dst, Address src);
1624 void movswl(Register dst, Register src);
1625
1626#ifdef _LP64
1627 void movswq(Register dst, Address src);
1628 void movswq(Register dst, Register src);
1629#endif
1630
1631 void movw(Address dst, int imm16);
1632 void movw(Register dst, Address src);
1633 void movw(Address dst, Register src);
1634
1635 void movzbl(Register dst, Address src);
1636 void movzbl(Register dst, Register src);
1637
1638#ifdef _LP64
1639 void movzbq(Register dst, Address src);
1640 void movzbq(Register dst, Register src);
1641#endif
1642
1643 void movzwl(Register dst, Address src);
1644 void movzwl(Register dst, Register src);
1645
1646#ifdef _LP64
1647 void movzwq(Register dst, Address src);
1648 void movzwq(Register dst, Register src);
1649#endif
1650
1651 // Unsigned multiply with RAX destination register
1652 void mull(Address src);
1653 void mull(Register src);
1654
1655#ifdef _LP64
1656 void mulq(Address src);
1657 void mulq(Register src);
1658 void mulxq(Register dst1, Register dst2, Register src);
1659#endif
1660
1661 // Multiply Scalar Double-Precision Floating-Point Values
1662 void mulsd(XMMRegister dst, Address src);
1663 void mulsd(XMMRegister dst, XMMRegister src);
1664
1665 // Multiply Scalar Single-Precision Floating-Point Values
1666 void mulss(XMMRegister dst, Address src);
1667 void mulss(XMMRegister dst, XMMRegister src);
1668
1669 void negl(Register dst);
1670 void negl(Address dst);
1671
1672#ifdef _LP64
1673 void negq(Register dst);
1674 void negq(Address dst);
1675#endif
1676
1677 void nop(int i = 1);
1678
1679 void notl(Register dst);
1680
1681#ifdef _LP64
1682 void notq(Register dst);
1683
1684 void btsq(Address dst, int imm8);
1685 void btrq(Address dst, int imm8);
1686#endif
1687
1688 void orw(Register dst, Register src);
1689
1690 void orl(Address dst, int32_t imm32);
1691 void orl(Register dst, int32_t imm32);
1692 void orl(Register dst, Address src);
1693 void orl(Register dst, Register src);
1694 void orl(Address dst, Register src);
1695
1696 void orb(Address dst, int imm8);
1697 void orb(Address dst, Register src);
1698
1699 void orq(Address dst, int32_t imm32);
1700 void orq(Address dst, Register src);
1701 void orq(Register dst, int32_t imm32);
1702 void orq(Register dst, Address src);
1703 void orq(Register dst, Register src);
1704
1705 // Pack with signed saturation
1706 void packsswb(XMMRegister dst, XMMRegister src);
1707 void vpacksswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1708 void packssdw(XMMRegister dst, XMMRegister src);
1709 void vpackssdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1710
1711 // Pack with unsigned saturation
1712 void packuswb(XMMRegister dst, XMMRegister src);
1713 void packuswb(XMMRegister dst, Address src);
1714 void packusdw(XMMRegister dst, XMMRegister src);
1715 void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1716 void vpackusdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1717
1718 // Permutations
1719 void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1720 void vpermq(XMMRegister dst, XMMRegister src, int imm8);
1721 void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1722 void vpermb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1723 void vpermb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1724 void vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1725 void vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1726 void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1727 void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
1728 void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
1729 void vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1730 void vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1731 void vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1732 void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1733 void evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1734 void evpmultishiftqb(XMMRegister dst, XMMRegister ctl, XMMRegister src, int vector_len);
1735
1736 void pause();
1737
1738 // Undefined Instruction
1739 void ud2();
1740
1741 // SSE4.2 string instructions
1742 void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1743 void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1744
1745 void pcmpeqb(XMMRegister dst, XMMRegister src);
1746 void vpcmpCCbwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);
1747
1748 void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1749 void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1750 void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1751 void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
1752
1753 void vpcmpgtb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1754 void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1755 void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
1756
1757 void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
1758 void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
1759
1760 void pcmpeqw(XMMRegister dst, XMMRegister src);
1761 void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1762 void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1763 void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1764
1765 void vpcmpgtw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1766
1767 void pcmpeqd(XMMRegister dst, XMMRegister src);
1768 void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1769 void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len);
1770 void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
1771
1772 void pcmpeqq(XMMRegister dst, XMMRegister src);
1773 void vpcmpCCq(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);
1774 void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1775 void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1776 void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1777
1778 void pcmpgtq(XMMRegister dst, XMMRegister src);
1779 void vpcmpgtq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1780
1781 void pmovmskb(Register dst, XMMRegister src);
1782 void vpmovmskb(Register dst, XMMRegister src, int vec_enc);
1783 void vmovmskps(Register dst, XMMRegister src, int vec_enc);
1784 void vmovmskpd(Register dst, XMMRegister src, int vec_enc);
1785 void vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1786
1787 // SSE 4.1 extract
1788 void pextrd(Register dst, XMMRegister src, int imm8);
1789 void pextrq(Register dst, XMMRegister src, int imm8);
1790 void pextrd(Address dst, XMMRegister src, int imm8);
1791 void pextrq(Address dst, XMMRegister src, int imm8);
1792 void pextrb(Register dst, XMMRegister src, int imm8);
1793 void pextrb(Address dst, XMMRegister src, int imm8);
1794 // SSE 2 extract
1795 void pextrw(Register dst, XMMRegister src, int imm8);
1796 void pextrw(Address dst, XMMRegister src, int imm8);
1797
1798 // SSE 4.1 insert
1799 void pinsrd(XMMRegister dst, Register src, int imm8);
1800 void pinsrq(XMMRegister dst, Register src, int imm8);
1801 void pinsrb(XMMRegister dst, Register src, int imm8);
1802 void pinsrd(XMMRegister dst, Address src, int imm8);
1803 void pinsrq(XMMRegister dst, Address src, int imm8);
1804 void pinsrb(XMMRegister dst, Address src, int imm8);
1805 void insertps(XMMRegister dst, XMMRegister src, int imm8);
1806 // SSE 2 insert
1807 void pinsrw(XMMRegister dst, Register src, int imm8);
1808 void pinsrw(XMMRegister dst, Address src, int imm8);
1809
1810 // AVX insert
1811 void vpinsrd(XMMRegister dst, XMMRegister nds, Register src, int imm8);
1812 void vpinsrb(XMMRegister dst, XMMRegister nds, Register src, int imm8);
1813 void vpinsrq(XMMRegister dst, XMMRegister nds, Register src, int imm8);
1814 void vpinsrw(XMMRegister dst, XMMRegister nds, Register src, int imm8);
1815 void vinsertps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
1816
1817 // Zero extend moves
1818 void pmovzxbw(XMMRegister dst, XMMRegister src);
1819 void pmovzxbw(XMMRegister dst, Address src);
1820 void pmovzxbd(XMMRegister dst, XMMRegister src);
1821 void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
1822 void pmovzxdq(XMMRegister dst, XMMRegister src);
1823 void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len);
1824 void vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len);
1825 void vpmovzxbd(XMMRegister dst, XMMRegister src, int vector_len);
1826 void vpmovzxbq(XMMRegister dst, XMMRegister src, int vector_len);
1827 void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
1828
1829 // Sign extend moves
1830 void pmovsxbd(XMMRegister dst, XMMRegister src);
1831 void pmovsxbq(XMMRegister dst, XMMRegister src);
1832 void pmovsxbw(XMMRegister dst, XMMRegister src);
1833 void pmovsxwd(XMMRegister dst, XMMRegister src);
1834 void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len);
1835 void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len);
1836 void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);
1837 void vpmovsxwd(XMMRegister dst, XMMRegister src, int vector_len);
1838 void vpmovsxwq(XMMRegister dst, XMMRegister src, int vector_len);
1839 void vpmovsxdq(XMMRegister dst, XMMRegister src, int vector_len);
1840
1841 void evpmovwb(Address dst, XMMRegister src, int vector_len);
1842 void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);
1843
1844 void vpmovzxwd(XMMRegister dst, XMMRegister src, int vector_len);
1845
1846 void evpmovdb(Address dst, XMMRegister src, int vector_len);
1847
1848 // Multiply add
1849 void pmaddwd(XMMRegister dst, XMMRegister src);
1850 void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1851 void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
1852
1853 // Multiply add accumulate
1854 void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1855
1856#ifndef _LP64 // no 32bit push/pop on amd64
1857 void popl(Address dst);
1858#endif
1859
1860#ifdef _LP64
1861 void popq(Address dst);
1862 void popq(Register dst);
1863#endif
1864
1865 void popcntl(Register dst, Address src);
1866 void popcntl(Register dst, Register src);
1867
1868 void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
1869
1870#ifdef _LP64
1871 void popcntq(Register dst, Address src);
1872 void popcntq(Register dst, Register src);
1873#endif
1874
1875 // Prefetches (SSE, SSE2, 3DNOW only)
1876
1877 void prefetchnta(Address src);
1878 void prefetchr(Address src);
1879 void prefetcht0(Address src);
1880 void prefetcht1(Address src);
1881 void prefetcht2(Address src);
1882 void prefetchw(Address src);
1883
1884 // Shuffle Bytes
1885 void pshufb(XMMRegister dst, XMMRegister src);
1886 void pshufb(XMMRegister dst, Address src);
1887 void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1888
1889 // Shuffle Packed Doublewords
1890 void pshufd(XMMRegister dst, XMMRegister src, int mode);
1891 void pshufd(XMMRegister dst, Address src, int mode);
1892 void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len);
1893
1894 // Shuffle Packed High/Low Words
1895 void pshufhw(XMMRegister dst, XMMRegister src, int mode);
1896 void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1897 void pshuflw(XMMRegister dst, Address src, int mode);
1898
1899 // Shuffle floats and doubles
1900 void pshufps(XMMRegister, XMMRegister, int);
1901 void pshufpd(XMMRegister, XMMRegister, int);
1902 void vpshufps(XMMRegister, XMMRegister, XMMRegister, int, int);
1903 void vpshufpd(XMMRegister, XMMRegister, XMMRegister, int, int);
1904
1905 // Shuffle packed values at 128-bit granularity
1906 void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
1907
1908 // Shift Right by bytes Logical DoubleQuadword Immediate
1909 void psrldq(XMMRegister dst, int shift);
1910 // Shift Left by bytes Logical DoubleQuadword Immediate
1911 void pslldq(XMMRegister dst, int shift);
1912
1913 // Logical Compare 128bit
1914 void ptest(XMMRegister dst, XMMRegister src);
1915 void ptest(XMMRegister dst, Address src);
1916 // Logical Compare 256bit
1917 void vptest(XMMRegister dst, XMMRegister src);
1918 void vptest(XMMRegister dst, Address src);
1919
1920 void evptestmb(KRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1921
1922 // Vector compare
1923 void vptest(XMMRegister dst, XMMRegister src, int vector_len);
1924
1925 // Interleave Low Bytes
1926 void punpcklbw(XMMRegister dst, XMMRegister src);
1927 void punpcklbw(XMMRegister dst, Address src);
1928
1929 // Interleave Low Doublewords
1930 void punpckldq(XMMRegister dst, XMMRegister src);
1931 void punpckldq(XMMRegister dst, Address src);
1932
1933 // Interleave Low Quadwords
1934 void punpcklqdq(XMMRegister dst, XMMRegister src);
1935
1936#ifndef _LP64 // no 32bit push/pop on amd64
1937 void pushl(Address src);
1938#endif
1939
1940 void pushq(Address src);
1941
1942 void rcll(Register dst, int imm8);
1943
1944 void rclq(Register dst, int imm8);
1945
1946 void rcrq(Register dst, int imm8);
1947
1948 void rcpps(XMMRegister dst, XMMRegister src);
1949
1950 void rcpss(XMMRegister dst, XMMRegister src);
1951
1952 void rdtsc();
1953
1954 void ret(int imm16);
1955
1956 void roll(Register dst);
1957
1958 void roll(Register dst, int imm8);
1959
1960 void rorl(Register dst);
1961
1962 void rorl(Register dst, int imm8);
1963
1964#ifdef _LP64
1965 void rolq(Register dst);
1966 void rolq(Register dst, int imm8);
1967 void rorq(Register dst);
1968 void rorq(Register dst, int imm8);
1969 void rorxq(Register dst, Register src, int imm8);
1970 void rorxd(Register dst, Register src, int imm8);
1971#endif
1972
1973 void sahf();
1974
1975 void sall(Register dst, int imm8);
1976 void sall(Register dst);
1977 void sall(Address dst, int imm8);
1978 void sall(Address dst);
1979
1980 void sarl(Address dst, int imm8);
1981 void sarl(Address dst);
1982 void sarl(Register dst, int imm8);
1983 void sarl(Register dst);
1984
1985#ifdef _LP64
1986 void salq(Register dst, int imm8);
1987 void salq(Register dst);
1988 void salq(Address dst, int imm8);
1989 void salq(Address dst);
1990
1991 void sarq(Address dst, int imm8);
1992 void sarq(Address dst);
1993 void sarq(Register dst, int imm8);
1994 void sarq(Register dst);
1995#endif
1996
1997 void sbbl(Address dst, int32_t imm32);
1998 void sbbl(Register dst, int32_t imm32);
1999 void sbbl(Register dst, Address src);
2000 void sbbl(Register dst, Register src);
2001
2002 void sbbq(Address dst, int32_t imm32);
2003 void sbbq(Register dst, int32_t imm32);
2004 void sbbq(Register dst, Address src);
2005 void sbbq(Register dst, Register src);
2006
2007 void setb(Condition cc, Register dst);
2008
2009 void sete(Register dst);
2010 void setl(Register dst);
2011 void setne(Register dst);
2012
2013 void palignr(XMMRegister dst, XMMRegister src, int imm8);
2014 void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
2015 void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2016
2017 void pblendw(XMMRegister dst, XMMRegister src, int imm8);
2018 void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
2019
2020 void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
2021 void sha1nexte(XMMRegister dst, XMMRegister src);
2022 void sha1msg1(XMMRegister dst, XMMRegister src);
2023 void sha1msg2(XMMRegister dst, XMMRegister src);
2024 // xmm0 is an implicit additional source to the following instruction; see the staging sketch below.
2025 void sha256rnds2(XMMRegister dst, XMMRegister src);
2026 void sha256msg1(XMMRegister dst, XMMRegister src);
2027 void sha256msg2(XMMRegister dst, XMMRegister src);
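// Staging sketch for the implicit operand (illustrative only; msgtmp, state0
// and state1 stand for caller-chosen XMM registers):
//   __ movdqu(xmm0, msgtmp);         // place the message/round-constant words in xmm0
//   __ sha256rnds2(state1, state0);  // xmm0 is consumed implicitly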
2028
2029 void shldl(Register dst, Register src);
2030 void shldl(Register dst, Register src, int8_t imm8);
2031 void shrdl(Register dst, Register src);
2032 void shrdl(Register dst, Register src, int8_t imm8);
2033
2034 void shll(Register dst, int imm8);
2035 void shll(Register dst);
2036
2037 void shlq(Register dst, int imm8);
2038 void shlq(Register dst);
2039
2040 void shrl(Register dst, int imm8);
2041 void shrl(Register dst);
2042 void shrl(Address dst);
2043 void shrl(Address dst, int imm8);
2044
2045 void shrq(Register dst, int imm8);
2046 void shrq(Register dst);
2047 void shrq(Address dst);
2048 void shrq(Address dst, int imm8);
2049
2050 void smovl(); // QQQ generic?
2051
2052 // Compute Square Root of Scalar Double-Precision Floating-Point Value
2053 void sqrtsd(XMMRegister dst, Address src);
2054 void sqrtsd(XMMRegister dst, XMMRegister src);
2055
2056 void roundsd(XMMRegister dst, Address src, int32_t rmode);
2057 void roundsd(XMMRegister dst, XMMRegister src, int32_t rmode);
2058
2059 // Compute Square Root of Scalar Single-Precision Floating-Point Value
2060 void sqrtss(XMMRegister dst, Address src);
2061 void sqrtss(XMMRegister dst, XMMRegister src);
2062
2063 void std();
2064
2065 void stmxcsr(Address dst);
2066
2067 void subl(Address dst, int32_t imm32);
2068 void subl(Address dst, Register src);
2069 void subl(Register dst, int32_t imm32);
2070 void subl(Register dst, Address src);
2071 void subl(Register dst, Register src);
2072
2073 void subq(Address dst, int32_t imm32);
2074 void subq(Address dst, Register src);
2075 void subq(Register dst, int32_t imm32);
2076 void subq(Register dst, Address src);
2077 void subq(Register dst, Register src);
2078
2079 // Force generation of a 4-byte immediate value even if it fits into 8 bits
2080 void subl_imm32(Register dst, int32_t imm32);
2081 void subq_imm32(Register dst, int32_t imm32);
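// e.g. subl_imm32(rsp, framesize) (framesize illustrative) always emits the
// 4-byte-immediate encoding (81 /5 id) rather than the sign-extended 8-bit
// form (83 /5 ib), so the instruction length does not depend on the operand value.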
2082
2083 // Subtract Scalar Double-Precision Floating-Point Values
2084 void subsd(XMMRegister dst, Address src);
2085 void subsd(XMMRegister dst, XMMRegister src);
2086
2087 // Subtract Scalar Single-Precision Floating-Point Values
2088 void subss(XMMRegister dst, Address src);
2089 void subss(XMMRegister dst, XMMRegister src);
2090
2091 void testb(Register dst, int imm8);
2092 void testb(Address dst, int imm8);
2093
2094 void testl(Register dst, int32_t imm32);
2095 void testl(Register dst, Register src);
2096 void testl(Register dst, Address src);
2097
2098 void testq(Address dst, int32_t imm32);
2099 void testq(Register dst, int32_t imm32);
2100 void testq(Register dst, Register src);
2101 void testq(Register dst, Address src);
2102
2103 // BMI - count trailing zeros
2104 void tzcntl(Register dst, Register src);
2105 void tzcntq(Register dst, Register src);
2106
2107 // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
2108 void ucomisd(XMMRegister dst, Address src);
2109 void ucomisd(XMMRegister dst, XMMRegister src);
2110
2111 // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
2112 void ucomiss(XMMRegister dst, Address src);
2113 void ucomiss(XMMRegister dst, XMMRegister src);
2114
2115 void xabort(int8_t imm8);
2116
2117 void xaddb(Address dst, Register src);
2118 void xaddw(Address dst, Register src);
2119 void xaddl(Address dst, Register src);
2120 void xaddq(Address dst, Register src);
2121
2122 void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
2123
2124 void xchgb(Register reg, Address adr);
2125 void xchgw(Register reg, Address adr);
2126 void xchgl(Register reg, Address adr);
2127 void xchgl(Register dst, Register src);
2128
2129 void xchgq(Register reg, Address adr);
2130 void xchgq(Register dst, Register src);
2131
2132 void xend();
2133
2134 // Get Value of Extended Control Register
2135 void xgetbv();
2136
2137 void xorl(Register dst, int32_t imm32);
2138 void xorl(Address dst, int32_t imm32);
2139 void xorl(Register dst, Address src);
2140 void xorl(Register dst, Register src);
2141 void xorl(Address dst, Register src);
2142
2143 void xorb(Address dst, Register src);
2144 void xorb(Register dst, Address src);
2145 void xorw(Register dst, Register src);
2146
2147 void xorq(Register dst, Address src);
2148 void xorq(Address dst, int32_t imm32);
2149 void xorq(Register dst, Register src);
2150 void xorq(Register dst, int32_t imm32);
2151 void xorq(Address dst, Register src);
2152
2153 void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
2154
2155 // AVX 3-operands scalar instructions (encoded with VEX prefix)
2156
2157 void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
2158 void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2159 void vaddss(XMMRegister dst, XMMRegister nds, Address src);
2160 void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2161 void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
2162 void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2163 void vdivss(XMMRegister dst, XMMRegister nds, Address src);
2164 void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2165 void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2166 void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2167 void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
2168 void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2169 void vmulss(XMMRegister dst, XMMRegister nds, Address src);
2170 void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2171 void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
2172 void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2173 void vsubss(XMMRegister dst, XMMRegister nds, Address src);
2174 void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2175
2176 void vmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2177 void vmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2178 void vminss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2179 void vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2180
2181 void shlxl(Register dst, Register src1, Register src2);
2182 void shlxq(Register dst, Register src1, Register src2);
2183 void shrxl(Register dst, Register src1, Register src2);
2184 void shrxq(Register dst, Register src1, Register src2);
2185
2186 void bzhiq(Register dst, Register src1, Register src2);
2187 void pdep(Register dst, Register src1, Register src2);
2188 void pext(Register dst, Register src1, Register src2);
2189
2190
2191 //====================VECTOR ARITHMETIC=====================================
2192 // Add Packed Floating-Point Values
2193 void addpd(XMMRegister dst, XMMRegister src);
2194 void addpd(XMMRegister dst, Address src);
2195 void addps(XMMRegister dst, XMMRegister src);
2196 void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2197 void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2198 void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2199 void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2200
2201 // Subtract Packed Floating-Point Values
2202 void subpd(XMMRegister dst, XMMRegister src);
2203 void subps(XMMRegister dst, XMMRegister src);
2204 void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2205 void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2206 void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2207 void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2208
2209 // Multiply Packed Floating-Point Values
2210 void mulpd(XMMRegister dst, XMMRegister src);
2211 void mulpd(XMMRegister dst, Address src);
2212 void mulps(XMMRegister dst, XMMRegister src);
2213 void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2214 void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2215 void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2216 void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2217
2218 void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2219 void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2220 void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2221 void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2222
2223 // Divide Packed Floating-Point Values
2224 void divpd(XMMRegister dst, XMMRegister src);
2225 void divps(XMMRegister dst, XMMRegister src);
2226 void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2227 void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2228 void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2229 void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2230
2231 // Sqrt Packed Floating-Point Values
2232 void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
2233 void vsqrtpd(XMMRegister dst, Address src, int vector_len);
2234 void vsqrtps(XMMRegister dst, XMMRegister src, int vector_len);
2235 void vsqrtps(XMMRegister dst, Address src, int vector_len);
2236
2237 // Round Packed Double-Precision Values.
2238 void vroundpd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
2239 void vroundpd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
2240 void vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
2241 void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
2242
2243 // Bitwise Logical AND of Packed Floating-Point Values
2244 void andpd(XMMRegister dst, XMMRegister src);
2245 void andps(XMMRegister dst, XMMRegister src);
2246 void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2247 void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2248 void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2249 void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2250
2251 void unpckhpd(XMMRegister dst, XMMRegister src);
2252 void unpcklpd(XMMRegister dst, XMMRegister src);
2253
2254 // Bitwise Logical XOR of Packed Floating-Point Values
2255 void xorpd(XMMRegister dst, XMMRegister src);
2256 void xorps(XMMRegister dst, XMMRegister src);
2257 void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2258 void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2259 void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2260 void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2261
2262 // Add horizontal packed integers
2263 void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2264 void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2265 void phaddw(XMMRegister dst, XMMRegister src);
2266 void phaddd(XMMRegister dst, XMMRegister src);
2267
2268 // Add packed integers
2269 void paddb(XMMRegister dst, XMMRegister src);
2270 void paddw(XMMRegister dst, XMMRegister src);
2271 void paddd(XMMRegister dst, XMMRegister src);
2272 void paddd(XMMRegister dst, Address src);
2273 void paddq(XMMRegister dst, XMMRegister src);
2274 void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2275 void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2276 void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2277 void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2278 void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2279 void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2280 void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2281 void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2282
2283 // Leaf level assembler routines for masked operations.
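// Masking semantics, as a sketch: with merge == true, destination elements whose
// mask bit is clear are preserved; with merge == false they are zeroed (the EVEX
// {z} form). Illustrative call, register choices assumed:
//   __ evpaddd(xmm0, k1, xmm1, xmm2, /* merge */ false, Assembler::AVX_512bit);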
2284 void evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2285 void evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2286 void evpaddw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2287 void evpaddw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2288 void evpaddd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2289 void evpaddd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2290 void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2291 void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2292 void evaddps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2293 void evaddps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2294 void evaddpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2295 void evaddpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2296 void evpsubb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2297 void evpsubb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2298 void evpsubw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2299 void evpsubw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2300 void evpsubd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2301 void evpsubd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2302 void evpsubq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2303 void evpsubq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2304 void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2305 void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2306 void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2307 void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2308 void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2309 void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2310 void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2311 void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2312 void evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2313 void evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2314 void evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2315 void evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2316 void evmulpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2317 void evmulpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2318 void evdivps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2319 void evdivps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2320 void evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2321 void evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2322 void evpabsb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2323 void evpabsb(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2324 void evpabsw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2325 void evpabsw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2326 void evpabsd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2327 void evpabsd(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2328 void evpabsq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2329 void evpabsq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2330 void evpfma213ps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2331 void evpfma213ps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2332 void evpfma213pd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2333 void evpfma213pd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2334 void evpermb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2335 void evpermb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2336 void evpermw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2337 void evpermw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2338 void evpermd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2339 void evpermd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2340 void evpermq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2341 void evpermq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2342 void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2343 void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2344 void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2345 void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2346 void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2347 void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2348 void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2349 void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2350 void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2351 void evsqrtps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2352 void evsqrtps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2353 void evsqrtpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2354 void evsqrtpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2355
2356 void evpsllw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2357 void evpslld(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2358 void evpsllq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2359 void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2360 void evpsrld(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2361 void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2362 void evpsraw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2363 void evpsrad(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2364 void evpsraq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2365
2366 void evpsllvw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2367 void evpsllvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2368 void evpsllvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2369 void evpsrlvw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2370 void evpsrlvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2371 void evpsrlvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2372 void evpsravw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2373 void evpsravd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2374 void evpsravq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2375 void evpmaxsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2376 void evpmaxsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2377 void evpmaxsd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2378 void evpmaxsq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2379 void evpminsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2380 void evpminsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2381 void evpminsd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2382 void evpminsq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2383 void evpmaxsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2384 void evpmaxsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2385 void evpmaxsd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2386 void evpmaxsq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2387 void evpminsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2388 void evpminsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2389 void evpminsd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2390 void evpminsq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2391 void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2392 void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2393 void evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2394 void evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2395 void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2396 void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2397 void evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2398 void evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2399 void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2400 void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2401 void evpxorq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2402 void evpxorq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2403
2404 void evprold(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2405 void evprolq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2406 void evprolvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2407 void evprolvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2408 void evprord(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2409 void evprorq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2410 void evprorvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2411 void evprorvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2412
2413 // Sub packed integers
2414 void psubb(XMMRegister dst, XMMRegister src);
2415 void psubw(XMMRegister dst, XMMRegister src);
2416 void psubd(XMMRegister dst, XMMRegister src);
2417 void psubq(XMMRegister dst, XMMRegister src);
2418 void vpsubusb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2419 void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2420 void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2421 void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2422 void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2423 void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2424 void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2425 void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2426 void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2427
2428 // Multiply packed integers (only shorts and ints)
2429 void pmullw(XMMRegister dst, XMMRegister src);
2430 void pmulld(XMMRegister dst, XMMRegister src);
2431 void pmuludq(XMMRegister dst, XMMRegister src);
2432 void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2433 void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2434 void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2435 void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2436 void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2437 void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2438 void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2439 void vpmulhuw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2440
2441 // Minimum of packed integers
2442 void pminsb(XMMRegister dst, XMMRegister src);
2443 void vpminsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2444 void pminsw(XMMRegister dst, XMMRegister src);
2445 void vpminsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2446 void pminsd(XMMRegister dst, XMMRegister src);
2447 void vpminsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2448 void vpminsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2449 void minps(XMMRegister dst, XMMRegister src);
2450 void vminps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2451 void minpd(XMMRegister dst, XMMRegister src);
2452 void vminpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2453
2454 // Maximum of packed integers
2455 void pmaxsb(XMMRegister dst, XMMRegister src);
2456 void vpmaxsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2457 void pmaxsw(XMMRegister dst, XMMRegister src);
2458 void vpmaxsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2459 void pmaxsd(XMMRegister dst, XMMRegister src);
2460 void vpmaxsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2461 void vpmaxsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2462 void maxps(XMMRegister dst, XMMRegister src);
2463 void vmaxps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2464 void maxpd(XMMRegister dst, XMMRegister src);
2465 void vmaxpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2466
2467 // Shift left packed integers
2468 void psllw(XMMRegister dst, int shift);
2469 void pslld(XMMRegister dst, int shift);
2470 void psllq(XMMRegister dst, int shift);
2471 void psllw(XMMRegister dst, XMMRegister shift);
2472 void pslld(XMMRegister dst, XMMRegister shift);
2473 void psllq(XMMRegister dst, XMMRegister shift);
2474 void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2475 void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2476 void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2477 void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2478 void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2479 void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2480 void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2481
2482 // Logical shift right packed integers
2483 void psrlw(XMMRegister dst, int shift);
2484 void psrld(XMMRegister dst, int shift);
2485 void psrlq(XMMRegister dst, int shift);
2486 void psrlw(XMMRegister dst, XMMRegister shift);
2487 void psrld(XMMRegister dst, XMMRegister shift);
2488 void psrlq(XMMRegister dst, XMMRegister shift);
2489 void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2490 void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2491 void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2492 void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2493 void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2494 void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2495 void vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2496 void evpsrlvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2497 void evpsllvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2498
2499 // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
2500 void psraw(XMMRegister dst, int shift);
2501 void psrad(XMMRegister dst, int shift);
2502 void psraw(XMMRegister dst, XMMRegister shift);
2503 void psrad(XMMRegister dst, XMMRegister shift);
2504 void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2505 void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2506 void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2507 void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2508 void evpsravw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2509 void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2510 void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2511
2512 // Variable shift left packed integers
2513 void vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2514 void vpsllvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2515
2516 // Variable shift right packed integers
2517 void vpsrlvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2518 void vpsrlvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2519
2520 // Variable shift right arithmetic packed integers
2521 void vpsravd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2522 void evpsravq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2523
2524 void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2525 void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2526
2527 // And packed integers
2528 void pand(XMMRegister dst, XMMRegister src);
2529 void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2530 void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2531 void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2532
2533 // Andn packed integers
2534 void pandn(XMMRegister dst, XMMRegister src);
2535 void vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2536
2537 // Or packed integers
2538 void por(XMMRegister dst, XMMRegister src);
2539 void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2540 void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2541 void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2542
2543 // Xor packed integers
2544 void pxor(XMMRegister dst, XMMRegister src);
2545 void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2546 void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2547 void vpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2548 void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2549 void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2550
2551 // Ternary logic instruction.
2552 void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
2553 void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
2554 void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
2555
2556 // Vector Rotate Left/Right instruction.
2557 void evprolvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2558 void evprolvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2559 void evprorvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2560 void evprorvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2561 void evprold(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2562 void evprolq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2563 void evprord(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2564 void evprorq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2565
2566 // vinserti forms
2567 void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2568 void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
2569 void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2570 void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
2571 void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2572
2573 // vinsertf forms
2574 void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2575 void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
2576 void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2577 void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
2578 void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2579 void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
2580
2581 // vextracti forms
2582 void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
2583 void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
2584 void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
2585 void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
2586 void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
2587 void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
2588 void vextracti64x4(Address dst, XMMRegister src, uint8_t imm8);
2589
2590 // vextractf forms
2591 void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
2592 void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
2593 void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
2594 void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
2595 void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
2596 void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
2597 void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
2598
2599 // xmm/mem sourced byte/word/dword/qword replicate
2600 void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
2601 void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
2602 void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
2603 void vpbroadcastw(XMMRegister dst, Address src, int vector_len);
2604 void vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
2605 void vpbroadcastd(XMMRegister dst, Address src, int vector_len);
2606 void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
2607 void vpbroadcastq(XMMRegister dst, Address src, int vector_len);
2608
2609 void evbroadcasti32x4(XMMRegister dst, Address src, int vector_len);
2610 void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
2611 void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
2612
2613 // scalar single/double/128-bit precision replicate
2614 void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
2615 void vbroadcastss(XMMRegister dst, Address src, int vector_len);
2616 void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
2617 void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
2618 void vbroadcastf128(XMMRegister dst, Address src, int vector_len);
2619
2620 // gpr sourced byte/word/dword/qword replicate
2621 void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
2622 void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
2623 void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
2624 void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
2625
2626 // Gather AVX2 and AVX3
2627 void vpgatherdd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2628 void vpgatherdq(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2629 void vgatherdpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2630 void vgatherdps(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2631 void evpgatherdd(XMMRegister dst, KRegister mask, Address src, int vector_len);
2632 void evpgatherdq(XMMRegister dst, KRegister mask, Address src, int vector_len);
2633 void evgatherdpd(XMMRegister dst, KRegister mask, Address src, int vector_len);
2634 void evgatherdps(XMMRegister dst, KRegister mask, Address src, int vector_len);
2635
2636 // Scatter AVX3 only
2637 void evpscatterdd(Address dst, KRegister mask, XMMRegister src, int vector_len);
2638 void evpscatterdq(Address dst, KRegister mask, XMMRegister src, int vector_len);
2639 void evscatterdps(Address dst, KRegister mask, XMMRegister src, int vector_len);
2640 void evscatterdpd(Address dst, KRegister mask, XMMRegister src, int vector_len);
2641
2642 // Carry-Less Multiplication Quadword
2643 void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
2644 void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
2645 void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len);
2646 // AVX instruction which is used to clear upper 128 bits of YMM registers and
2647 // to avoid transition penalties between AVX and SSE states. There is no
2648 // penalty if legacy SSE instructions are encoded using VEX prefix because
2649 // they always clear upper 128 bits. It should be used before calling
2650 // runtime code and native libraries.
2651 void vzeroupper();
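// Usage sketch (illustrative; `__` is the usual MacroAssembler shorthand and
// `entry` stands for some runtime entry point):
//   __ vzeroupper();                 // clear upper YMM state before leaving AVX code
//   __ call(RuntimeAddress(entry));  // now safe to enter SSE/native code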
2652
2653 // Vector double compares
2654 void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
2655 void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2656 ComparisonPredicateFP comparison, int vector_len);
2657
2658 // Vector float compares
2659 void vcmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int comparison, int vector_len);
2660 void evcmpps(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2661 ComparisonPredicateFP comparison, int vector_len);
2662
2663 // Vector integer compares
2664 void vpcmpgtd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2665 void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2666 int comparison, bool is_signed, int vector_len);
2667 void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
2668 int comparison, bool is_signed, int vector_len);
2669
2670 // Vector long compares
2671 void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2672 int comparison, bool is_signed, int vector_len);
2673 void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
2674 int comparison, bool is_signed, int vector_len);
2675
2676 // Vector byte compares
2677 void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2678 int comparison, bool is_signed, int vector_len);
2679 void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
2680 int comparison, bool is_signed, int vector_len);
2681
2682 // Vector short compares
2683 void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
2684 int comparison, bool is_signed, int vector_len);
2685 void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
2686 int comparison, bool is_signed, int vector_len);
2687
2688 void evpmovb2m(KRegister dst, XMMRegister src, int vector_len);
2689 void evpmovw2m(KRegister dst, XMMRegister src, int vector_len);
2690 void evpmovd2m(KRegister dst, XMMRegister src, int vector_len);
2691 void evpmovq2m(KRegister dst, XMMRegister src, int vector_len);
2692 void evpmovm2b(XMMRegister dst, KRegister src, int vector_len);
2693 void evpmovm2w(XMMRegister dst, KRegister src, int vector_len);
2694 void evpmovm2d(XMMRegister dst, KRegister src, int vector_len);
2695 void evpmovm2q(XMMRegister dst, KRegister src, int vector_len);
2696
2697 // Vector blends
2698 void blendvps(XMMRegister dst, XMMRegister src);
2699 void blendvpd(XMMRegister dst, XMMRegister src);
2700 void pblendvb(XMMRegister dst, XMMRegister src);
2701 void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
2702 void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
2703 void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
2704 void vpblendvb(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
2705 void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
2706 void evblendmpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2707 void evblendmps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2708 void evpblendmb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2709 void evpblendmw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2710 void evpblendmd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2711 void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2712 protected:
2713 // The following instructions require 16-byte address alignment in SSE mode.
2714 // They should be called only from corresponding MacroAssembler instructions.
2715 void andpd(XMMRegister dst, Address src);
2716 void andps(XMMRegister dst, Address src);
2717 void xorpd(XMMRegister dst, Address src);
2718 void xorps(XMMRegister dst, Address src);
2719
2720 };
2721
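As a standalone illustration of the AVX/SSE transition penalty that vzeroupper() avoids (see the comment above its declaration), here is a minimal sketch using Intel intrinsics rather than HotSpot's assembler; avx_then_native_call and legacy_sse_kernel are hypothetical names for illustration, not part of this header.

#include <immintrin.h>

// Hypothetical routine compiled with legacy (non-VEX) SSE encodings,
// standing in for runtime code or a native library.
extern "C" void legacy_sse_kernel(float* data, int n);

void avx_then_native_call(float* data, int n) {
  __m256 acc = _mm256_setzero_ps();
  for (int i = 0; i + 8 <= n; i += 8) {
    acc = _mm256_add_ps(acc, _mm256_loadu_ps(data + i));
  }
  _mm256_storeu_ps(data, acc);

  // Clear the upper 128 bits of every YMM register before entering
  // legacy SSE code; this is what an emitted vzeroupper() achieves
  // before calls into runtime code and native libraries.
  _mm256_zeroupper();

  legacy_sse_kernel(data, n);
}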
2722 // Intel x86/AMD64 assembler attributes: the fields enclosed here guide encoding-level decisions.
2723 // The specific set functions are for specialized use; otherwise the defaults, or whatever was
2724 // supplied at object construction, are applied. (A usage sketch follows this class.)
2725 class InstructionAttr {
2726 public:
2727 InstructionAttr(
2728 int vector_len, // Vector length to be applied in the encoding, for both AVX and EVEX
2729 bool rex_vex_w, // Data width: false for 32 bits or less, true for 64 bits or specially defined widths
2730 bool legacy_mode, // If true, encode with AVX or earlier encodings; otherwise EVEX may be chosen
2731 bool no_reg_mask, // When true, k0 is used when EVEX encoding is chosen; otherwise embedded_opmask_register_specifier is used
2732 bool uses_vl) // This instruction may have legacy constraints based on vector length for EVEX
2733 :
2734 _rex_vex_w(rex_vex_w),
2735 _legacy_mode(legacy_mode || UseAVX < 3),
2736 _no_reg_mask(no_reg_mask),
2737 _uses_vl(uses_vl),
2738 _rex_vex_w_reverted(false),
2739 _is_evex_instruction(false),
2740 _is_clear_context(true),
2741 _is_extended_context(false),
2742 _avx_vector_len(vector_len),
2743 _tuple_type(Assembler::EVEX_ETUP),
2744 _input_size_in_bits(Assembler::EVEX_NObit),
2745 _evex_encoding(0),
2746 _embedded_opmask_register_specifier(0), // hard code k0
2747 _current_assembler(NULL) { }
2748
2749 ~InstructionAttr() {
2750 if (_current_assembler != NULL) {
2751 _current_assembler->clear_attributes();
2752 }
2753 _current_assembler = NULL;
2754 }
2755
2756 private:
2757 bool _rex_vex_w;
2758 bool _legacy_mode;
2759 bool _no_reg_mask;
2760 bool _uses_vl;
2761 bool _rex_vex_w_reverted;
2762 bool _is_evex_instruction;
2763 bool _is_clear_context;
2764 bool _is_extended_context;
2765 int _avx_vector_len;
2766 int _tuple_type;
2767 int _input_size_in_bits;
2768 int _evex_encoding;
2769 int _embedded_opmask_register_specifier;
2770
2771 Assembler *_current_assembler;
2772
2773 public:
2774 // query functions for field accessors
2775 bool is_rex_vex_w(void) const { return _rex_vex_w; }
2776 bool is_legacy_mode(void) const { return _legacy_mode; }
2777 bool is_no_reg_mask(void) const { return _no_reg_mask; }
2778 bool uses_vl(void) const { return _uses_vl; }
2779 bool is_rex_vex_w_reverted(void) const { return _rex_vex_w_reverted; }
2780 bool is_evex_instruction(void) const { return _is_evex_instruction; }
2781 bool is_clear_context(void) const { return _is_clear_context; }
2782 bool is_extended_context(void) const { return _is_extended_context; }
2783 int get_vector_len(void) const { return _avx_vector_len; }
2784 int get_tuple_type(void) const { return _tuple_type; }
2785 int get_input_size(void) const { return _input_size_in_bits; }
2786 int get_evex_encoding(void) const { return _evex_encoding; }
2787 int get_embedded_opmask_register_specifier(void) const { return _embedded_opmask_register_specifier; }
2788
2789 // Set the vector len manually
2790 void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
2791
2792 // Mark rex_vex_w as reverted for AVX encoding
2793 void set_rex_vex_w_reverted(void) { _rex_vex_w_reverted = true; }
2794
2795 // Set rex_vex_w based on state
2796 void set_rex_vex_w(bool state) { _rex_vex_w = state; }
2797
2798 // Set the instruction to be encoded in AVX mode
2799 void set_is_legacy_mode(void) { _legacy_mode = true; }
2800
2801 // Set the current instruction to be encoded as an EVEX instruction
2802 void set_is_evex_instruction(void) { _is_evex_instruction = true; }
2803
2804 // Internal encoding data used in compressed immediate offset programming
2805 void set_evex_encoding(int value) { _evex_encoding = value; }
2806
2807 // When the EVEX.Z field is set (true), destination elements not selected by the opmask are cleared.
2808 // This method unsets it so that merge semantics are used instead.
2809 void reset_is_clear_context(void) { _is_clear_context = false; }
2810
2811 // Map back to the current assembler so that we can manage object-level association
2812 void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }
2813
2814 // Address modifiers used for compressed displacement (disp8*N) calculation; a worked example follows at the end of this file
2815 void set_address_attributes(int tuple_type, int input_size_in_bits);
2816
2817 // Set embedded opmask register specifier.
2818 void set_embedded_opmask_register_specifier(KRegister mask) {
2819 _embedded_opmask_register_specifier = mask->encoding() & 0x7;
2820 }
2821
2822 };
2823
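To make the intended use of these attributes concrete, the following is a hedged sketch of an instruction emitter, loosely modeled on the emitters in assembler_x86.cpp rather than copied from them; evpadd_like is a hypothetical instruction name, and the trailing comment stands in for the actual opcode emission.

// Hypothetical EVEX-encoded, opmask-predicated emitter; only the
// constructor arguments and setters declared above are used.
void Assembler::evpadd_like(XMMRegister dst, KRegister mask, XMMRegister nds,
                            XMMRegister src, bool merge, int vector_len) {
  InstructionAttr attributes(vector_len, /* rex_vex_w */ false,
                             /* legacy_mode */ false, /* no_reg_mask */ false,
                             /* uses_vl */ true);
  attributes.set_is_evex_instruction();
  // Encode the k-register predicate into the EVEX.aaa field.
  attributes.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    // Drop EVEX.Z so destination elements not selected by the opmask
    // are merged rather than zeroed.
    attributes.reset_is_clear_context();
  }
  // ... pass &attributes to the shared prefix-and-encode helpers and
  // emit the opcode and ModRM bytes here.
}

Because the destructor calls clear_attributes() on the recorded assembler, a stack-allocated InstructionAttr resets any per-instruction state automatically when the emitter returns, once set_current_assembler() has been called.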
2824 #endif // CPU_X86_ASSEMBLER_X86_HPP
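On the compressed-displacement calculation that set_address_attributes() feeds: under EVEX, an 8-bit memory displacement is scaled by a factor N derived from the tuple type, vector length, and input size. The following self-contained sketch shows the rule for the full-vector tuple case only; the function name is illustrative and not part of this header.

#include <cstdio>

// For the EVEX "Full Vector" tuple type, the compression factor N is
// the vector width in bytes (16/32/64 for 128/256/512-bit vectors).
// A displacement can use the compressed disp8 form only when it is an
// exact multiple of N and the scaled value fits in a signed byte.
static bool fits_disp8_full_vector(int disp, int vector_len_bytes) {
  if (disp % vector_len_bytes != 0) return false;
  int scaled = disp / vector_len_bytes;
  return scaled >= -128 && scaled <= 127;
}

int main() {
  std::printf("%d\n", fits_disp8_full_vector(128, 64)); // 1: encoded as disp8 = 2
  std::printf("%d\n", fits_disp8_full_vector(130, 64)); // 0: needs a full disp32
  return 0;
}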