File: | jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp |
Warning: | line 1797, column 5: Value stored to 'index' is never read |
1 | /* |
2 | * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved. |
3 | * |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
5 | * |
6 | * This code is free software; you can redistribute it and/or modify it |
7 | * under the terms of the GNU General Public License version 2 only, as |
8 | * published by the Free Software Foundation. |
9 | * |
10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
13 | * version 2 for more details (a copy is included in the LICENSE file that |
14 | * accompanied this code). |
15 | * |
16 | * You should have received a copy of the GNU General Public License version |
17 | * 2 along with this work; if not, write to the Free Software Foundation, |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
19 | * |
20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
21 | * or visit www.oracle.com if you need additional information or have any |
22 | * questions. |
23 | * |
24 | */ |
25 | |
26 | #include "precompiled.hpp" |
27 | #include "asm/assembler.hpp" |
28 | #include "asm/assembler.inline.hpp" |
29 | #include "runtime/stubRoutines.hpp" |
30 | #include "macroAssembler_x86.hpp" |
31 | |
32 | #ifdef _LP64 |
33 | |
34 | void MacroAssembler::roundEnc(XMMRegister key, int rnum) { |
35 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
36 | vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
37 | } |
38 | } |
39 | |
40 | void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) { |
41 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
42 | vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
43 | } |
44 | } |
45 | |
46 | void MacroAssembler::roundDec(XMMRegister key, int rnum) { |
47 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
48 | vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
49 | } |
50 | } |
51 | |
52 | void MacroAssembler::lastroundDec(XMMRegister key, int rnum) { |
53 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
54 | vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
55 | } |
56 | } |
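| // Each helper above applies a single AES round (encrypt/decrypt, middle/last) to |
| // registers zmm0 through zmm<rnum> in parallel; with 512-bit vectors each register |
| // holds four 16-byte blocks, so a call with rnum = 7 advances 32 blocks by one round. |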
57 | |
58 | // Load key and shuffle operation |
59 | void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
60 | movdqu(xmmdst, Address(key, offset)); |
61 | if (xmm_shuf_mask != NULL) { |
62 | pshufb(xmmdst, xmm_shuf_mask); |
63 | } else { |
64 | pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
65 | } |
66 | evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); |
67 | } |
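| // ev_load_key leaves one 128-bit round key, shuffled with the key shuffle mask, broadcast |
| // into all four 128-bit lanes of the destination ZMM register (evshufi64x2 with immediate |
| // 0x0 replicates lane 0 of the source into every destination lane). |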
68 | |
69 | // AES-ECB Encrypt Operation |
70 | void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
71 | |
72 | const Register pos = rax; |
73 | const Register rounds = r12; |
74 | |
75 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
76 | push(r13); |
77 | push(r12); |
78 | |
79 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
80 | // context for the registers used, where all instructions below are using 128-bit mode. |
81 | // On EVEX without VL and BW, these instructions will all be AVX. |
82 | if (VM_Version::supports_avx512vlbw()) { |
83 | movl(rax, 0xffff); |
84 | kmovql(k1, rax); |
85 | } |
86 | push(len); // Save |
87 | push(rbx); |
88 | |
89 | vzeroupper(); |
90 | |
91 | xorptr(pos, pos); |
92 | |
93 | // Calculate the number of rounds based on key length (128, 192, 256): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
94 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
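| // The expanded key is an int[] of 4 * (rounds + 1) words, so a length of 44 ints |
| // corresponds to 10 rounds (AES-128), 52 to 12 rounds (AES-192) and 60 to 14 rounds (AES-256). |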
95 | |
96 | // Load Key shuf mask |
97 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
98 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
99 | |
100 | // Load and shuffle key based on number of rounds |
101 | ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask); |
102 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
103 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
104 | ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask); |
105 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
106 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
107 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
108 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
109 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
110 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
111 | ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask); |
112 | cmpl(rounds, 52); |
113 | jcc(Assembler::greaterEqual, KEY_192); |
114 | jmp(Loop_start); |
115 | |
116 | bind(KEY_192); |
117 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
118 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
119 | cmpl(rounds, 60); |
120 | jcc(Assembler::equal, KEY_256); |
121 | jmp(Loop_start); |
122 | |
123 | bind(KEY_256); |
124 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
125 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
126 | |
127 | bind(Loop_start); |
128 | movq(rbx, len); |
129 | // Divide length by 16 to convert it to number of blocks |
130 | shrq(len, 4); |
131 | shlq(rbx, 60); |
132 | jcc(Assembler::equal, NO_PARTS); |
133 | addq(len, 1); |
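| // shlq(rbx, 60) keeps only the low 4 bits of the original length, so the zero flag is set |
| // exactly when the length is a multiple of 16; otherwise the block count in len is rounded |
| // up by one above to cover the partial block. |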
134 | // Check if number of blocks is greater than or equal to 32 |
135 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
136 | // If not, 16 bytes are processed (code marked by REMAINDER label) |
137 | bind(NO_PARTS); |
138 | movq(rbx, len); |
139 | shrq(len, 5); |
140 | jcc(Assembler::equal, REMAINDER); |
141 | movl(r13, len); |
142 | // Compute number of blocks that will be processed 512 bytes at a time |
143 | // Subtract this from the total number of blocks which will then be processed by REMAINDER loop |
144 | shlq(r13, 5); |
145 | subq(rbx, r13); |
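| // len now holds the number of 512-byte (32-block) iterations for LOOP, while rbx holds |
| // the leftover block count (< 32) that the REMAINDER loop below will process. |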
146 | //Begin processing 512 bytes |
147 | bind(LOOP); |
148 | // Move 64 bytes of PT data into each zmm register; as a result, 512 bytes of PT are loaded into zmm0-7 |
149 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
150 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
151 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
152 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
153 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
154 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
155 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
156 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
157 | // Xor with the first round key |
158 | evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit); |
159 | evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit); |
160 | evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit); |
161 | evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit); |
162 | evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit); |
163 | evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit); |
164 | evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit); |
165 | evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit); |
166 | // 9 Aes encode round operations |
167 | roundEnc(xmm9, 7); |
168 | roundEnc(xmm10, 7); |
169 | roundEnc(xmm23, 7); |
170 | roundEnc(xmm12, 7); |
171 | roundEnc(xmm13, 7); |
172 | roundEnc(xmm14, 7); |
173 | roundEnc(xmm15, 7); |
174 | roundEnc(xmm16, 7); |
175 | roundEnc(xmm17, 7); |
176 | cmpl(rounds, 52); |
177 | jcc(Assembler::aboveEqual, AES192); |
178 | // Aesenclast round operation for keysize = 128 |
179 | lastroundEnc(xmm24, 7); |
180 | jmp(END_LOOP); |
181 | //Additional 2 rounds of Aesenc operation for keysize = 192 |
182 | bind(AES192); |
183 | roundEnc(xmm24, 7); |
184 | roundEnc(xmm19, 7); |
185 | cmpl(rounds, 60); |
186 | jcc(Assembler::aboveEqual, AES256); |
187 | // Aesenclast round for keysize = 192 |
188 | lastroundEnc(xmm20, 7); |
189 | jmp(END_LOOP); |
190 | // 2 rounds of Aesenc operation and Aesenclast for keysize = 256 |
191 | bind(AES256); |
192 | roundEnc(xmm20, 7); |
193 | roundEnc(xmm21, 7); |
194 | lastroundEnc(xmm22, 7); |
195 | |
196 | bind(END_LOOP); |
197 | // Move 512 bytes of CT to destination |
198 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
199 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
200 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
201 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
203 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
204 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
205 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
206 | |
207 | addq(pos, 512); |
208 | decq(len); |
209 | jcc(Assembler::notEqual, LOOP); |
210 | |
211 | bind(REMAINDER); |
212 | vzeroupper(); |
213 | cmpq(rbx, 0); |
214 | jcc(Assembler::equal, END); |
215 | // Process 16 bytes at a time |
216 | bind(LOOP2); |
217 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
218 | vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit); |
219 | // xmm2 contains shuffled key for Aesenclast operation. |
220 | vmovdqu(xmm2, xmm24); |
221 | |
222 | vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
223 | vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
224 | vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit); |
225 | vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
226 | vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
227 | vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
228 | vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
229 | vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
230 | vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
231 | |
232 | cmpl(rounds, 52); |
233 | jcc(Assembler::below, LAST2); |
234 | vmovdqu(xmm2, xmm20); |
235 | vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit); |
236 | vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
237 | cmpl(rounds, 60); |
238 | jcc(Assembler::below, LAST2); |
239 | vmovdqu(xmm2, xmm22); |
240 | vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
241 | vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
242 | |
243 | bind(LAST2); |
244 | // Aesenclast round |
245 | vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
246 | // Write 16 bytes of CT to destination |
247 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
248 | addq(pos, 16); |
249 | decq(rbx); |
250 | jcc(Assembler::notEqual, LOOP2); |
251 | |
252 | bind(END); |
253 | // Zero out the round keys |
254 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
255 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
256 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
257 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
258 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
259 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
260 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
261 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
262 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
263 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
264 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
265 | cmpl(rounds, 44); |
266 | jcc(Assembler::belowEqual, EXIT); |
267 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
268 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
269 | cmpl(rounds, 52); |
270 | jcc(Assembler::belowEqual, EXIT); |
271 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
272 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
273 | bind(EXIT); |
274 | pop(rbx); |
275 | pop(rax); // return length |
276 | pop(r12); |
277 | pop(r13); |
278 | } |
279 | |
280 | // AES-ECB Decrypt Operation |
281 | void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
282 | |
283 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
284 | const Register pos = rax; |
285 | const Register rounds = r12; |
286 | push(r13); |
287 | push(r12); |
288 | |
289 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
290 | // context for the registers used, where all instructions below are using 128-bit mode. |
291 | // On EVEX without VL and BW, these instructions will all be AVX. |
292 | if (VM_Version::supports_avx512vlbw()) { |
293 | movl(rax, 0xffff); |
294 | kmovql(k1, rax); |
295 | } |
296 | |
297 | push(len); // Save |
298 | push(rbx); |
299 | |
300 | vzeroupper(); |
301 | |
302 | xorptr(pos, pos); |
303 | // Calculate the number of rounds based on key length (128, 192, 256): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
304 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
305 | |
306 | // Load Key shuf mask |
307 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
308 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
309 | |
310 | // Load and shuffle the round keys. The Java expanded key ordering is rotated one position for decryption, |
311 | // so the first round key is loaded from offset 1*16 here and the last round key from 0*16. |
312 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
313 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
314 | ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask); |
315 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
316 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
317 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
318 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
319 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
320 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
321 | ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask); |
322 | ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask); |
323 | cmpl(rounds, 52); |
324 | jcc(Assembler::greaterEqual, KEY_192); |
325 | jmp(Loop_start); |
326 | |
327 | bind(KEY_192); |
328 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
329 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
330 | cmpl(rounds, 60); |
331 | jcc(Assembler::equal, KEY_256); |
332 | jmp(Loop_start); |
333 | |
334 | bind(KEY_256); |
335 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
336 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
337 | bind(Loop_start); |
338 | movq(rbx, len); |
339 | // Convert input length to number of blocks |
340 | shrq(len, 4); |
341 | shlq(rbx, 60); |
342 | jcc(Assembler::equal, NO_PARTS); |
343 | addq(len, 1); |
344 | // Check if the number of blocks is greater than or equal to 32 |
345 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
346 | // If not, 16 bytes are processed at a time (code marked by label REMAINDER) |
347 | bind(NO_PARTS); |
348 | movq(rbx, len); |
349 | shrq(len, 5); |
350 | jcc(Assembler::equal, REMAINDER); |
351 | movl(r13, len); |
352 | // Compute number of blocks that will be processed as 512 bytes at a time |
353 | // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop. |
354 | shlq(r13, 5); |
355 | subq(rbx, r13); |
356 | |
357 | bind(LOOP); |
358 | // Move 64 bytes of CT data into each zmm register; as a result, 512 bytes of CT are loaded into zmm0-7 |
359 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
360 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
361 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
362 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
363 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
364 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
365 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
366 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
367 | // Xor with the first round key |
368 | evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit); |
369 | evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit); |
370 | evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit); |
371 | evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit); |
372 | evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit); |
373 | evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit); |
374 | evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit); |
375 | evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit); |
376 | // 9 rounds of Aesdec |
377 | roundDec(xmm10, 7); |
378 | roundDec(xmm11, 7); |
379 | roundDec(xmm12, 7); |
380 | roundDec(xmm13, 7); |
381 | roundDec(xmm14, 7); |
382 | roundDec(xmm15, 7); |
383 | roundDec(xmm16, 7); |
384 | roundDec(xmm17, 7); |
385 | roundDec(xmm18, 7); |
386 | cmpl(rounds, 52); |
387 | jcc(Assembler::aboveEqual, AES192); |
388 | // Aesdeclast round for keysize = 128 |
389 | lastroundDec(xmm27, 7); |
390 | jmp(END_LOOP); |
391 | |
392 | bind(AES192); |
393 | // 2 Additional rounds for keysize = 192 |
394 | roundDec(xmm19, 7); |
395 | roundDec(xmm20, 7); |
396 | cmpl(rounds, 60); |
397 | jcc(Assembler::aboveEqual, AES256); |
398 | // Aesdeclast round for keysize = 192 |
399 | lastroundDec(xmm27, 7); |
400 | jmp(END_LOOP); |
401 | bind(AES256); |
402 | // 2 Additional rounds and Aesdeclast for keysize = 256 |
403 | roundDec(xmm21, 7); |
404 | roundDec(xmm22, 7); |
405 | lastroundDec(xmm27, 7); |
406 | |
407 | bind(END_LOOP); |
408 | // Write 512 bytes of PT to the destination |
409 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
410 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
411 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
412 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
413 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
414 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
415 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
416 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
417 | |
418 | addq(pos, 512); |
419 | decq(len); |
420 | jcc(Assembler::notEqual, LOOP); |
421 | |
422 | bind(REMAINDER); |
423 | vzeroupper(); |
424 | cmpq(rbx, 0); |
425 | jcc(Assembler::equal, END); |
426 | // Process 16 bytes at a time |
427 | bind(LOOP2); |
428 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
429 | vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
430 | // xmm2 contains shuffled key for Aesdeclast operation. |
431 | vmovdqu(xmm2, xmm27); |
432 | |
433 | vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
434 | vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit); |
435 | vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
436 | vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
437 | vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
438 | vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
439 | vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
440 | vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
441 | vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit); |
442 | |
443 | cmpl(rounds, 52); |
444 | jcc(Assembler::below, LAST2); |
445 | vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
446 | vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
447 | cmpl(rounds, 60); |
448 | jcc(Assembler::below, LAST2); |
449 | vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
450 | vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit); |
451 | |
452 | bind(LAST2); |
453 | // Aesdeclast round |
454 | vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
455 | // Write 16 bytes of PT to destination |
456 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
457 | addq(pos, 16); |
458 | decq(rbx); |
459 | jcc(Assembler::notEqual, LOOP2); |
460 | |
461 | bind(END); |
462 | // Zero out the round keys |
463 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
464 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
465 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
466 | evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit); |
467 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
468 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
469 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
470 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
471 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
472 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
473 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
474 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
475 | cmpl(rounds, 44); |
476 | jcc(Assembler::belowEqual, EXIT); |
477 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
478 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
479 | cmpl(rounds, 52); |
480 | jcc(Assembler::belowEqual, EXIT); |
481 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
482 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
483 | bind(EXIT); |
484 | pop(rbx); |
485 | pop(rax); // return length |
486 | pop(r12); |
487 | pop(r13); |
488 | } |
489 | |
490 | // Multiply 128 x 128 bits, using 4 pclmulqdq operations |
491 | void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data, |
492 | XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) { |
493 | movdqu(xmm15, Address(htbl, i * 16)); |
494 | vpclmulhqlqdq(tmp3, data, xmm15); // 0x01 |
495 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
496 | vpclmulldq(tmp3, data, xmm15); // 0x00 |
497 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); |
498 | vpclmulhdq(tmp3, data, xmm15); // 0x11 |
499 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); |
500 | vpclmullqhqdq(tmp3, data, xmm15); // 0x10 |
501 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
502 | } |
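| // schoolbookAAD accumulates one 128 x 128 carry-less product into three running terms: |
| // tmp0 collects the low halves (selector 0x00), tmp1 the high halves (0x11), and tmp2 the two |
| // cross terms (0x01 and 0x10); the caller folds tmp2 into tmp0/tmp1 before the reduction. |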
503 | |
504 | // Multiply two 128 bit numbers resulting in a 256 bit value |
505 | // Result of the multiplication followed by reduction stored in state |
506 | void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) { |
507 | const XMMRegister tmp1 = xmm4; |
508 | const XMMRegister tmp2 = xmm5; |
509 | const XMMRegister tmp3 = xmm6; |
510 | const XMMRegister tmp4 = xmm7; |
511 | |
512 | vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0) |
513 | vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1) |
514 | vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0) |
515 | vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1) |
516 | |
517 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0) |
518 | |
519 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
520 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
521 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result |
522 | vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication |
523 | // Follows the reduction technique mentioned in |
524 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
525 | // First phase of reduction |
526 | // |
527 | vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed left shift by 31 |
528 | vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed left shift by 30 |
529 | vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift by 25 |
530 | // xor the shifted versions |
531 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
532 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
533 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
534 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
535 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete |
536 | // |
537 | // Second phase of the reduction |
538 | // |
539 | vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit); // packed right shift by 1 |
540 | vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift by 2 |
541 | vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift by 7 |
542 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions |
543 | vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit); |
544 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
545 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); |
546 | vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state |
547 | ret(0); |
548 | } |
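| // gfmul ends with ret(0) because the code it emits is reached via call(GFMUL, ...) |
| // from the GHASH routines below rather than by falling through. |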
549 | |
550 | // This method takes the expanded subkey as input and generates the first power of the subkey H, stored at offset 1 * 16. |
551 | // This power of H is used in the reduction process for one-block GHASH |
552 | void MacroAssembler::generateHtbl_one_block(Register htbl) { |
553 | const XMMRegister t = xmm13; |
554 | |
555 | // load the original subkey hash |
556 | movdqu(t, Address(htbl, 0)); |
557 | // shuffle using long swap mask |
558 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
559 | vpshufb(t, t, xmm10, Assembler::AVX_128bit); |
560 | |
561 | // Compute H' = GFMUL(H, 2) |
562 | vpsrld(xmm3, t, 7, Assembler::AVX_128bit); |
563 | movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr())); |
564 | vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit); |
565 | movl(rax, 0xff00); |
566 | movdl(xmm4, rax); |
567 | vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit); |
568 | movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr())); |
569 | vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit); |
570 | vpsrld(xmm3, t, 31, Assembler::AVX_128bit); |
571 | vpslld(xmm4, t, 1, Assembler::AVX_128bit); |
572 | vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit); |
573 | vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2 |
574 | |
575 | //Adding p(x)<<1 to xmm5 which holds the reduction polynomial |
576 | vpxor(t, t, xmm5, Assembler::AVX_128bit); |
577 | movdqu(Address(htbl, 1 * 16), t); // H * 2 |
578 | |
579 | ret(0); |
580 | } |
581 | |
582 | // This method takes the expanded subkey as input and generates the remaining powers of the subkey H. |
583 | // These powers of H are used in the reduction process for eight-block GHASH |
584 | void MacroAssembler::generateHtbl_eight_blocks(Register htbl) { |
585 | const XMMRegister t = xmm13; |
586 | const XMMRegister tmp0 = xmm1; |
587 | Label GFMUL; |
588 | |
589 | movdqu(t, Address(htbl, 1 * 16)); |
590 | movdqu(tmp0, t); |
591 | |
592 | // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H) |
593 | call(GFMUL, relocInfo::none); |
594 | movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2 |
595 | call(GFMUL, relocInfo::none); |
596 | movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2 |
597 | call(GFMUL, relocInfo::none); |
598 | movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2 |
599 | call(GFMUL, relocInfo::none); |
600 | movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2 |
601 | call(GFMUL, relocInfo::none); |
602 | movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2 |
603 | call(GFMUL, relocInfo::none); |
604 | movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2 |
605 | call(GFMUL, relocInfo::none); |
606 | movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2 |
607 | ret(0); |
608 | |
609 | bind(GFMUL); |
610 | gfmul(tmp0, t); |
611 | } |
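| // Each call(GFMUL) above multiplies the running value in t by the constant in tmp0 in GF(2^128), |
| // producing the successive powers of H that are stored at htbl offsets 2 * 16 through 8 * 16. |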
612 | |
613 | // Multiblock and single block GHASH computation using Shift XOR reduction technique |
614 | void MacroAssembler::avx_ghash(Register input_state, Register htbl, |
615 | Register input_data, Register blocks) { |
616 | |
617 | // temporary variables to hold input data and input state |
618 | const XMMRegister data = xmm1; |
619 | const XMMRegister state = xmm0; |
620 | // temporary variables to hold intermediate results |
621 | const XMMRegister tmp0 = xmm3; |
622 | const XMMRegister tmp1 = xmm4; |
623 | const XMMRegister tmp2 = xmm5; |
624 | const XMMRegister tmp3 = xmm6; |
625 | // temporary variables to hold byte and long swap masks |
626 | const XMMRegister bswap_mask = xmm2; |
627 | const XMMRegister lswap_mask = xmm14; |
628 | |
629 | Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION, |
630 | ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH; |
631 | |
632 | testptr(blocks, blocks); |
633 | jcc(Assembler::zero, EXIT_GHASH); |
634 | |
635 | // Check if Hashtable (1*16) has been already generated |
636 | // For anything less than 8 blocks, we generate only the first power of H. |
637 | movdqu(tmp2, Address(htbl, 1 * 16)); |
638 | ptest(tmp2, tmp2); |
639 | jcc(Assembler::notZero, BEGIN_PROCESS); |
640 | call(GENERATE_HTBL_1_BLK, relocInfo::none); |
641 | |
642 | // Shuffle the input state |
643 | bind(BEGIN_PROCESS); |
644 | movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
645 | movdqu(state, Address(input_state, 0)); |
646 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
647 | |
648 | cmpl(blocks, 8); |
649 | jcc(Assembler::below, ONE_BLK_INIT); |
650 | // If we have 8 blocks or more data, then generate remaining powers of H |
651 | movdqu(tmp2, Address(htbl, 8 * 16)); |
652 | ptest(tmp2, tmp2); |
653 | jcc(Assembler::notZero, PROCESS_8_BLOCKS); |
654 | call(GENERATE_HTBL_8_BLKS, relocInfo::none); |
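| // The ptest checks above double as a lazy-initialization test: an all-zero table entry |
| // means the corresponding power of H has not been generated yet. |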
655 | |
656 | //Do 8 multiplies followed by a reduction processing 8 blocks of data at a time |
657 | //Each block = 16 bytes. |
658 | bind(PROCESS_8_BLOCKS); |
659 | subl(blocks, 8); |
660 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
661 | movdqu(data, Address(input_data, 16 * 7)); |
662 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
663 | // Load from offset 1 * 16, where the calculated powers of H start. |
664 | movdqu(xmm15, Address(htbl, 1 * 16)); |
665 | //Perform carryless multiplication of (H*2, data block #7) |
666 | vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1 |
667 | vpclmulldq(tmp0, data, xmm15);//a0 * b0 |
668 | vpclmulhdq(tmp1, data, xmm15);//a1 * b1 |
669 | vpclmullqhqdq(tmp3, data, xmm15);//a1* b0 |
670 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0) |
671 | |
672 | movdqu(data, Address(input_data, 16 * 6)); |
673 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
674 | // Perform carryless multiplication of (H^2 * 2, data block #6) |
675 | schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3); |
676 | |
677 | movdqu(data, Address(input_data, 16 * 5)); |
678 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
679 | // Perform carryless multiplication of (H^3 * 2, data block #5) |
680 | schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3); |
681 | movdqu(data, Address(input_data, 16 * 4)); |
682 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
683 | // Perform carryless multiplication of (H^4 * 2, data block #4) |
684 | schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3); |
685 | movdqu(data, Address(input_data, 16 * 3)); |
686 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
687 | // Perform carryless multiplication of (H^5 * 2, data block #3) |
688 | schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3); |
689 | movdqu(data, Address(input_data, 16 * 2)); |
690 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
691 | // Perform carryless multiplication of (H^6 * 2, data block #2) |
692 | schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3); |
693 | movdqu(data, Address(input_data, 16 * 1)); |
694 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
695 | // Perform carryless multiplication of (H^7 * 2, data block #1) |
696 | schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3); |
697 | movdqu(data, Address(input_data, 16 * 0)); |
698 | // xor data block #0 with the input state before performing carry-less multiplication |
699 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
700 | vpxor(data, data, state, Assembler::AVX_128bit); |
701 | // Perform carryless multiplication of (H^8 * 2, data block #0) |
702 | schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3); |
703 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
704 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
705 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of |
706 | vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation |
707 | |
708 | // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1 |
709 | // with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0 |
710 | // Follows the reduction technique mentioned in |
711 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
712 | bind(BLOCK8_REDUCTION); |
713 | // First Phase of the reduction |
714 | vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed left shift by 31 |
715 | vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed left shift by 30 |
716 | vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25 |
717 | // xor the shifted versions |
718 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
719 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
720 | |
721 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
722 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
723 | |
724 | vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete |
725 | // second phase of the reduction |
726 | vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed right shift by 1 |
727 | vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift by 2 |
728 | vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed right shift by 7 |
729 | // xor the shifted versions |
730 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); |
731 | vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit); |
732 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
733 | vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit); |
734 | // Final result is in state |
735 | vpxor(state, tmp0, tmp1, Assembler::AVX_128bit); |
736 | |
737 | lea(input_data, Address(input_data, 16 * 8)); |
738 | cmpl(blocks, 8); |
739 | jcc(Assembler::below, ONE_BLK_INIT); |
740 | jmp(PROCESS_8_BLOCKS); |
741 | |
742 | // Since this is a one-block operation we will only use H * 2, i.e. the first power of H |
743 | bind(ONE_BLK_INIT); |
744 | movdqu(tmp0, Address(htbl, 1 * 16)); |
745 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
746 | |
747 | //Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction. |
748 | bind(PROCESS_1_BLOCK); |
749 | cmpl(blocks, 0); |
750 | jcc(Assembler::equal, SAVE_STATE); |
751 | subl(blocks, 1); |
752 | movdqu(data, Address(input_data, 0)); |
753 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
754 | vpxor(state, state, data, Assembler::AVX_128bit); |
755 | // gfmul(H*2, state) |
756 | call(GFMUL, relocInfo::none); |
757 | addptr(input_data, 16); |
758 | jmp(PROCESS_1_BLOCK); |
759 | |
760 | bind(SAVE_STATE); |
761 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
762 | movdqu(Address(input_state, 0), state); |
763 | jmp(EXIT_GHASH); |
764 | |
765 | bind(GFMUL); |
766 | gfmul(tmp0, state); |
767 | |
768 | bind(GENERATE_HTBL_1_BLK); |
769 | generateHtbl_one_block(htbl); |
770 | |
771 | bind(GENERATE_HTBL_8_BLKS); |
772 | generateHtbl_eight_blocks(htbl); |
773 | |
774 | bind(EXIT_GHASH); |
775 | // zero out xmm registers used for Htbl storage |
776 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
777 | vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); |
778 | vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit); |
779 | vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit); |
780 | } |
781 | |
782 | // AES Counter Mode using VAES instructions |
783 | void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, |
784 | Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) { |
785 | |
786 | const Register rounds = 0; |
787 | const Register pos = r12; |
788 | |
789 | Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP, |
790 | AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16, |
791 | REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER, |
792 | AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP, |
793 | AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES, |
794 | EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR; |
795 | |
796 | cmpl(len_reg, 0); |
797 | jcc(Assembler::belowEqual, EXIT); |
798 | |
799 | movl(pos, 0); |
800 | // if the number of used encrypted counter bytes < 16, |
801 | // XOR PT with saved encrypted counter to obtain CT |
802 | bind(PRELOOP_START); |
803 | cmpl(used, 16); |
804 | jcc(Assembler::aboveEqual, EXIT_PRELOOP); |
805 | movb(rbx, Address(saved_encCounter_start, used)); |
806 | xorb(rbx, Address(src_addr, pos)); |
807 | movb(Address(dest_addr, pos), rbx); |
808 | addptr(pos, 1); |
809 | addptr(used, 1); |
810 | decrement(len_reg); |
811 | jmp(PRELOOP_START); |
812 | |
813 | bind(EXIT_PRELOOP); |
814 | movl(Address(used_addr, 0), used); |
815 | |
816 | // Calculate the number of rounds (10, 12 or 14) based on key length (128, 192, 256). |
817 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
818 | |
819 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
820 | // Move initial counter value in xmm0 |
821 | movdqu(xmm0, Address(counter, 0)); |
822 | // broadcast counter value to zmm8 |
823 | evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit); |
824 | |
825 | // load lbswap mask |
826 | evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15); |
827 | |
828 | //shuffle counter using lbswap_mask |
829 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit); |
830 | |
831 | // Pre-increment and propagate counter values to the zmm9-zmm15 registers. |
832 | // Linc0 increments zmm8 by 1 (initial value being 0); Linc4 increments the counters in zmm9-zmm15 by 4. |
833 | // The counter is incremented after each block, i.e. after 16 bytes are processed; |
834 | // each zmm register holds 4 counter values (one per 128-bit lane) |
835 | // and the counters are incremented in parallel. |
836 | vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0 |
837 | vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip) |
838 | vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
839 | vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
840 | vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
841 | vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
842 | vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
843 | vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
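| // zmm8-zmm15 now hold 32 consecutive counter values, four per register (one per 128-bit lane), |
| // covering the 32 blocks processed in each LOOP iteration; Linc32, loaded below, advances |
| // every counter by 32 after each 512-byte pass. |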
844 | |
845 | // load linc32 mask in zmm register.linc32 increments counter by 32 |
846 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32 |
847 | |
848 | // xmm31 contains the key shuffle mask. |
849 | movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
850 | // The load key function loads a 128-bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512-bit value. |
851 | // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2, as the source in this case is a ZMM register |
852 | // that holds the shuffled key value. |
853 | ev_load_key(xmm20, key, 0, xmm31); |
854 | ev_load_key(xmm21, key, 1 * 16, xmm31); |
855 | ev_load_key(xmm22, key, 2 * 16, xmm31); |
856 | ev_load_key(xmm23, key, 3 * 16, xmm31); |
857 | ev_load_key(xmm24, key, 4 * 16, xmm31); |
858 | ev_load_key(xmm25, key, 5 * 16, xmm31); |
859 | ev_load_key(xmm26, key, 6 * 16, xmm31); |
860 | ev_load_key(xmm27, key, 7 * 16, xmm31); |
861 | ev_load_key(xmm28, key, 8 * 16, xmm31); |
862 | ev_load_key(xmm29, key, 9 * 16, xmm31); |
863 | ev_load_key(xmm30, key, 10 * 16, xmm31); |
864 | |
865 | // Process 32 blocks or 512 bytes of data |
866 | bind(LOOP); |
867 | cmpl(len_reg, 512); |
868 | jcc(Assembler::less, REMAINDER); |
869 | subq(len_reg, 512); |
870 | // Shuffle the counter and XOR it with roundkey1. The result is stored in zmm0-7 |
871 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
872 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
873 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
874 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
875 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
876 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
877 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
878 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
879 | vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit); |
880 | evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit); |
881 | vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit); |
882 | evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit); |
883 | vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit); |
884 | evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit); |
885 | vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit); |
886 | evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit); |
887 | // Perform AES encode operations and put results in zmm0-zmm7. |
888 | // This is followed by incrementing counter values in zmm8-zmm15. |
889 | // Since we will be processing 32 blocks at a time, the counter is incremented by 32. |
890 | roundEnc(xmm21, 7); |
891 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
892 | roundEnc(xmm22, 7); |
893 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
894 | roundEnc(xmm23, 7); |
895 | vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit); |
896 | roundEnc(xmm24, 7); |
897 | vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit); |
898 | roundEnc(xmm25, 7); |
899 | vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit); |
900 | roundEnc(xmm26, 7); |
901 | vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit); |
902 | roundEnc(xmm27, 7); |
903 | vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit); |
904 | roundEnc(xmm28, 7); |
905 | vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit); |
906 | roundEnc(xmm29, 7); |
907 | |
908 | cmpl(rounds, 52); |
909 | jcc(Assembler::aboveEqual, AES192); |
910 | lastroundEnc(xmm30, 7); |
911 | jmp(END_LOOP); |
912 | |
913 | bind(AES192); |
914 | roundEnc(xmm30, 7); |
915 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
916 | roundEnc(xmm18, 7); |
917 | cmpl(rounds, 60); |
918 | jcc(Assembler::aboveEqual, AES256); |
919 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
920 | lastroundEnc(xmm18, 7); |
921 | jmp(END_LOOP); |
922 | |
923 | bind(AES256); |
924 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
925 | roundEnc(xmm18, 7); |
926 | ev_load_key(xmm18, key, 13 * 16, xmm31); |
927 | roundEnc(xmm18, 7); |
928 | ev_load_key(xmm18, key, 14 * 16, xmm31); |
929 | lastroundEnc(xmm18, 7); |
930 | |
931 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7 |
932 | // xor encrypted block cipher and input plaintext and store resultant ciphertext |
933 | bind(END_LOOP); |
934 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
935 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
936 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
937 | evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit); |
938 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
939 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
940 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
941 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
942 | evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
943 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
944 | evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
945 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
946 | evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
947 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
948 | evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
949 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
950 | addq(pos, 512); |
951 | jmp(LOOP); |
952 | |
953 | // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes |
954 | bind(REMAINDER); |
955 | cmpl(len_reg, 0); |
956 | jcc(Assembler::equal, END); |
957 | cmpl(len_reg, 256); |
958 | jcc(Assembler::aboveEqual, REMAINDER_16); |
959 | cmpl(len_reg, 128); |
960 | jcc(Assembler::aboveEqual, REMAINDER_8); |
961 | cmpl(len_reg, 64); |
962 | jcc(Assembler::aboveEqual, REMAINDER_4); |
963 | // At this point, we will process 16 bytes of data at a time. |
964 | // So load xmm19 with counter increment value as 1 |
965 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); |
966 | jmp(REMAINDER_LOOP); |
967 | |
968 | // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data |
969 | bind(REMAINDER_16); |
970 | subq(len_reg, 256); |
971 | // As we process 16 blocks at a time, load mask for incrementing the counter value by 16 |
972 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip) |
973 | // shuffle counter and XOR counter with roundkey1 |
974 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
975 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
976 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
977 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
978 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
979 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
980 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
981 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
982 | // Increment counter values by 16 |
983 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
984 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
985 | // AES encode rounds |
986 | roundEnc(xmm21, 3); |
987 | roundEnc(xmm22, 3); |
988 | roundEnc(xmm23, 3); |
989 | roundEnc(xmm24, 3); |
990 | roundEnc(xmm25, 3); |
991 | roundEnc(xmm26, 3); |
992 | roundEnc(xmm27, 3); |
993 | roundEnc(xmm28, 3); |
994 | roundEnc(xmm29, 3); |
995 | |
996 | cmpl(rounds, 52); |
997 | jcc(Assembler::aboveEqual, AES192_REMAINDER16); |
998 | lastroundEnc(xmm30, 3); |
999 | jmp(REMAINDER16_END_LOOP); |
1000 | |
1001 | bind(AES192_REMAINDER16); |
1002 | roundEnc(xmm30, 3); |
1003 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
1004 | roundEnc(xmm18, 3); |
1005 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
1006 | |
1007 | cmpl(rounds, 60); |
1008 | jcc(Assembler::aboveEqual, AES256_REMAINDER16); |
1009 | lastroundEnc(xmm5, 3); |
1010 | jmp(REMAINDER16_END_LOOP); |
1011 | bind(AES256_REMAINDER16); |
1012 | roundEnc(xmm5, 3); |
1013 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
1014 | roundEnc(xmm6, 3); |
1015 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
1016 | lastroundEnc(xmm7, 3); |
1017 | |
1018 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3 |
1019 | // xor 256 bytes of PT with the encrypted counters to produce CT. |
1020 | bind(REMAINDER16_END_LOOP); |
1021 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit); |
1022 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
1023 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1024 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
1025 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
1026 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
1027 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
1028 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
1029 | addq(pos, 256); |
1030 | |
1031 | cmpl(len_reg, 128); |
1032 | jcc(Assembler::aboveEqual, REMAINDER_8); |
1033 | |
1034 | cmpl(len_reg, 64); |
1035 | jcc(Assembler::aboveEqual, REMAINDER_4); |
1036 | //load mask for incrementing the counter value by 1 |
1037 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
1038 | jmp(REMAINDER_LOOP); |
1039 | |
1040 | // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data |
1041 | bind(REMAINDER_8); |
1042 | subq(len_reg, 128); |
1043 | // As we process 8 blocks at a time, load mask for incrementing the counter value by 8 |
1044 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip) |
1045 | // shuffle counters and xor with roundkey1 |
1046 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
1047 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
1048 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
1049 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
1050 | // increment counter by 8 |
1051 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
1052 | // AES encode |
1053 | roundEnc(xmm21, 1); |
1054 | roundEnc(xmm22, 1); |
1055 | roundEnc(xmm23, 1); |
1056 | roundEnc(xmm24, 1); |
1057 | roundEnc(xmm25, 1); |
1058 | roundEnc(xmm26, 1); |
1059 | roundEnc(xmm27, 1); |
1060 | roundEnc(xmm28, 1); |
1061 | roundEnc(xmm29, 1); |
1062 | |
1063 | cmpl(rounds, 52); |
1064 | jcc(Assembler::aboveEqual, AES192_REMAINDER8); |
1065 | lastroundEnc(xmm30, 1); |
1066 | jmp(REMAINDER8_END_LOOP); |
1067 | |
1068 | bind(AES192_REMAINDER8); |
1069 | roundEnc(xmm30, 1); |
1070 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
1071 | roundEnc(xmm18, 1); |
1072 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
1073 | cmpl(rounds, 60); |
1074 | jcc(Assembler::aboveEqual, AES256_REMAINDER8); |
1075 | lastroundEnc(xmm5, 1); |
1076 | jmp(REMAINDER8_END_LOOP); |
1077 | |
1078 | bind(AES256_REMAINDER8); |
1079 | roundEnc(xmm5, 1); |
1080 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
1081 | roundEnc(xmm6, 1); |
1082 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
1083 | lastroundEnc(xmm7, 1); |
1084 | |
1085 | bind(REMAINDER8_END_LOOP); |
1086 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1 |
1087 | // XOR PT with the encrypted counter and store as CT |
1088 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1089 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
1090 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1091 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
1092 | addq(pos, 128); |
1093 | |
1094 | cmpl(len_reg, 64); |
1095 | jcc(Assembler::aboveEqual, REMAINDER_4); |
1096 | // load mask for incrementing the counter value by 1 |
1097 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
1098 | jmp(REMAINDER_LOOP); |
1099 | |
1100 | // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code |
1101 | bind(REMAINDER_4); |
1102 | subq(len_reg, 64); |
1103 | // As we process 4 blocks at a time, load mask for incrementing the counter value by 4 |
1104 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
1105 | // XOR counter with first roundkey |
1106 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
1107 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
1108 | // Increment counter |
1109 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
1110 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit); |
1111 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit); |
1112 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit); |
1113 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit); |
1114 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit); |
1115 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit); |
1116 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit); |
1117 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit); |
1118 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit); |
1119 | cmpl(rounds, 52); |
1120 | jcc(Assembler::aboveEqual, AES192_REMAINDER4); |
1121 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
1122 | jmp(END_REMAINDER4); |
1123 | |
1124 | bind(AES192_REMAINDER4); |
1125 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
1126 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
1127 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit); |
1128 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
1129 | |
1130 | cmpl(rounds, 60); |
1131 | jcc(Assembler::aboveEqual, AES256_REMAINDER4); |
1132 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
1133 | jmp(END_REMAINDER4); |
1134 | |
1135 | bind(AES256_REMAINDER4); |
1136 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
1137 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
1138 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit); |
1139 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
1140 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit); |
1141 | // After AES encode rounds, the encrypted block cipher lies in zmm0. |
1142 | // XOR encrypted block cipher with PT and store 64 bytes of ciphertext |
1143 | bind(END_REMAINDER4); |
1144 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1145 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
1146 | addq(pos, 64); |
1147 | // load mask for incrementing the counter value by 1 |
1148 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
1149 | |
1150 | // For a single block, the AES rounds start here. |
1151 | bind(REMAINDER_LOOP); |
1152 | cmpl(len_reg, 0); |
1153 | jcc(Assembler::belowEqual, END); |
1154 | // XOR counter with first roundkey |
1155 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit); |
1156 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit); |
1157 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit); |
1158 | // Increment counter by 1 |
1159 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit); |
1160 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit); |
1161 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit); |
1162 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit); |
1163 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit); |
1164 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit); |
1165 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit); |
1166 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit); |
1167 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit); |
1168 | |
1169 | cmpl(rounds, 52); |
1170 | jcc(Assembler::aboveEqual, AES192_REMAINDER); |
1171 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
1172 | jmp(END_REMAINDER_LOOP); |
1173 | |
1174 | bind(AES192_REMAINDER); |
1175 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
1176 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
1177 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit); |
1178 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
1179 | cmpl(rounds, 60); |
1180 | jcc(Assembler::aboveEqual, AES256_REMAINDER); |
1181 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
1182 | jmp(END_REMAINDER_LOOP); |
1183 | |
1184 | bind(AES256_REMAINDER); |
1185 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
1186 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
1187 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit); |
1188 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
1189 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit); |
1190 | |
1191 | bind(END_REMAINDER_LOOP); |
1192 | // If the length register is less than the blockSize, i.e. 16, |
1193 | // then we store only as many bytes of the CT to the destination |
1194 | // as the length register specifies; |
1195 | // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES |
1196 | cmpl(len_reg, 16); |
1197 | jcc(Assembler::less, EXTRACT_TAILBYTES); |
1198 | subl(len_reg, 16); |
1199 | // After AES encode rounds, the encrypted block cipher lies in xmm0. |
1200 | // If the length register is equal to 16 bytes, store CT in dest after XOR operation. |
1201 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
1202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit); |
1203 | addl(pos, 16); |
1204 | |
1205 | jmp(REMAINDER_LOOP); |
1206 | |
1207 | bind(EXTRACT_TAILBYTES); |
1208 | // Save encrypted counter value in xmm0 for next invocation, before XOR operation |
1209 | movdqu(Address(saved_encCounter_start, 0), xmm0); |
1210 | // XOR encrypted block cipher in xmm0 with PT to produce CT |
1211 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
1212 | // extract up to 15 bytes of CT from xmm0 as specified by the length register |
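// For example, len = 13 = 8 + 4 + 1: pextrq stores 8 bytes, pextrd 4 more and
// pextrb the final byte; after each store xmm0 is shifted right so the next
// extract always picks up from byte 0 of the register.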
1213 | testptr(len_reg, 8); |
1214 | jcc(Assembler::zero, EXTRACT_TAIL_4BYTES); |
1215 | pextrq(Address(dest_addr, pos), xmm0, 0); |
1216 | psrldq(xmm0, 8); |
1217 | addl(pos, 8); |
1218 | bind(EXTRACT_TAIL_4BYTES); |
1219 | testptr(len_reg, 4); |
1220 | jcc(Assembler::zero, EXTRACT_TAIL_2BYTES); |
1221 | pextrd(Address(dest_addr, pos), xmm0, 0); |
1222 | psrldq(xmm0, 4); |
1223 | addq(pos, 4); |
1224 | bind(EXTRACT_TAIL_2BYTES); |
1225 | testptr(len_reg, 2); |
1226 | jcc(Assembler::zero, EXTRACT_TAIL_1BYTE); |
1227 | pextrw(Address(dest_addr, pos), xmm0, 0); |
1228 | psrldq(xmm0, 2); |
1229 | addl(pos, 2); |
1230 | bind(EXTRACT_TAIL_1BYTE); |
1231 | testptr(len_reg, 1); |
1232 | jcc(Assembler::zero, END); |
1233 | pextrb(Address(dest_addr, pos), xmm0, 0); |
1234 | addl(pos, 1); |
1235 | |
1236 | bind(END); |
1237 | // If there are no tail bytes, store counter value and exit |
1238 | cmpl(len_reg, 0); |
1239 | jcc(Assembler::equal, STORE_CTR); |
1240 | movl(Address(used_addr, 0), len_reg); |
1241 | |
1242 | bind(STORE_CTR); |
1243 | // Shuffle updated counter and store it |
1244 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit); |
1245 | movdqu(Address(counter, 0), xmm8); |
1246 | // Zero out counter and key registers |
1247 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
1248 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
1249 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
1250 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
1251 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
1252 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
1253 | evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit); |
1254 | evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit); |
1255 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
1256 | evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit); |
1257 | evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit); |
1258 | evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit); |
1259 | cmpl(rounds, 44); |
1260 | jcc(Assembler::belowEqual, EXIT); |
1261 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
1262 | evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit); |
1263 | cmpl(rounds, 52); |
1264 | jcc(Assembler::belowEqual, EXIT); |
1265 | evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit); |
1266 | evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit); |
1267 | bind(EXIT); |
1268 | } |
1269 | |
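// GHASH multiply: GH = (GH * HK) mod g(x) over GF(2^128), where
// g(x) = x^128 + x^7 + x^2 + x + 1, computed independently in each 128-bit
// lane of the 512-bit registers. The four vpclmulqdq selectors (0x11, 0x00,
// 0x01, 0x10) produce the high, low and two middle 128-bit partial products
// of the carry-less multiplication; the middle terms are folded into a
// 256-bit result (TMP1:GH), which is then reduced using the polynomial
// constants at ghash_polynomial512_addr().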
1270 | void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) { |
1271 | const XMMRegister TMP1 = xmm0; |
1272 | const XMMRegister TMP2 = xmm1; |
1273 | const XMMRegister TMP3 = xmm2; |
1274 | |
1275 | evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit); |
1276 | evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit); |
1277 | evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit); |
1278 | evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit); |
1279 | evpxorq(GH, GH, TMP3, Assembler::AVX_512bit); |
1280 | vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit); |
1281 | vpslldq(GH, GH, 8, Assembler::AVX_512bit); |
1282 | evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit); |
1283 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
1284 | |
1285 | evmovdquq(TMP3, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, r15); |
1286 | evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit); |
1287 | vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit); |
1288 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
1289 | evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit); |
1290 | vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit); |
1291 | evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit); |
1292 | vpslldq(GH, GH, 4, Assembler::AVX_512bit); |
1293 | vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit); |
1294 | } |
1295 | |
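// Builds the table of hash-key powers used by the stitched GHASH code below:
// starting from the subkey H in htbl, successive powers are derived with
// gfmul_avx512 and written into avx512_htbl (48 entries of 16 bytes) so that
// a 16-block GHASH batch can fetch its keys in 64-byte groups.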
1296 | void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) { |
1297 | const XMMRegister HK = xmm6; |
1298 | const XMMRegister ZT5 = xmm4; |
1299 | const XMMRegister ZT7 = xmm7; |
1300 | const XMMRegister ZT8 = xmm8; |
1301 | |
1302 | Label GFMUL_AVX512; |
1303 | |
1304 | movdqu(HK, Address(htbl, 0)); |
1305 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
1306 | vpshufb(HK, HK, xmm10, Assembler::AVX_128bit); |
1307 | |
1308 | movdqu(xmm11, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 64)); // Poly |
1309 | movdqu(xmm12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 80)); // Twoone |
1310 | // Compute H ^ 2 from the input subkeyH |
1311 | movdqu(xmm2, xmm6); |
1312 | vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit); |
1313 | vpsrlq(xmm2, xmm2, 63, Assembler::AVX_128bit); |
1314 | movdqu(xmm1, xmm2); |
1315 | vpslldq(xmm2, xmm2, 8, Assembler::AVX_128bit); |
1316 | vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit); |
1317 | vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
1318 | |
1319 | vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit); |
1320 | vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit); |
1321 | vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit); |
1322 | vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
1323 | movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2 |
1324 | // Compute the remaining three powers of H using XMM registers and all following powers using ZMM |
1325 | movdqu(ZT5, HK); |
1326 | vinserti32x4(ZT7, ZT7, HK, 3); |
1327 | |
1328 | gfmul_avx512(ZT5, HK); |
1329 | movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2 |
1330 | vinserti32x4(ZT7, ZT7, ZT5, 2); |
1331 | |
1332 | gfmul_avx512(ZT5, HK); |
1333 | movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3 |
1334 | vinserti32x4(ZT7, ZT7, ZT5, 1); |
1335 | |
1336 | gfmul_avx512(ZT5, HK); |
1337 | movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4 |
1338 | vinserti32x4(ZT7, ZT7, ZT5, 0); |
1339 | |
1340 | evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); |
1341 | evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); |
1342 | gfmul_avx512(ZT7, ZT5); |
1343 | evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit); |
1344 | evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); |
1345 | gfmul_avx512(ZT8, ZT5); |
1346 | evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit); |
1347 | gfmul_avx512(ZT7, ZT5); |
1348 | evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit); |
1349 | gfmul_avx512(ZT8, ZT5); |
1350 | evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit); |
1351 | gfmul_avx512(ZT7, ZT5); |
1352 | evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit); |
1353 | gfmul_avx512(ZT8, ZT5); |
1354 | evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit); |
1355 | gfmul_avx512(ZT7, ZT5); |
1356 | evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit); |
1357 | gfmul_avx512(ZT8, ZT5); |
1358 | evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit); |
1359 | gfmul_avx512(ZT7, ZT5); |
1360 | evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit); |
1361 | gfmul_avx512(ZT8, ZT5); |
1362 | evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit); |
1363 | gfmul_avx512(ZT7, ZT5); |
1364 | evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit); |
1365 | ret(0); |
1366 | } |
1367 | |
1368 | #define vclmul_reduce(out, poly, hi128, lo128, tmp0, tmp1) \ |
1369 | evpclmulqdq(tmp0, poly, lo128, 0x01, Assembler::AVX_512bit); \ |
1370 | vpslldq(tmp0, tmp0, 8, Assembler::AVX_512bit); \ |
1371 | evpxorq(tmp0, lo128, tmp0, Assembler::AVX_512bit); \ |
1372 | evpclmulqdq(tmp1, poly, tmp0, 0x00, Assembler::AVX_512bit); \ |
1373 | vpsrldq(tmp1, tmp1, 4, Assembler::AVX_512bit); \ |
1374 | evpclmulqdq(out, poly, tmp0, 0x10, Assembler::AVX_512bit); \ |
1375 | vpslldq(out, out, 4, Assembler::AVX_512bit); \ |
1376 | vpternlogq(out, 0x96, tmp1, hi128, Assembler::AVX_512bit); \ |
1377 | |
1378 | #define vhpxori4x128(reg, tmp) \ |
1379 | vextracti64x4(tmp, reg, 1); \ |
1380 | evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \ |
1381 | vextracti32x4(tmp, reg, 1); \ |
1382 | evpxorq(reg, reg, tmp, Assembler::AVX_128bit); \ |
1383 | |
1384 | #define roundEncode(key, dst1, dst2, dst3, dst4) \ |
1385 | vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \ |
1386 | vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \ |
1387 | vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \ |
1388 | vaesenc(dst4, dst4, key, Assembler::AVX_512bit); \ |
1389 | |
1390 | #define lastroundEncode(key, dst1, dst2, dst3, dst4) \ |
1391 | vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \ |
1392 | vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \ |
1393 | vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \ |
1394 | vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \ |
1395 | |
1396 | #define storeData(dst, position, src1, src2, src3, src4) \ |
1397 | evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \ |
1398 | evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \ |
1399 | evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \ |
1400 | evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \ |
1401 | |
1402 | #define loadData(src, position, dst1, dst2, dst3, dst4) \ |
1403 | evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \ |
1404 | evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \ |
1405 | evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \ |
1406 | evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \ |
1407 | |
1408 | #define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey) \ |
1409 | evpclmulqdq(dst00, ghdata, hkey, 0x00, Assembler::AVX_512bit); \ |
1410 | evpclmulqdq(dst01, ghdata, hkey, 0x01, Assembler::AVX_512bit); \ |
1411 | evpclmulqdq(dst10, ghdata, hkey, 0x10, Assembler::AVX_512bit); \ |
1412 | evpclmulqdq(dst11, ghdata, hkey, 0x11, Assembler::AVX_512bit); \ |
1413 | |
1414 | #define shuffleExorRnd1Key(dst0, dst1, dst2, dst3, shufmask, rndkey) \ |
1415 | vpshufb(dst0, dst0, shufmask, Assembler::AVX_512bit); \ |
1416 | evpxorq(dst0, dst0, rndkey, Assembler::AVX_512bit); \ |
1417 | vpshufb(dst1, dst1, shufmask, Assembler::AVX_512bit); \ |
1418 | evpxorq(dst1, dst1, rndkey, Assembler::AVX_512bit); \ |
1419 | vpshufb(dst2, dst2, shufmask, Assembler::AVX_512bit); \ |
1420 | evpxorq(dst2, dst2, rndkey, Assembler::AVX_512bit); \ |
1421 | vpshufb(dst3, dst3, shufmask, Assembler::AVX_512bit); \ |
1422 | evpxorq(dst3, dst3, rndkey, Assembler::AVX_512bit); \ |
1423 | |
1424 | #define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ |
1425 | evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \ |
1426 | evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \ |
1427 | evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \ |
1428 | evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \ |
1429 | |
1430 | #define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \ |
1431 | vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \ |
1432 | vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \ |
1433 | vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \ |
1434 | vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \ |
1435 | |
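// Stitched AES-CTR + GHASH kernel: encrypts 16 counter blocks (four ZMM
// registers) while ghashing 16 blocks of previously produced ciphertext,
// interleaving AES rounds with carry-less multiplies via the macros above.
// Depending on the caller's position in the 48-block pipeline, the GHASH
// partial products are seeded (first_time_reduction), accumulated, or folded
// down to a single 128-bit hash (final_reduction).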
1436 | void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx, |
1437 | Register in, Register out, Register data, Register pos, bool first_time_reduction, XMMRegister addmask, bool ghash_input, Register rounds, |
1438 | Register ghash_pos, bool final_reduction, int i, XMMRegister counter_inc_mask) { |
1439 | |
1440 | Label AES_192, AES_256, LAST_AES_RND; |
1441 | const XMMRegister ZTMP0 = xmm0; |
1442 | const XMMRegister ZTMP1 = xmm3; |
1443 | const XMMRegister ZTMP2 = xmm4; |
1444 | const XMMRegister ZTMP3 = xmm5; |
1445 | const XMMRegister ZTMP5 = xmm7; |
1446 | const XMMRegister ZTMP6 = xmm10; |
1447 | const XMMRegister ZTMP7 = xmm11; |
1448 | const XMMRegister ZTMP8 = xmm12; |
1449 | const XMMRegister ZTMP9 = xmm13; |
1450 | const XMMRegister ZTMP10 = xmm15; |
1451 | const XMMRegister ZTMP11 = xmm16; |
1452 | const XMMRegister ZTMP12 = xmm17; |
1453 | |
1454 | const XMMRegister ZTMP13 = xmm19; |
1455 | const XMMRegister ZTMP14 = xmm20; |
1456 | const XMMRegister ZTMP15 = xmm21; |
1457 | const XMMRegister ZTMP16 = xmm30; |
1458 | const XMMRegister ZTMP17 = xmm31; |
1459 | const XMMRegister ZTMP18 = xmm1; |
1460 | const XMMRegister ZTMP19 = xmm2; |
1461 | const XMMRegister ZTMP20 = xmm8; |
1462 | const XMMRegister ZTMP21 = xmm22; |
1463 | const XMMRegister ZTMP22 = xmm23; |
1464 | |
1465 | // Pre increment counters |
1466 | vpaddd(ZTMP0, ctr_blockx, counter_inc_mask, Assembler::AVX_512bit); |
1467 | vpaddd(ZTMP1, ZTMP0, counter_inc_mask, Assembler::AVX_512bit); |
1468 | vpaddd(ZTMP2, ZTMP1, counter_inc_mask, Assembler::AVX_512bit); |
1469 | vpaddd(ZTMP3, ZTMP2, counter_inc_mask, Assembler::AVX_512bit); |
1470 | // Save counter value |
1471 | evmovdquq(ctr_blockx, ZTMP3, Assembler::AVX_512bit); |
1472 | |
1473 | // Reuse ZTMP17 / ZTMP18 for loading AES Keys |
1474 | // Pre-load AES round keys |
1475 | ev_load_key(ZTMP17, key, 0, xmm29); |
1476 | ev_load_key(ZTMP18, key, 1 * 16, xmm29); |
1477 | |
1478 | // ZTMP19 & ZTMP20 used for loading hash key |
1479 | // Pre-load hash key |
1480 | evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit); |
1481 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
1482 | // Load data for computing ghash |
1483 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1484 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
1485 | |
1486 | // Xor cipher block 0 with input ghash, if available |
1487 | if (ghash_input) { |
1488 | evpxorq(ZTMP21, ZTMP21, aad_hashx, Assembler::AVX_512bit); |
1489 | } |
1490 | // Load data for computing ghash |
1491 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1492 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
1493 | |
1494 | // stitch AES rounds with GHASH |
1495 | // AES round 0, xmm24 has shuffle mask |
1496 | shuffleExorRnd1Key(ZTMP0, ZTMP1, ZTMP2, ZTMP3, xmm24, ZTMP17); |
1497 | // Reuse ZTMP17 / ZTMP18 for loading remaining AES Keys |
1498 | ev_load_key(ZTMP17, key, 2 * 16, xmm29); |
1499 | // GHASH 4 blocks |
1500 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19); |
1501 | // Load the next hkey and Ghash data |
1502 | evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
1503 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
1504 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
1505 | |
1506 | // AES round 1 |
1507 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1508 | ev_load_key(ZTMP18, key, 3 * 16, xmm29); |
1509 | |
1510 | // GHASH 4 blocks(11 to 8) |
1511 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20); |
1512 | // Load the next hkey and GDATA |
1513 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
1514 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
1515 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
1516 | |
1517 | // AES round 2 |
1518 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1519 | ev_load_key(ZTMP17, key, 4 * 16, xmm29); |
1520 | |
1521 | // GHASH 4 blocks(7 to 4) |
1522 | carrylessMultiply(ZTMP14, ZTMP16, ZTMP15, ZTMP13, ZTMP21, ZTMP19); |
1523 | // AES rounds 3 |
1524 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1525 | ev_load_key(ZTMP18, key, 5 * 16, xmm29); |
1526 | |
1527 | // Gather(XOR) GHASH for 12 blocks |
1528 | xorGHASH(ZTMP5, ZTMP6, ZTMP8, ZTMP7, ZTMP9, ZTMP13, ZTMP10, ZTMP14, ZTMP12, ZTMP16, ZTMP11, ZTMP15); |
1529 | |
1530 | // AES rounds 4 |
1531 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1532 | ev_load_key(ZTMP17, key, 6 * 16, xmm29); |
1533 | |
1534 | // load plain / cipher text(recycle registers) |
1535 | loadData(in, pos, ZTMP13, ZTMP14, ZTMP15, ZTMP16); |
1536 | |
1537 | // AES rounds 5 |
1538 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1539 | ev_load_key(ZTMP18, key, 7 * 16, xmm29); |
1540 | // GHASH 4 blocks(3 to 0) |
1541 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20); |
1542 | |
1543 | // AES round 6 |
1544 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1545 | ev_load_key(ZTMP17, key, 8 * 16, xmm29); |
1546 | |
1547 | // gather GHASH in ZTMP6(low) and ZTMP5(high) |
1548 | if (first_time_reduction) { |
1549 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
1550 | evpxorq(xmm25, ZTMP7, ZTMP11, Assembler::AVX_512bit); |
1551 | evpxorq(xmm27, ZTMP5, ZTMP9, Assembler::AVX_512bit); |
1552 | evpxorq(xmm26, ZTMP6, ZTMP10, Assembler::AVX_512bit); |
1553 | } |
1554 | else if (!first_time_reduction && !final_reduction) { |
1555 | xorGHASH(ZTMP7, xmm25, xmm27, xmm26, ZTMP8, ZTMP12, ZTMP7, ZTMP11, ZTMP5, ZTMP9, ZTMP6, ZTMP10); |
1556 | } |
1557 | |
1558 | if (final_reduction) { |
1559 | // Phase one: Add mid products together |
1560 | // Also load polynomial constant for reduction |
1561 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
1562 | vpternlogq(ZTMP7, 0x96, xmm25, ZTMP11, Assembler::AVX_512bit); |
1563 | vpsrldq(ZTMP11, ZTMP7, 8, Assembler::AVX_512bit); |
1564 | vpslldq(ZTMP7, ZTMP7, 8, Assembler::AVX_512bit); |
1565 | evmovdquq(ZTMP12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
1566 | } |
1567 | // AES round 7 |
1568 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1569 | ev_load_key(ZTMP18, key, 9 * 16, xmm29); |
1570 | if (final_reduction) { |
1571 | vpternlogq(ZTMP5, 0x96, ZTMP9, ZTMP11, Assembler::AVX_512bit); |
1572 | evpxorq(ZTMP5, ZTMP5, xmm27, Assembler::AVX_512bit); |
1573 | vpternlogq(ZTMP6, 0x96, ZTMP10, ZTMP7, Assembler::AVX_512bit); |
1574 | evpxorq(ZTMP6, ZTMP6, xmm26, Assembler::AVX_512bit); |
1575 | } |
1576 | // AES round 8 |
1577 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1578 | ev_load_key(ZTMP17, key, 10 * 16, xmm29); |
1579 | |
1580 | // Horizontal xor of low and high 4*128 |
1581 | if (final_reduction) { |
1582 | vhpxori4x128(ZTMP5, ZTMP9); |
1583 | vhpxori4x128(ZTMP6, ZTMP10); |
1584 | } |
1585 | // AES round 9 |
1586 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1587 | // First phase of reduction |
1588 | if (final_reduction) { |
1589 | evpclmulqdq(ZTMP10, ZTMP12, ZTMP6, 0x01, Assembler::AVX_128bit); |
1590 | vpslldq(ZTMP10, ZTMP10, 8, Assembler::AVX_128bit); |
1591 | evpxorq(ZTMP10, ZTMP6, ZTMP10, Assembler::AVX_128bit); |
1592 | } |
1593 | cmpl(rounds, 52); |
1594 | jcc(Assembler::greaterEqual, AES_192); |
1595 | jmp(LAST_AES_RND); |
1596 | // AES rounds up to 11 (AES192) or 13 (AES256) |
1597 | bind(AES_192); |
1598 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1599 | ev_load_key(ZTMP18, key, 11 * 16, xmm29); |
1600 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1601 | ev_load_key(ZTMP17, key, 12 * 16, xmm29); |
1602 | cmpl(rounds, 60); |
1603 | jcc(Assembler::aboveEqual, AES_256); |
1604 | jmp(LAST_AES_RND); |
1605 | |
1606 | bind(AES_256); |
1607 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1608 | ev_load_key(ZTMP18, key, 13 * 16, xmm29); |
1609 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1610 | ev_load_key(ZTMP17, key, 14 * 16, xmm29); |
1611 | |
1612 | bind(LAST_AES_RND); |
1613 | // Second phase of reduction |
1614 | if (final_reduction) { |
1615 | evpclmulqdq(ZTMP9, ZTMP12, ZTMP10, 0x00, Assembler::AVX_128bit); |
1616 | vpsrldq(ZTMP9, ZTMP9, 4, Assembler::AVX_128bit); // Shift-R 1-DW to obtain 2-DWs shift-R |
1617 | evpclmulqdq(ZTMP11, ZTMP12, ZTMP10, 0x10, Assembler::AVX_128bit); |
1618 | vpslldq(ZTMP11, ZTMP11, 4, Assembler::AVX_128bit); // Shift-L 1-DW for result |
1619 | // ZTMP5 = ZTMP5 X ZTMP11 X ZTMP9 |
1620 | vpternlogq(ZTMP5, 0x96, ZTMP11, ZTMP9, Assembler::AVX_128bit); |
1621 | } |
1622 | // Last AES round |
1623 | lastroundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1624 | // XOR against plain / cipher text |
1625 | xorBeforeStore(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP13, ZTMP14, ZTMP15, ZTMP16); |
1626 | // store cipher / plain text |
1627 | storeData(out, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1628 | } |
1629 | |
1630 | void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, |
1631 | Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) { |
1632 | Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32, |
1633 | AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16; |
1634 | const XMMRegister CTR_BLOCKx = xmm9; |
1635 | const XMMRegister AAD_HASHx = xmm14; |
1636 | const Register pos = rax; |
1637 | const Register rounds = r15; |
1638 | Register ghash_pos; |
1639 | #ifndef _WIN64 |
1640 | ghash_pos = r14; |
1641 | #else |
1642 | ghash_pos = r11; |
1643 | #endif // !_WIN64 |
1644 | const XMMRegister ZTMP0 = xmm0; |
1645 | const XMMRegister ZTMP1 = xmm3; |
1646 | const XMMRegister ZTMP2 = xmm4; |
1647 | const XMMRegister ZTMP3 = xmm5; |
1648 | const XMMRegister ZTMP4 = xmm6; |
1649 | const XMMRegister ZTMP5 = xmm7; |
1650 | const XMMRegister ZTMP6 = xmm10; |
1651 | const XMMRegister ZTMP7 = xmm11; |
1652 | const XMMRegister ZTMP8 = xmm12; |
1653 | const XMMRegister ZTMP9 = xmm13; |
1654 | const XMMRegister ZTMP10 = xmm15; |
1655 | const XMMRegister ZTMP11 = xmm16; |
1656 | const XMMRegister ZTMP12 = xmm17; |
1657 | const XMMRegister ZTMP13 = xmm19; |
1658 | const XMMRegister ZTMP14 = xmm20; |
1659 | const XMMRegister ZTMP15 = xmm21; |
1660 | const XMMRegister ZTMP16 = xmm30; |
1661 | const XMMRegister COUNTER_INC_MASK = xmm18; |
1662 | |
1663 | movl(pos, 0); // Total length processed |
1664 | // Min data size processed = 768 bytes (48 AES blocks of 16 bytes) |
1665 | cmpl(len, 768); |
1666 | jcc(Assembler::less, ENC_DEC_DONE); |
1667 | |
1668 | // Generate 48 constants for htbl |
1669 | call(GENERATE_HTBL_48_BLKS, relocInfo::none); |
1670 | int index = 0; // Index for choosing subkeyHtbl entry |
1671 | movl(ghash_pos, 0); // Pointer for ghash read and store operations |
1672 | |
1673 | // Move initial counter value and STATE value into variables |
1674 | movdqu(CTR_BLOCKx, Address(counter, 0)); |
1675 | movdqu(AAD_HASHx, Address(state, 0)); |
1676 | // Load lswap mask for ghash |
1677 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()), rbx); |
1678 | // Shuffle input state using lswap mask |
1679 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
1680 | |
1681 | // Compute #rounds for AES based on the length of the key array |
1682 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
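// The loaded value is the key array length in 32-bit words: 44 (AES-128,
// 10 rounds), 52 (AES-192, 12 rounds) or 60 (AES-256, 14 rounds).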
1683 | |
1684 | // Broadcast counter value to 512 bit register |
1685 | evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit); |
1686 | // Load counter shuffle mask |
1687 | evmovdquq(xmm24, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, rbx); |
1688 | // Shuffle counter |
1689 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
1690 | |
1691 | // Load mask for incrementing counter |
1692 | evmovdquq(COUNTER_INC_MASK, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, rbx); |
1693 | // Pre-increment counter |
1694 | vpaddd(ZTMP5, CTR_BLOCKx, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, rbx); |
1695 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1696 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1697 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1698 | |
1699 | // Begin 32 blocks of AES processing |
1700 | bind(AES_32_BLOCKS); |
1701 | // Save incremented counter before overwriting it with AES data |
1702 | evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit); |
1703 | |
1704 | // Move 256 bytes of data |
1705 | loadData(in, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1706 | // Load key shuffle mask |
1707 | movdqu(xmm29, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); |
1708 | // Load 0th AES round key |
1709 | ev_load_key(ZTMP4, key, 0, xmm29); |
1710 | // AES-ROUND0, xmm24 has the shuffle mask |
1711 | shuffleExorRnd1Key(ZTMP5, ZTMP6, ZTMP7, ZTMP8, xmm24, ZTMP4); |
1712 | |
1713 | for (int j = 1; j < 10; j++) { |
1714 | ev_load_key(ZTMP4, key, j * 16, xmm29); |
1715 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1716 | } |
1717 | ev_load_key(ZTMP4, key, 10 * 16, xmm29); |
1718 | // AES rounds up to 11 (AES192) or 13 (AES256) |
1719 | cmpl(rounds, 52); |
1720 | jcc(Assembler::greaterEqual, AES_192); |
1721 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1722 | jmp(STORE_CT); |
1723 | |
1724 | bind(AES_192); |
1725 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1726 | ev_load_key(ZTMP4, key, 11 * 16, xmm29); |
1727 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1728 | cmpl(rounds, 60); |
1729 | jcc(Assembler::aboveEqual, AES_256); |
1730 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
1731 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1732 | jmp(STORE_CT); |
1733 | |
1734 | bind(AES_256); |
1735 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
1736 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1737 | ev_load_key(ZTMP4, key, 13 * 16, xmm29); |
1738 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1739 | ev_load_key(ZTMP4, key, 14 * 16, xmm29); |
1740 | // Last AES round |
1741 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1742 | |
1743 | bind(STORE_CT); |
1744 | // Xor the encrypted key with PT to obtain CT |
1745 | xorBeforeStore(ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP0, ZTMP1, ZTMP2, ZTMP3); |
1746 | storeData(out, pos, ZTMP5, ZTMP6, ZTMP7, ZTMP8); |
1747 | // 16 blocks encryption completed |
1748 | addl(pos, 256); |
1749 | cmpl(pos, 512); |
1750 | jcc(Assembler::aboveEqual, GHASH_AES_PARALLEL); |
1751 | vpaddd(ZTMP5, CTR_BLOCKx, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1752 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1753 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1754 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
1755 | jmp(AES_32_BLOCKS); |
1756 | |
1757 | bind(GHASH_AES_PARALLEL); |
1758 | // Ghash16_encrypt16_parallel is invoked with one of three reduction modes, in the following order: |
1759 | // 1) First time -> cipher xor input ghash |
1760 | // 2) No reduction -> accumulate multiplication values |
1761 | // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round |
1762 | // Reduction value = first time |
1763 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
1764 | addl(pos, 256); |
1765 | addl(ghash_pos, 256); |
1766 | index += 4; |
1767 | |
1768 | // At this point we have processed 768 bytes of AES and 256 bytes of GHASH. |
1769 | // If the remaining length is less than 768, process remaining 512 bytes of ghash in GHASH_LAST_32 code |
1770 | subl(len, 768); |
1771 | cmpl(len, 768); |
1772 | jcc(Assembler::less, GHASH_LAST_32); |
1773 | |
1774 | // AES 16 blocks and GHASH 16 blocks in parallel |
1775 | // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times |
1776 | // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations |
1777 | // Each call uses 4 subkeyHtbl values, so increment the index by 4. |
1778 | bind(GHASH_16_AES_16); |
1779 | // Reduction value = no reduction |
1780 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
1781 | addl(pos, 256); |
1782 | addl(ghash_pos, 256); |
1783 | index += 4; |
1784 | // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash |
1785 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); |
1786 | addl(pos, 256); |
1787 | addl(ghash_pos, 256); |
1788 | // Calculated ghash value needs to be moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline |
1789 | movdqu(AAD_HASHx, ZTMP5); |
1790 | index = 0; // Reset subkeyHtbl index |
1791 | |
1792 | // Restart the pipeline |
1793 | // Reduction value = first time |
1794 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
1795 | addl(pos, 256); |
1796 | addl(ghash_pos, 256); |
1797 | index += 4; |
Value stored to 'index' is never read | |
1798 | |
1799 | subl(len, 768); |
1800 | cmpl(len, 768); |
1801 | jcc(Assembler::greaterEqual, GHASH_16_AES_16); |
1802 | |
1803 | // GHASH last 32 blocks processed here |
1804 | // GHASH products accumulated in ZMM27, ZMM25 and ZMM26 during the GHASH16-AES16 operation are used |
1805 | bind(GHASH_LAST_32); |
1806 | // Use rbx as a pointer to the htbl; For last 32 blocks of GHASH, use key# 4-11 entry in subkeyHtbl |
1807 | movl(rbx, 256); |
1808 | // Load cipher blocks |
1809 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1810 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1811 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
1812 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
1813 | // Load ghash keys |
1814 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1815 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1816 | |
1817 | // Ghash blocks 0 - 3 |
1818 | carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15); |
1819 | // Ghash blocks 4 - 7 |
1820 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP14, ZTMP16); |
1821 | |
1822 | vpternlogq(ZTMP1, 0x96, ZTMP5, xmm27, Assembler::AVX_512bit); // ZTMP1 = ZTMP1 + ZTMP5 + zmm27 |
1823 | vpternlogq(ZTMP2, 0x96, ZTMP6, xmm26, Assembler::AVX_512bit); // ZTMP2 = ZTMP2 + ZTMP6 + zmm26 |
1824 | vpternlogq(ZTMP3, 0x96, ZTMP7, xmm25, Assembler::AVX_512bit); // ZTMP3 = ZTMP3 + ZTMP7 + zmm25 |
1825 | evpxorq(ZTMP4, ZTMP4, ZTMP8, Assembler::AVX_512bit); // ZTMP4 = ZTMP4 + ZTMP8 |
1826 | |
1827 | addl(ghash_pos, 128); |
1828 | addl(rbx, 128); |
1829 | |
1830 | // Ghash remaining blocks |
1831 | bind(LOOP); |
1832 | cmpl(ghash_pos, pos); |
1833 | jcc(Assembler::aboveEqual, ACCUMULATE); |
1834 | // Load next cipher blocks and corresponding ghash keys |
1835 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1836 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1837 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
1838 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
1839 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
1840 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
1841 | |
1842 | // ghash blocks 0 - 3 |
1843 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15); |
1844 | |
1845 | // ghash blocks 4 - 7 |
1846 | carrylessMultiply(ZTMP10, ZTMP11, ZTMP12, ZTMP9, ZTMP14, ZTMP16); |
1847 | |
1848 | // update sums |
1849 | // ZTMP1 = ZTMP1 + ZTMP5 + ZTMP9 |
1850 | // ZTMP2 = ZTMP2 + ZTMP6 + ZTMP10 |
1851 | // ZTMP3 = ZTMP3 + ZTMP7 xor ZTMP11 |
1852 | // ZTMP4 = ZTMP4 + ZTMP8 xor ZTMP12 |
1853 | xorGHASH(ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP9, ZTMP6, ZTMP10, ZTMP7, ZTMP11, ZTMP8, ZTMP12); |
1854 | addl(ghash_pos, 128); |
1855 | addl(rbx, 128); |
1856 | jmp(LOOP); |
1857 | |
1858 | // Integrate ZTMP3/ZTMP4 into ZTMP1 and ZTMP2 |
1859 | bind(ACCUMULATE); |
1860 | evpxorq(ZTMP3, ZTMP3, ZTMP4, Assembler::AVX_512bit); |
1861 | vpsrldq(ZTMP7, ZTMP3, 8, Assembler::AVX_512bit); |
1862 | vpslldq(ZTMP8, ZTMP3, 8, Assembler::AVX_512bit); |
1863 | evpxorq(ZTMP1, ZTMP1, ZTMP7, Assembler::AVX_512bit); |
1864 | evpxorq(ZTMP2, ZTMP2, ZTMP8, Assembler::AVX_512bit); |
1865 | |
1866 | // Add ZTMP1 and ZTMP2 128 - bit words horizontally |
1867 | vhpxori4x128(ZTMP1, ZTMP11); |
1868 | vhpxori4x128(ZTMP2, ZTMP12); |
1869 | // Load reduction polynomial and compute final reduction |
1870 | evmovdquq(ZTMP15, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
1871 | vclmul_reduce(AAD_HASHx, ZTMP15, ZTMP1, ZTMP2, ZTMP3, ZTMP4); |
1872 | |
1873 | // Pre-increment counter for next operation |
1874 | vpaddd(CTR_BLOCKx, CTR_BLOCKx, xmm18, Assembler::AVX_128bit); |
1875 | // Shuffle counter and save the updated value |
1876 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
1877 | movdqu(Address(counter, 0), CTR_BLOCKx); |
1878 | // Load ghash lswap mask |
1879 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
1880 | // Shuffle ghash using lbswap_mask and store it |
1881 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
1882 | movdqu(Address(state, 0), AAD_HASHx); |
1883 | jmp(ENC_DEC_DONE); |
1884 | |
1885 | bind(GENERATE_HTBL_48_BLKS); |
1886 | generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl); |
1887 | |
1888 | bind(ENC_DEC_DONE); |
1889 | movq(rax, pos); |
1890 | } |
1891 | |
1892 | #endif // _LP64 |