| File: | jdk/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp |
| Warning: | line 1797, column 5 Value stored to 'index' is never read |
| 1 | /* |
| 2 | * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved. |
| 3 | * |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. |
| 9 | * |
| 10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 13 | * version 2 for more details (a copy is included in the LICENSE file that |
| 14 | * accompanied this code). |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License version |
| 17 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | * |
| 20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 21 | * or visit www.oracle.com if you need additional information or have any |
| 22 | * questions. |
| 23 | * |
| 24 | */ |
| 25 | |
| 26 | #include "precompiled.hpp" |
| 27 | #include "asm/assembler.hpp" |
| 28 | #include "asm/assembler.inline.hpp" |
| 29 | #include "runtime/stubRoutines.hpp" |
| 30 | #include "macroAssembler_x86.hpp" |
| 31 | |
| 32 | #ifdef _LP64 |
| 33 | |
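| | // Helpers: each roundEnc/roundDec call applies one AES round with the given 512-bit round key |
| | // to the parallel cipher states in zmm0..zmm(rnum), i.e. (rnum + 1) * 4 blocks per call. |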
| 34 | void MacroAssembler::roundEnc(XMMRegister key, int rnum) { |
| 35 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 36 | vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 37 | } |
| 38 | } |
| 39 | |
| 40 | void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) { |
| 41 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 42 | vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | void MacroAssembler::roundDec(XMMRegister key, int rnum) { |
| 47 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 48 | vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | void MacroAssembler::lastroundDec(XMMRegister key, int rnum) { |
| 53 | for (int xmm_reg_no = 0; xmm_reg_no <=rnum; xmm_reg_no++) { |
| 54 | vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit); |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | // Load key and shuffle operation |
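| | // The 128-bit round key at the given offset is loaded, byte-swapped with the key shuffle mask, |
| | // and then broadcast to all four 128-bit lanes of the destination register via evshufi64x2. |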
| 59 | void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
| 60 | movdqu(xmmdst, Address(key, offset)); |
| 61 | if (xmm_shuf_mask != NULL) { |
| 62 | pshufb(xmmdst, xmm_shuf_mask); |
| 63 | } else { |
| 64 | pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 65 | } |
| 66 | evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); |
| 67 | } |
| 68 | |
| 69 | // AES-ECB Encrypt Operation |
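| | // Bulk path (label LOOP) encrypts 32 blocks (512 bytes) per iteration in zmm0-zmm7; |
| | // leftover blocks are encrypted 16 bytes at a time in the REMAINDER/LOOP2 path. |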
| 70 | void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
| 71 | |
| 72 | const Register pos = rax; |
| 73 | const Register rounds = r12; |
| 74 | |
| 75 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
| 76 | push(r13); |
| 77 | push(r12); |
| 78 | |
| 79 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
| 80 | // context for the registers used, where all instructions below use 128-bit mode. |
| 81 | // On EVEX without VL and BW, these instructions will all be AVX. |
| 82 | if (VM_Version::supports_avx512vlbw()) { |
| 83 | movl(rax, 0xffff); |
| 84 | kmovql(k1, rax); |
| 85 | } |
| 86 | push(len); // Save |
| 87 | push(rbx); |
| 88 | |
| 89 | vzeroupper(); |
| 90 | |
| 91 | xorptr(pos, pos); |
| 92 | |
| 93 | // Calculate number of rounds based on key length (128, 192, 256): the value loaded is 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
| 94 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 95 | |
| 96 | // Load Key shuf mask |
| 97 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
| 98 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 99 | |
| 100 | // Load and shuffle key based on number of rounds |
| 101 | ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask); |
| 102 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
| 103 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
| 104 | ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask); |
| 105 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
| 106 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
| 107 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
| 108 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
| 109 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
| 110 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
| 111 | ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask); |
| 112 | cmpl(rounds, 52); |
| 113 | jcc(Assembler::greaterEqual, KEY_192); |
| 114 | jmp(Loop_start); |
| 115 | |
| 116 | bind(KEY_192); |
| 117 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
| 118 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
| 119 | cmpl(rounds, 60); |
| 120 | jcc(Assembler::equal, KEY_256); |
| 121 | jmp(Loop_start); |
| 122 | |
| 123 | bind(KEY_256); |
| 124 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
| 125 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
| 126 | |
| 127 | bind(Loop_start); |
| 128 | movq(rbx, len); |
| 129 | // Divide length by 16 to convert it to number of blocks |
| 130 | shrq(len, 4); |
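| | // The low 4 bits of the original length are tested below: shlq by 60 leaves zero (ZF set) |
| | // only when they are all zero; otherwise the block count in len is rounded up by one. |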
| 131 | shlq(rbx, 60); |
| 132 | jcc(Assembler::equal, NO_PARTS); |
| 133 | addq(len, 1); |
| 134 | // Check if number of blocks is greater than or equal to 32 |
| 135 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
| 136 | // If not, 16 bytes are processed (code marked by REMAINDER label) |
| 137 | bind(NO_PARTS); |
| 138 | movq(rbx, len); |
| 139 | shrq(len, 5); |
| 140 | jcc(Assembler::equal, REMAINDER); |
| 141 | movl(r13, len); |
| 142 | // Compute number of blocks that will be processed 512 bytes at a time |
| 143 | // Subtract this from the total number of blocks which will then be processed by REMAINDER loop |
| 144 | shlq(r13, 5); |
| 145 | subq(rbx, r13); |
| 146 | //Begin processing 512 bytes |
| 147 | bind(LOOP); |
| 148 | // Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7 |
| 149 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 150 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 151 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 152 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 153 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 154 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 155 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 156 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 157 | // Xor with the first round key |
| 158 | evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit); |
| 159 | evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit); |
| 160 | evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit); |
| 161 | evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit); |
| 162 | evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit); |
| 163 | evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit); |
| 164 | evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit); |
| 165 | evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit); |
| 166 | // 9 Aes encode round operations |
| 167 | roundEnc(xmm9, 7); |
| 168 | roundEnc(xmm10, 7); |
| 169 | roundEnc(xmm23, 7); |
| 170 | roundEnc(xmm12, 7); |
| 171 | roundEnc(xmm13, 7); |
| 172 | roundEnc(xmm14, 7); |
| 173 | roundEnc(xmm15, 7); |
| 174 | roundEnc(xmm16, 7); |
| 175 | roundEnc(xmm17, 7); |
| 176 | cmpl(rounds, 52); |
| 177 | jcc(Assembler::aboveEqual, AES192); |
| 178 | // Aesenclast round operation for keysize = 128 |
| 179 | lastroundEnc(xmm24, 7); |
| 180 | jmp(END_LOOP); |
| 181 | //Additional 2 rounds of Aesenc operation for keysize = 192 |
| 182 | bind(AES192); |
| 183 | roundEnc(xmm24, 7); |
| 184 | roundEnc(xmm19, 7); |
| 185 | cmpl(rounds, 60); |
| 186 | jcc(Assembler::aboveEqual, AES256); |
| 187 | // Aesenclast round for keysize = 192 |
| 188 | lastroundEnc(xmm20, 7); |
| 189 | jmp(END_LOOP); |
| 190 | // 2 rounds of Aesenc operation and Aesenclast for keysize = 256 |
| 191 | bind(AES256); |
| 192 | roundEnc(xmm20, 7); |
| 193 | roundEnc(xmm21, 7); |
| 194 | lastroundEnc(xmm22, 7); |
| 195 | |
| 196 | bind(END_LOOP); |
| 197 | // Move 512 bytes of CT to destination |
| 198 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 199 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 200 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 201 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 203 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 204 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 205 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 206 | |
| 207 | addq(pos, 512); |
| 208 | decq(len); |
| 209 | jcc(Assembler::notEqual, LOOP); |
| 210 | |
| 211 | bind(REMAINDER); |
| 212 | vzeroupper(); |
| 213 | cmpq(rbx, 0); |
| 214 | jcc(Assembler::equal, END); |
| 215 | // Process 16 bytes at a time |
| 216 | bind(LOOP2); |
| 217 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
| 218 | vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit); |
| 219 | // xmm2 contains shuffled key for Aesenclast operation. |
| 220 | vmovdqu(xmm2, xmm24); |
| 221 | |
| 222 | vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
| 223 | vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
| 224 | vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit); |
| 225 | vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
| 226 | vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
| 227 | vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
| 228 | vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
| 229 | vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
| 230 | vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
| 231 | |
| 232 | cmpl(rounds, 52); |
| 233 | jcc(Assembler::below, LAST2); |
| 234 | vmovdqu(xmm2, xmm20); |
| 235 | vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit); |
| 236 | vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
| 237 | cmpl(rounds, 60); |
| 238 | jcc(Assembler::below, LAST2); |
| 239 | vmovdqu(xmm2, xmm22); |
| 240 | vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
| 241 | vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
| 242 | |
| 243 | bind(LAST2); |
| 244 | // Aesenclast round |
| 245 | vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
| 246 | // Write 16 bytes of CT to destination |
| 247 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
| 248 | addq(pos, 16); |
| 249 | decq(rbx); |
| 250 | jcc(Assembler::notEqual, LOOP2); |
| 251 | |
| 252 | bind(END); |
| 253 | // Zero out the round keys |
| 254 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 255 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
| 256 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
| 257 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
| 258 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
| 259 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
| 260 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
| 261 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
| 262 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
| 263 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
| 264 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
| 265 | cmpl(rounds, 44); |
| 266 | jcc(Assembler::belowEqual, EXIT); |
| 267 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
| 268 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 269 | cmpl(rounds, 52); |
| 270 | jcc(Assembler::belowEqual, EXIT); |
| 271 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 272 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 273 | bind(EXIT); |
| 274 | pop(rbx); |
| 275 | pop(rax); // return length |
| 276 | pop(r12); |
| 277 | pop(r13); |
| 278 | } |
| 279 | |
| 280 | // AES-ECB Decrypt Operation |
| 281 | void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) { |
| 282 | |
| 283 | Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT; |
| 284 | const Register pos = rax; |
| 285 | const Register rounds = r12; |
| 286 | push(r13); |
| 287 | push(r12); |
| 288 | |
| 289 | // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge |
| 290 | // context for the registers used, where all instructions below use 128-bit mode. |
| 291 | // On EVEX without VL and BW, these instructions will all be AVX. |
| 292 | if (VM_Version::supports_avx512vlbw()) { |
| 293 | movl(rax, 0xffff); |
| 294 | kmovql(k1, rax); |
| 295 | } |
| 296 | |
| 297 | push(len); // Save |
| 298 | push(rbx); |
| 299 | |
| 300 | vzeroupper(); |
| 301 | |
| 302 | xorptr(pos, pos); |
| 303 | // Calculate number of rounds based on key length (128, 192, 256): the value loaded is 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds |
| 304 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 305 | |
| 306 | // Load Key shuf mask |
| 307 | const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front |
| 308 | movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 309 | |
| 310 | // Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption. |
| 311 | // So the first round key is loaded from 1*16 here and last round key is loaded from 0*16 |
| 312 | ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask); |
| 313 | ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask); |
| 314 | ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask); |
| 315 | ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask); |
| 316 | ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask); |
| 317 | ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask); |
| 318 | ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask); |
| 319 | ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask); |
| 320 | ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask); |
| 321 | ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask); |
| 322 | ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask); |
| 323 | cmpl(rounds, 52); |
| 324 | jcc(Assembler::greaterEqual, KEY_192); |
| 325 | jmp(Loop_start); |
| 326 | |
| 327 | bind(KEY_192); |
| 328 | ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask); |
| 329 | ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask); |
| 330 | cmpl(rounds, 60); |
| 331 | jcc(Assembler::equal, KEY_256); |
| 332 | jmp(Loop_start); |
| 333 | |
| 334 | bind(KEY_256); |
| 335 | ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask); |
| 336 | ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask); |
| 337 | bind(Loop_start); |
| 338 | movq(rbx, len); |
| 339 | // Convert input length to number of blocks |
| 340 | shrq(len, 4); |
| 341 | shlq(rbx, 60); |
| 342 | jcc(Assembler::equal, NO_PARTS); |
| 343 | addq(len, 1); |
| 344 | // Check if number of blocks is greater than or equal to 32 |
| 345 | // If true, 512 bytes are processed at a time (code marked by label LOOP) |
| 346 | // If not, 16 bytes are processed (code marked by label REMAINDER) |
| 347 | bind(NO_PARTS); |
| 348 | movq(rbx, len); |
| 349 | shrq(len, 5); |
| 350 | jcc(Assembler::equal, REMAINDER); |
| 351 | movl(r13, len); |
| 352 | // Compute number of blocks that will be processed as 512 bytes at a time |
| 353 | // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop. |
| 354 | shlq(r13, 5); |
| 355 | subq(rbx, r13); |
| 356 | |
| 357 | bind(LOOP); |
| 358 | // Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7 |
| 359 | evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 360 | evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 361 | evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 362 | evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 363 | evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 364 | evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 365 | evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 366 | evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 367 | // Xor with the first round key |
| 368 | evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit); |
| 369 | evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit); |
| 370 | evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit); |
| 371 | evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit); |
| 372 | evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit); |
| 373 | evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit); |
| 374 | evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit); |
| 375 | evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit); |
| 376 | // 9 rounds of Aesdec |
| 377 | roundDec(xmm10, 7); |
| 378 | roundDec(xmm11, 7); |
| 379 | roundDec(xmm12, 7); |
| 380 | roundDec(xmm13, 7); |
| 381 | roundDec(xmm14, 7); |
| 382 | roundDec(xmm15, 7); |
| 383 | roundDec(xmm16, 7); |
| 384 | roundDec(xmm17, 7); |
| 385 | roundDec(xmm18, 7); |
| 386 | cmpl(rounds, 52); |
| 387 | jcc(Assembler::aboveEqual, AES192); |
| 388 | // Aesdeclast round for keysize = 128 |
| 389 | lastroundDec(xmm27, 7); |
| 390 | jmp(END_LOOP); |
| 391 | |
| 392 | bind(AES192); |
| 393 | // 2 Additional rounds for keysize = 192 |
| 394 | roundDec(xmm19, 7); |
| 395 | roundDec(xmm20, 7); |
| 396 | cmpl(rounds, 60); |
| 397 | jcc(Assembler::aboveEqual, AES256); |
| 398 | // Aesdeclast round for keysize = 192 |
| 399 | lastroundDec(xmm27, 7); |
| 400 | jmp(END_LOOP); |
| 401 | bind(AES256); |
| 402 | // 2 Additional rounds and Aesdeclast for keysize = 256 |
| 403 | roundDec(xmm21, 7); |
| 404 | roundDec(xmm22, 7); |
| 405 | lastroundDec(xmm27, 7); |
| 406 | |
| 407 | bind(END_LOOP); |
| 408 | // Write 512 bytes of PT to the destination |
| 409 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 410 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 411 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 412 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 413 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 414 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 415 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 416 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 417 | |
| 418 | addq(pos, 512); |
| 419 | decq(len); |
| 420 | jcc(Assembler::notEqual, LOOP); |
| 421 | |
| 422 | bind(REMAINDER); |
| 423 | vzeroupper(); |
| 424 | cmpq(rbx, 0); |
| 425 | jcc(Assembler::equal, END); |
| 426 | // Process 16 bytes at a time |
| 427 | bind(LOOP2); |
| 428 | movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0)); |
| 429 | vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit); |
| 430 | // xmm2 contains shuffled key for Aesdeclast operation. |
| 431 | vmovdqu(xmm2, xmm27); |
| 432 | |
| 433 | vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit); |
| 434 | vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit); |
| 435 | vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit); |
| 436 | vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit); |
| 437 | vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit); |
| 438 | vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit); |
| 439 | vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit); |
| 440 | vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit); |
| 441 | vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit); |
| 442 | |
| 443 | cmpl(rounds, 52); |
| 444 | jcc(Assembler::below, LAST2); |
| 445 | vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit); |
| 446 | vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit); |
| 447 | cmpl(rounds, 60); |
| 448 | jcc(Assembler::below, LAST2); |
| 449 | vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit); |
| 450 | vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit); |
| 451 | |
| 452 | bind(LAST2); |
| 453 | // Aesdeclast round |
| 454 | vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit); |
| 455 | // Write 16 bytes of PT to destination |
| 456 | movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1); |
| 457 | addq(pos, 16); |
| 458 | decq(rbx); |
| 459 | jcc(Assembler::notEqual, LOOP2); |
| 460 | |
| 461 | bind(END); |
| 462 | // Zero out the round keys |
| 463 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 464 | evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit); |
| 465 | evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit); |
| 466 | evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit); |
| 467 | evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit); |
| 468 | evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit); |
| 469 | evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit); |
| 470 | evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit); |
| 471 | evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit); |
| 472 | evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit); |
| 473 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
| 474 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
| 475 | cmpl(rounds, 44); |
| 476 | jcc(Assembler::belowEqual, EXIT); |
| 477 | evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit); |
| 478 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 479 | cmpl(rounds, 52); |
| 480 | jcc(Assembler::belowEqual, EXIT); |
| 481 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 482 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 483 | bind(EXIT); |
| 484 | pop(rbx); |
| 485 | pop(rax); // return length |
| 486 | pop(r12); |
| 487 | pop(r13); |
| 488 | } |
| 489 | |
| 490 | // Multiply 128 x 128 bits, using 4 pclmulqdq operations |
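| | // The four partial products (low x low, high x high and the two cross terms) are accumulated |
| | // into tmp0 (low 128 bits), tmp1 (high 128 bits) and tmp2 (middle 128 bits) for a later reduction. |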
| 491 | void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data, |
| 492 | XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) { |
| 493 | movdqu(xmm15, Address(htbl, i * 16)); |
| 494 | vpclmulhqlqdq(tmp3, data, xmm15); // 0x01 |
| 495 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
| 496 | vpclmulldq(tmp3, data, xmm15); // 0x00 |
| 497 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); |
| 498 | vpclmulhdq(tmp3, data, xmm15); // 0x11 |
| 499 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); |
| 500 | vpclmullqhqdq(tmp3, data, xmm15); // 0x10 |
| 501 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); |
| 502 | } |
| 503 | |
| 504 | // Multiply two 128 bit numbers resulting in a 256 bit value |
| 505 | // Result of the multiplication followed by reduction stored in state |
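| | // Schoolbook carry-less multiply: a0*b0, a1*b1 and the two cross products are combined into a |
| | // 256-bit product held in tmp1 (low half) and tmp4 (high half) before the Shift-XOR reduction. |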
| 506 | void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) { |
| 507 | const XMMRegister tmp1 = xmm4; |
| 508 | const XMMRegister tmp2 = xmm5; |
| 509 | const XMMRegister tmp3 = xmm6; |
| 510 | const XMMRegister tmp4 = xmm7; |
| 511 | |
| 512 | vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0) |
| 513 | vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1) |
| 514 | vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0) |
| 515 | vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1) |
| 516 | |
| 517 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0) |
| 518 | |
| 519 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
| 520 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
| 521 | vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result |
| 522 | vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication |
| 523 | // Follows the reduction technique mentioned in |
| 524 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
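| | // The reduction folds the 256-bit product modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 |
| | // using only shifts and XORs, i.e. without any further carry-less multiplications. |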
| 525 | // First phase of reduction |
| 526 | // |
| 527 | vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed left shift by 31 |
| 528 | vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed left shift by 30 |
| 529 | vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit);// packed left shift by 25 |
| 530 | // xor the shifted versions |
| 531 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
| 532 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
| 533 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
| 534 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
| 535 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete |
| 536 | // |
| 537 | // Second phase of the reduction |
| 538 | // |
| 539 | vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);// packed right shift by 1 |
| 540 | vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);// packed right shift by 2 |
| 541 | vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);// packed right shift by 7 |
| 542 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions |
| 543 | vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit); |
| 544 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
| 545 | vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); |
| 546 | vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state |
| 547 | ret(0); |
| 548 | } |
| 549 | |
| 550 | // This method takes the expanded subkey as input and generates the first power of subkey H, stored at htbl offset 1 * 16. |
| 551 | // This power of H is used in the reduction step of one-block ghash. |
| 552 | void MacroAssembler::generateHtbl_one_block(Register htbl) { |
| 553 | const XMMRegister t = xmm13; |
| 554 | |
| 555 | // load the original subkey hash |
| 556 | movdqu(t, Address(htbl, 0)); |
| 557 | // shuffle using long swap mask |
| 558 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 559 | vpshufb(t, t, xmm10, Assembler::AVX_128bit); |
| 560 | |
| 561 | // Compute H' = GFMUL(H, 2) |
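| | // Multiplying H by 2 in GF(2^128) is a one-bit left shift (done per 32-bit lane below, with the carries |
| | // propagated), followed by a conditional XOR of the reduction polynomial selected via the mask in xmm4/xmm5. |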
| 562 | vpsrld(xmm3, t, 7, Assembler::AVX_128bit); |
| 563 | movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr())); |
| 564 | vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit); |
| 565 | movl(rax, 0xff00); |
| 566 | movdl(xmm4, rax); |
| 567 | vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit); |
| 568 | movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr())); |
| 569 | vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit); |
| 570 | vpsrld(xmm3, t, 31, Assembler::AVX_128bit); |
| 571 | vpslld(xmm4, t, 1, Assembler::AVX_128bit); |
| 572 | vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit); |
| 573 | vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2 |
| 574 | |
| 575 | //Adding p(x)<<1 to xmm5 which holds the reduction polynomial |
| 576 | vpxor(t, t, xmm5, Assembler::AVX_128bit); |
| 577 | movdqu(Address(htbl, 1 * 16), t); // H * 2 |
| 578 | |
| 579 | ret(0); |
| 580 | } |
| 581 | |
| 582 | // This method takes the subkey after expansion as input and generates the remaining powers of subkey H. |
| 583 | // These powers of H are used in the reduction step of eight-block ghash. |
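| | // Each call to the local GFMUL label multiplies t by the value held in tmp0 (H * 2), so the |
| | // stores at htbl offsets 2 * 16 through 8 * 16 hold the successive higher powers of H. |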
| 584 | void MacroAssembler::generateHtbl_eight_blocks(Register htbl) { |
| 585 | const XMMRegister t = xmm13; |
| 586 | const XMMRegister tmp0 = xmm1; |
| 587 | Label GFMUL; |
| 588 | |
| 589 | movdqu(t, Address(htbl, 1 * 16)); |
| 590 | movdqu(tmp0, t); |
| 591 | |
| 592 | // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H) |
| 593 | call(GFMUL, relocInfo::none); |
| 594 | movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2 |
| 595 | call(GFMUL, relocInfo::none); |
| 596 | movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2 |
| 597 | call(GFMUL, relocInfo::none); |
| 598 | movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2 |
| 599 | call(GFMUL, relocInfo::none); |
| 600 | movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2 |
| 601 | call(GFMUL, relocInfo::none); |
| 602 | movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2 |
| 603 | call(GFMUL, relocInfo::none); |
| 604 | movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2 |
| 605 | call(GFMUL, relocInfo::none); |
| 606 | movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2 |
| 607 | ret(0); |
| 608 | |
| 609 | bind(GFMUL); |
| 610 | gfmul(tmp0, t); |
| 611 | } |
| 612 | |
| 613 | // Multiblock and single block GHASH computation using Shift XOR reduction technique |
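| | // Powers of H are generated into htbl on first use; eight blocks are combined per reduction in |
| | // PROCESS_8_BLOCKS, and any remaining blocks are handled one at a time in PROCESS_1_BLOCK. |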
| 614 | void MacroAssembler::avx_ghash(Register input_state, Register htbl, |
| 615 | Register input_data, Register blocks) { |
| 616 | |
| 617 | // temporary variables to hold input data and input state |
| 618 | const XMMRegister data = xmm1; |
| 619 | const XMMRegister state = xmm0; |
| 620 | // temporary variables to hold intermediate results |
| 621 | const XMMRegister tmp0 = xmm3; |
| 622 | const XMMRegister tmp1 = xmm4; |
| 623 | const XMMRegister tmp2 = xmm5; |
| 624 | const XMMRegister tmp3 = xmm6; |
| 625 | // temporary variables to hold byte and long swap masks |
| 626 | const XMMRegister bswap_mask = xmm2; |
| 627 | const XMMRegister lswap_mask = xmm14; |
| 628 | |
| 629 | Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION, |
| 630 | ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH; |
| 631 | |
| 632 | testptr(blocks, blocks); |
| 633 | jcc(Assembler::zero, EXIT_GHASH); |
| 634 | |
| 635 | // Check if Hashtable (1*16) has been already generated |
| 636 | // For anything less than 8 blocks, we generate only the first power of H. |
| 637 | movdqu(tmp2, Address(htbl, 1 * 16)); |
| 638 | ptest(tmp2, tmp2); |
| 639 | jcc(Assembler::notZero, BEGIN_PROCESS); |
| 640 | call(GENERATE_HTBL_1_BLK, relocInfo::none); |
| 641 | |
| 642 | // Shuffle the input state |
| 643 | bind(BEGIN_PROCESS); |
| 644 | movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 645 | movdqu(state, Address(input_state, 0)); |
| 646 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
| 647 | |
| 648 | cmpl(blocks, 8); |
| 649 | jcc(Assembler::below, ONE_BLK_INIT); |
| 650 | // If we have 8 blocks or more data, then generate remaining powers of H |
| 651 | movdqu(tmp2, Address(htbl, 8 * 16)); |
| 652 | ptest(tmp2, tmp2); |
| 653 | jcc(Assembler::notZero, PROCESS_8_BLOCKS); |
| 654 | call(GENERATE_HTBL_8_BLKS, relocInfo::none); |
| 655 | |
| 656 | //Do 8 multiplies followed by a reduction processing 8 blocks of data at a time |
| 657 | //Each block = 16 bytes. |
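| | // Blocks #7 .. #0 are multiplied by H*2 .. H^8*2 respectively and the partial products are |
| | // accumulated in tmp0/tmp1/tmp2, so a single reduction covers all eight blocks. |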
| 658 | bind(PROCESS_8_BLOCKS); |
| 659 | subl(blocks, 8); |
| 660 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
| 661 | movdqu(data, Address(input_data, 16 * 7)); |
| 662 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 663 | //Loading 1*16 as calculated powers of H required starts at that location. |
| 664 | movdqu(xmm15, Address(htbl, 1 * 16)); |
| 665 | //Perform carryless multiplication of (H*2, data block #7) |
| 666 | vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1 |
| 667 | vpclmulldq(tmp0, data, xmm15);//a0 * b0 |
| 668 | vpclmulhdq(tmp1, data, xmm15);//a1 * b1 |
| 669 | vpclmullqhqdq(tmp3, data, xmm15);//a1* b0 |
| 670 | vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0) |
| 671 | |
| 672 | movdqu(data, Address(input_data, 16 * 6)); |
| 673 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 674 | // Perform carryless multiplication of (H^2 * 2, data block #6) |
| 675 | schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 676 | |
| 677 | movdqu(data, Address(input_data, 16 * 5)); |
| 678 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 679 | // Perform carryless multiplication of (H^3 * 2, data block #5) |
| 680 | schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 681 | movdqu(data, Address(input_data, 16 * 4)); |
| 682 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 683 | // Perform carryless multiplication of (H^4 * 2, data block #4) |
| 684 | schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 685 | movdqu(data, Address(input_data, 16 * 3)); |
| 686 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 687 | // Perform carryless multiplication of (H^5 * 2, data block #3) |
| 688 | schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 689 | movdqu(data, Address(input_data, 16 * 2)); |
| 690 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 691 | // Perform carryless multiplication of (H^6 * 2, data block #2) |
| 692 | schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 693 | movdqu(data, Address(input_data, 16 * 1)); |
| 694 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 695 | // Perform carryless multiplication of (H^7 * 2, data block #1) |
| 696 | schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 697 | movdqu(data, Address(input_data, 16 * 0)); |
| 698 | // xor data block #0 with input state before performing carry-less multiplication |
| 699 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 700 | vpxor(data, data, state, Assembler::AVX_128bit); |
| 701 | // Perform carryless multiplication of (H^8 * 2, data block #0) |
| 702 | schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3); |
| 703 | vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit); |
| 704 | vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit); |
| 705 | vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of |
| 706 | vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation |
| 707 | |
| 708 | // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1 |
| 709 | // with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0 |
| 710 | // Follows the reduction technique mentioned in |
| 711 | // Shift-XOR reduction described in Gueron-Kounavis May 2010 |
| 712 | bind(BLOCK8_REDUCTION); |
| 713 | // First Phase of the reduction |
| 714 | vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed left shift by 31 |
| 715 | vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed left shift by 30 |
| 716 | vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25 |
| 717 | // xor the shifted versions |
| 718 | vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit); |
| 719 | vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit); |
| 720 | |
| 721 | vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit); |
| 722 | vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit); |
| 723 | |
| 724 | vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete |
| 725 | // second phase of the reduction |
| 726 | vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed right shift by 1 |
| 727 | vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift by 2 |
| 728 | vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed right shift by 7 |
| 729 | // xor the shifted versions |
| 730 | vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); |
| 731 | vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit); |
| 732 | vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit); |
| 733 | vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit); |
| 734 | // Final result is in state |
| 735 | vpxor(state, tmp0, tmp1, Assembler::AVX_128bit); |
| 736 | |
| 737 | lea(input_data, Address(input_data, 16 * 8)); |
| 738 | cmpl(blocks, 8); |
| 739 | jcc(Assembler::below, ONE_BLK_INIT); |
| 740 | jmp(PROCESS_8_BLOCKS); |
| 741 | |
| 742 | // Since this is a one-block operation we will only use H * 2, i.e. the first power of H |
| 743 | bind(ONE_BLK_INIT); |
| 744 | movdqu(tmp0, Address(htbl, 1 * 16)); |
| 745 | movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); |
| 746 | |
| 747 | //Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction. |
| 748 | bind(PROCESS_1_BLOCK); |
| 749 | cmpl(blocks, 0); |
| 750 | jcc(Assembler::equal, SAVE_STATE); |
| 751 | subl(blocks, 1); |
| 752 | movdqu(data, Address(input_data, 0)); |
| 753 | vpshufb(data, data, bswap_mask, Assembler::AVX_128bit); |
| 754 | vpxor(state, state, data, Assembler::AVX_128bit); |
| 755 | // gfmul(H*2, state) |
| 756 | call(GFMUL, relocInfo::none); |
| 757 | addptr(input_data, 16); |
| 758 | jmp(PROCESS_1_BLOCK); |
| 759 | |
| 760 | bind(SAVE_STATE); |
| 761 | vpshufb(state, state, lswap_mask, Assembler::AVX_128bit); |
| 762 | movdqu(Address(input_state, 0), state); |
| 763 | jmp(EXIT_GHASH); |
| 764 | |
| 765 | bind(GFMUL); |
| 766 | gfmul(tmp0, state); |
| 767 | |
| 768 | bind(GENERATE_HTBL_1_BLK); |
| 769 | generateHtbl_one_block(htbl); |
| 770 | |
| 771 | bind(GENERATE_HTBL_8_BLKS); |
| 772 | generateHtbl_eight_blocks(htbl); |
| 773 | |
| 774 | bind(EXIT_GHASH); |
| 775 | // zero out xmm registers used for Htbl storage |
| 776 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
| 777 | vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); |
| 778 | vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit); |
| 779 | vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit); |
| 780 | } |
| 781 | |
| 782 | // AES Counter Mode using VAES instructions |
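| | // Flow: a byte-wise pre-loop first consumes any leftover encrypted counter bytes, the main loop |
| | // then encrypts 32 counter blocks (512 bytes) per pass, and smaller remainder paths handle |
| | // 256/128/64 bytes and finally single 16-byte blocks and tail bytes. |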
| 783 | void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, |
| 784 | Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) { |
| 785 | |
| 786 | const Register rounds = 0; |
| 787 | const Register pos = r12; |
| 788 | |
| 789 | Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP, |
| 790 | AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16, |
| 791 | REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER, |
| 792 | AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP, |
| 793 | AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES, |
| 794 | EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR; |
| 795 | |
| 796 | cmpl(len_reg, 0); |
| 797 | jcc(Assembler::belowEqual, EXIT); |
| 798 | |
| 799 | movl(pos, 0); |
| 800 | // if the number of used encrypted counter bytes < 16, |
| 801 | // XOR PT with saved encrypted counter to obtain CT |
| 802 | bind(PRELOOP_START); |
| 803 | cmpl(used, 16); |
| 804 | jcc(Assembler::aboveEqual, EXIT_PRELOOP); |
| 805 | movb(rbx, Address(saved_encCounter_start, used)); |
| 806 | xorb(rbx, Address(src_addr, pos)); |
| 807 | movb(Address(dest_addr, pos), rbx); |
| 808 | addptr(pos, 1); |
| 809 | addptr(used, 1); |
| 810 | decrement(len_reg); |
| 811 | jmp(PRELOOP_START); |
| 812 | |
| 813 | bind(EXIT_PRELOOP); |
| 814 | movl(Address(used_addr, 0), used); |
| 815 | |
| 816 | // Determine the number of rounds (10, 12, 14) from the expanded key length (44, 52, 60 ints) for 128-, 192-, 256-bit keys. |
| 817 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 818 | |
| 819 | vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
| 820 | // Move initial counter value in xmm0 |
| 821 | movdqu(xmm0, Address(counter, 0)); |
| 822 | // broadcast counter value to zmm8 |
| 823 | evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit); |
| 824 | |
| 825 | // load lbswap mask |
| 826 | evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15); |
| 827 | |
| 828 | //shuffle counter using lbswap_mask |
| 829 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit); |
| 830 | |
| 831 | // pre-increment and propagate counter values to zmm9-zmm15 registers. |
| 832 | // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4 |
| 833 | // The counter is incremented after each block i.e. 16 bytes is processed; |
| 834 | // each zmm register has 4 counter values as its MSB |
| 835 | // the counters are incremented in parallel |
| 836 | vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0 |
| 837 | vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip) |
| 838 | vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 839 | vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 840 | vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 841 | vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 842 | vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 843 | vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 844 | |
| 845 | // load linc32 mask in zmm register; linc32 increments the counter by 32 |
| 846 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32 |
| 847 | |
| 848 | // xmm31 contains the key shuffle mask. |
| 849 | movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
| 850 | // The load key function loads a 128-bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512-bit value. |
| 851 | // For broadcasting the values to ZMM, evshufi64x2 is used instead of evbroadcasti64x2, as the source in this case is a register |
| 852 | // that holds the shuffled key value. |
| 853 | ev_load_key(xmm20, key, 0, xmm31); |
| 854 | ev_load_key(xmm21, key, 1 * 16, xmm31); |
| 855 | ev_load_key(xmm22, key, 2 * 16, xmm31); |
| 856 | ev_load_key(xmm23, key, 3 * 16, xmm31); |
| 857 | ev_load_key(xmm24, key, 4 * 16, xmm31); |
| 858 | ev_load_key(xmm25, key, 5 * 16, xmm31); |
| 859 | ev_load_key(xmm26, key, 6 * 16, xmm31); |
| 860 | ev_load_key(xmm27, key, 7 * 16, xmm31); |
| 861 | ev_load_key(xmm28, key, 8 * 16, xmm31); |
| 862 | ev_load_key(xmm29, key, 9 * 16, xmm31); |
| 863 | ev_load_key(xmm30, key, 10 * 16, xmm31); |
| 864 | |
| 865 | // Process 32 blocks or 512 bytes of data |
| 866 | bind(LOOP); |
| 867 | cmpl(len_reg, 512); |
| 868 | jcc(Assembler::less, REMAINDER); |
| 869 | subq(len_reg, 512); |
| 870 | // Shuffle counter and XOR it with roundkey1. Result is stored in zmm0-7 |
| 871 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 872 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 873 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 874 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 875 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
| 876 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
| 877 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
| 878 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
| 879 | vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit); |
| 880 | evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit); |
| 881 | vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit); |
| 882 | evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit); |
| 883 | vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit); |
| 884 | evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit); |
| 885 | vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit); |
| 886 | evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit); |
| 887 | // Perform AES encode operations and put results in zmm0-zmm7. |
| 888 | // This is followed by incrementing counter values in zmm8-zmm15. |
| 889 | // Since we will be processing 32 blocks at a time, the counter is incremented by 32. |
| 890 | roundEnc(xmm21, 7); |
| 891 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 892 | roundEnc(xmm22, 7); |
| 893 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
| 894 | roundEnc(xmm23, 7); |
| 895 | vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit); |
| 896 | roundEnc(xmm24, 7); |
| 897 | vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit); |
| 898 | roundEnc(xmm25, 7); |
| 899 | vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit); |
| 900 | roundEnc(xmm26, 7); |
| 901 | vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit); |
| 902 | roundEnc(xmm27, 7); |
| 903 | vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit); |
| 904 | roundEnc(xmm28, 7); |
| 905 | vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit); |
| 906 | roundEnc(xmm29, 7); |
| 907 | |
| 908 | cmpl(rounds, 52); |
| 909 | jcc(Assembler::aboveEqual, AES192); |
| 910 | lastroundEnc(xmm30, 7); |
| 911 | jmp(END_LOOP); |
| 912 | |
| 913 | bind(AES192); |
| 914 | roundEnc(xmm30, 7); |
| 915 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 916 | roundEnc(xmm18, 7); |
| 917 | cmpl(rounds, 60); |
| 918 | jcc(Assembler::aboveEqual, AES256); |
| 919 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
| 920 | lastroundEnc(xmm18, 7); |
| 921 | jmp(END_LOOP); |
| 922 | |
| 923 | bind(AES256); |
| 924 | ev_load_key(xmm18, key, 12 * 16, xmm31); |
| 925 | roundEnc(xmm18, 7); |
| 926 | ev_load_key(xmm18, key, 13 * 16, xmm31); |
| 927 | roundEnc(xmm18, 7); |
| 928 | ev_load_key(xmm18, key, 14 * 16, xmm31); |
| 929 | lastroundEnc(xmm18, 7); |
| 930 | |
| 931 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7 |
| 932 | // xor encrypted block cipher and input plaintext and store resultant ciphertext |
| 933 | bind(END_LOOP); |
| 934 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 935 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 936 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 937 | evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit); |
| 938 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 939 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 940 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 941 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 942 | evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit); |
| 943 | evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit); |
| 944 | evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit); |
| 945 | evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit); |
| 946 | evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit); |
| 947 | evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit); |
| 948 | evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit); |
| 949 | evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit); |
| 950 | addq(pos, 512); |
| 951 | jmp(LOOP); |
| 952 | |
| 953 | // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes |
| 954 | bind(REMAINDER); |
| 955 | cmpl(len_reg, 0); |
| 956 | jcc(Assembler::equal, END); |
| 957 | cmpl(len_reg, 256); |
| 958 | jcc(Assembler::aboveEqual, REMAINDER_16); |
| 959 | cmpl(len_reg, 128); |
| 960 | jcc(Assembler::aboveEqual, REMAINDER_8); |
| 961 | cmpl(len_reg, 64); |
| 962 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 963 | // At this point, we will process 16 bytes of data at a time. |
| 964 | // So load xmm19 with counter increment value as 1 |
| 965 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); |
| 966 | jmp(REMAINDER_LOOP); |
| 967 | |
| 968 | // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data |
| 969 | bind(REMAINDER_16); |
| 970 | subq(len_reg, 256); |
| 971 | // As we process 16 blocks at a time, load mask for incrementing the counter value by 16 |
| 972 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip) |
| 973 | // shuffle counter and XOR counter with roundkey1 |
| 974 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 975 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 976 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 977 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 978 | vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit); |
| 979 | evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit); |
| 980 | vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit); |
| 981 | evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); |
| 982 | // Increment counter values by 16 |
| 983 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 984 | vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); |
| 985 | // AES encode rounds |
| 986 | roundEnc(xmm21, 3); |
| 987 | roundEnc(xmm22, 3); |
| 988 | roundEnc(xmm23, 3); |
| 989 | roundEnc(xmm24, 3); |
| 990 | roundEnc(xmm25, 3); |
| 991 | roundEnc(xmm26, 3); |
| 992 | roundEnc(xmm27, 3); |
| 993 | roundEnc(xmm28, 3); |
| 994 | roundEnc(xmm29, 3); |
| 995 | |
| 996 | cmpl(rounds, 52); |
| 997 | jcc(Assembler::aboveEqual, AES192_REMAINDER16); |
| 998 | lastroundEnc(xmm30, 3); |
| 999 | jmp(REMAINDER16_END_LOOP); |
| 1000 | |
| 1001 | bind(AES192_REMAINDER16); |
| 1002 | roundEnc(xmm30, 3); |
| 1003 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1004 | roundEnc(xmm18, 3); |
| 1005 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1006 | |
| 1007 | cmpl(rounds, 60); |
| 1008 | jcc(Assembler::aboveEqual, AES256_REMAINDER16); |
| 1009 | lastroundEnc(xmm5, 3); |
| 1010 | jmp(REMAINDER16_END_LOOP); |
| 1011 | bind(AES256_REMAINDER16); |
| 1012 | roundEnc(xmm5, 3); |
| 1013 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1014 | roundEnc(xmm6, 3); |
| 1015 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1016 | lastroundEnc(xmm7, 3); |
| 1017 | |
| 1018 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3 |
| 1019 | // xor 256 bytes of PT with the encrypted counters to produce CT. |
| 1020 | bind(REMAINDER16_END_LOOP); |
| 1021 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit); |
| 1022 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 1023 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1024 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 1025 | evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 1026 | evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit); |
| 1027 | evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 1028 | evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit); |
| 1029 | addq(pos, 256); |
| 1030 | |
| 1031 | cmpl(len_reg, 128); |
| 1032 | jcc(Assembler::aboveEqual, REMAINDER_8); |
| 1033 | |
| 1034 | cmpl(len_reg, 64); |
| 1035 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 1036 | //load mask for incrementing the counter value by 1 |
| 1037 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1038 | jmp(REMAINDER_LOOP); |
| 1039 | |
| 1040 | // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data |
| 1041 | bind(REMAINDER_8); |
| 1042 | subq(len_reg, 128); |
| 1043 | // As we process 8 blocks at a time, load mask for incrementing the counter value by 8 |
| 1044 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip) |
| 1045 | // shuffle counters and xor with roundkey1 |
| 1046 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 1047 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 1048 | vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit); |
| 1049 | evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); |
| 1050 | // increment counter by 8 |
| 1051 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 1052 | // AES encode |
| 1053 | roundEnc(xmm21, 1); |
| 1054 | roundEnc(xmm22, 1); |
| 1055 | roundEnc(xmm23, 1); |
| 1056 | roundEnc(xmm24, 1); |
| 1057 | roundEnc(xmm25, 1); |
| 1058 | roundEnc(xmm26, 1); |
| 1059 | roundEnc(xmm27, 1); |
| 1060 | roundEnc(xmm28, 1); |
| 1061 | roundEnc(xmm29, 1); |
| 1062 | |
| 1063 | cmpl(rounds, 52); |
| 1064 | jcc(Assembler::aboveEqual, AES192_REMAINDER8); |
| 1065 | lastroundEnc(xmm30, 1); |
| 1066 | jmp(REMAINDER8_END_LOOP); |
| 1067 | |
| 1068 | bind(AES192_REMAINDER8); |
| 1069 | roundEnc(xmm30, 1); |
| 1070 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1071 | roundEnc(xmm18, 1); |
| 1072 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1073 | cmpl(rounds, 60); |
| 1074 | jcc(Assembler::aboveEqual, AES256_REMAINDER8); |
| 1075 | lastroundEnc(xmm5, 1); |
| 1076 | jmp(REMAINDER8_END_LOOP); |
| 1077 | |
| 1078 | bind(AES256_REMAINDER8); |
| 1079 | roundEnc(xmm5, 1); |
| 1080 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1081 | roundEnc(xmm6, 1); |
| 1082 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1083 | lastroundEnc(xmm7, 1); |
| 1084 | |
| 1085 | bind(REMAINDER8_END_LOOP); |
| 1086 | // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1 |
| 1087 | // XOR PT with the encrypted counter and store as CT |
| 1088 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1089 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit); |
| 1090 | evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1091 | evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit); |
| 1092 | addq(pos, 128); |
| 1093 | |
| 1094 | cmpl(len_reg, 64); |
| 1095 | jcc(Assembler::aboveEqual, REMAINDER_4); |
| 1096 | // load mask for incrementing the counter value by 1 |
| 1097 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1098 | jmp(REMAINDER_LOOP); |
| 1099 | |
| 1100 | // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code |
| 1101 | bind(REMAINDER_4); |
| 1102 | subq(len_reg, 64); |
| 1103 | // As we process 4 blocks at a time, load mask for incrementing the counter value by 4 |
| 1104 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip) |
| 1105 | // XOR counter with first roundkey |
| 1106 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit); |
| 1107 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit); |
| 1108 | // Increment counter |
| 1109 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); |
| 1110 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit); |
| 1111 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit); |
| 1112 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit); |
| 1113 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit); |
| 1114 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit); |
| 1115 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit); |
| 1116 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit); |
| 1117 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit); |
| 1118 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit); |
| 1119 | cmpl(rounds, 52); |
| 1120 | jcc(Assembler::aboveEqual, AES192_REMAINDER4); |
| 1121 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
| 1122 | jmp(END_REMAINDER4); |
| 1123 | |
| 1124 | bind(AES192_REMAINDER4); |
| 1125 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit); |
| 1126 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1127 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit); |
| 1128 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1129 | |
| 1130 | cmpl(rounds, 60); |
| 1131 | jcc(Assembler::aboveEqual, AES256_REMAINDER4); |
| 1132 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
| 1133 | jmp(END_REMAINDER4); |
| 1134 | |
| 1135 | bind(AES256_REMAINDER4); |
| 1136 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit); |
| 1137 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1138 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit); |
| 1139 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1140 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit); |
| 1141 | // After AES encode rounds, the encrypted block cipher lies in zmm0. |
| 1142 | // XOR encrypted block cipher with PT and store 64 bytes of ciphertext |
| 1143 | bind(END_REMAINDER4); |
| 1144 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1145 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit); |
| 1146 | addq(pos, 64); |
| 1147 | // load mask for incrementing the counter value by 1 |
| 1148 | evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip) |
| 1149 | |
| 1150 | // For a single block, the AES rounds start here. |
| 1151 | bind(REMAINDER_LOOP); |
| 1152 | cmpl(len_reg, 0); |
| 1153 | jcc(Assembler::belowEqual, END); |
| 1154 | // XOR counter with first roundkey |
| 1155 | vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit); |
| 1156 | evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit); |
| 1157 | vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit); |
| 1158 | // Increment counter by 1 |
| 1159 | vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit); |
| 1160 | vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit); |
| 1161 | vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit); |
| 1162 | vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit); |
| 1163 | vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit); |
| 1164 | vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit); |
| 1165 | vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit); |
| 1166 | vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit); |
| 1167 | vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit); |
| 1168 | |
| 1169 | cmpl(rounds, 52); |
| 1170 | jcc(Assembler::aboveEqual, AES192_REMAINDER); |
| 1171 | vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
| 1172 | jmp(END_REMAINDER_LOOP); |
| 1173 | |
| 1174 | bind(AES192_REMAINDER); |
| 1175 | vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit); |
| 1176 | ev_load_key(xmm18, key, 11 * 16, xmm31); |
| 1177 | vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit); |
| 1178 | ev_load_key(xmm5, key, 12 * 16, xmm31); |
| 1179 | cmpl(rounds, 60); |
| 1180 | jcc(Assembler::aboveEqual, AES256_REMAINDER); |
| 1181 | vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
| 1182 | jmp(END_REMAINDER_LOOP); |
| 1183 | |
| 1184 | bind(AES256_REMAINDER); |
| 1185 | vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit); |
| 1186 | ev_load_key(xmm6, key, 13 * 16, xmm31); |
| 1187 | vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit); |
| 1188 | ev_load_key(xmm7, key, 14 * 16, xmm31); |
| 1189 | vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit); |
| 1190 | |
| 1191 | bind(END_REMAINDER_LOOP); |
| 1192 | // If the length register is less than the block size (16 bytes),
| 1193 | // then we store only that many bytes of the CT to the destination,
| 1194 | // as given by the length register value.
| 1195 | // Extracting the exact number of bytes is handled by EXTRACT_TAILBYTES.
| 1196 | cmpl(len_reg, 16); |
| 1197 | jcc(Assembler::less, EXTRACT_TAILBYTES); |
| 1198 | subl(len_reg, 16); |
| 1199 | // After AES encode rounds, the encrypted block cipher lies in xmm0. |
| 1200 | // If the length register is equal to 16 bytes, store CT in dest after XOR operation. |
| 1201 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
| 1202 | evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit); |
| 1203 | addl(pos, 16); |
| 1204 | |
| 1205 | jmp(REMAINDER_LOOP); |
| 1206 | |
| 1207 | bind(EXTRACT_TAILBYTES); |
| 1208 | // Save encrypted counter value in xmm0 for next invocation, before XOR operation |
| 1209 | movdqu(Address(saved_encCounter_start, 0), xmm0); |
| 1210 | // XOR encrypted block cipher in xmm0 with PT to produce CT
| 1211 | evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); |
| 1212 | // Extract up to 15 bytes of CT from xmm0 as specified by the length register
| 1213 | testptr(len_reg, 8); |
| 1214 | jcc(Assembler::zero, EXTRACT_TAIL_4BYTES); |
| 1215 | pextrq(Address(dest_addr, pos), xmm0, 0); |
| 1216 | psrldq(xmm0, 8); |
| 1217 | addl(pos, 8); |
| 1218 | bind(EXTRACT_TAIL_4BYTES); |
| 1219 | testptr(len_reg, 4); |
| 1220 | jcc(Assembler::zero, EXTRACT_TAIL_2BYTES); |
| 1221 | pextrd(Address(dest_addr, pos), xmm0, 0); |
| 1222 | psrldq(xmm0, 4); |
| 1223 | addq(pos, 4); |
| 1224 | bind(EXTRACT_TAIL_2BYTES); |
| 1225 | testptr(len_reg, 2); |
| 1226 | jcc(Assembler::zero, EXTRACT_TAIL_1BYTE); |
| 1227 | pextrw(Address(dest_addr, pos), xmm0, 0); |
| 1228 | psrldq(xmm0, 2); |
| 1229 | addl(pos, 2); |
| 1230 | bind(EXTRACT_TAIL_1BYTE); |
| 1231 | testptr(len_reg, 1); |
| 1232 | jcc(Assembler::zero, END); |
| 1233 | pextrb(Address(dest_addr, pos), xmm0, 0); |
| 1234 | addl(pos, 1); |
| 1235 | |
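// Illustrative sketch (not emitted code): the pextrq/pextrd/pextrw/pextrb cascade
// above is the SIMD analogue of peeling a 0..15 byte tail by its 8/4/2/1 bit
// components, roughly:
//
//   int off = 0;
//   if (len & 8) { store8(dst + off, ct + off); off += 8; }   // pextrq
//   if (len & 4) { store4(dst + off, ct + off); off += 4; }   // pextrd
//   if (len & 2) { store2(dst + off, ct + off); off += 2; }   // pextrw
//   if (len & 1) { dst[off] = ct[off]; }                      // pextrb
//
// where store8/store4/store2 stand for plain 8-, 4- and 2-byte stores; these are
// hypothetical helpers named here only for illustration.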
| 1236 | bind(END); |
| 1237 | // If there are no tail bytes, store counter value and exit |
| 1238 | cmpl(len_reg, 0); |
| 1239 | jcc(Assembler::equal, STORE_CTR); |
| 1240 | movl(Address(used_addr, 0), len_reg); |
| 1241 | |
| 1242 | bind(STORE_CTR); |
| 1243 | //shuffle updated counter and store it |
| 1244 | vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit); |
| 1245 | movdqu(Address(counter, 0), xmm8); |
| 1246 | // Zero out counter and key registers |
| 1247 | evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit); |
| 1248 | evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit); |
| 1249 | evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit); |
| 1250 | evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit); |
| 1251 | evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit); |
| 1252 | evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit); |
| 1253 | evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit); |
| 1254 | evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit); |
| 1255 | evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit); |
| 1256 | evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit); |
| 1257 | evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit); |
| 1258 | evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit); |
| 1259 | cmpl(rounds, 44); |
| 1260 | jcc(Assembler::belowEqual, EXIT); |
| 1261 | evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit); |
| 1262 | evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit); |
| 1263 | cmpl(rounds, 52); |
| 1264 | jcc(Assembler::belowEqual, EXIT); |
| 1265 | evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit); |
| 1266 | evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit); |
| 1267 | bind(EXIT); |
| 1268 | } |
| 1269 | |
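// gfmul_avx512 below performs a carry-less multiply of GH by HK in GF(2^128),
// followed by reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1:
// the four vpclmulqdq results form the 256-bit product per 128-bit lane, and the
// two clmul/shift phases against ghash_polynomial512 fold it back to 128 bits.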
| 1270 | void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) { |
| 1271 | const XMMRegister TMP1 = xmm0; |
| 1272 | const XMMRegister TMP2 = xmm1; |
| 1273 | const XMMRegister TMP3 = xmm2; |
| 1274 | |
| 1275 | evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit); |
| 1276 | evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit); |
| 1277 | evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit); |
| 1278 | evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit); |
| 1279 | evpxorq(GH, GH, TMP3, Assembler::AVX_512bit); |
| 1280 | vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit); |
| 1281 | vpslldq(GH, GH, 8, Assembler::AVX_512bit); |
| 1282 | evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit); |
| 1283 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
| 1284 | |
| 1285 | evmovdquq(TMP3, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, r15); |
| 1286 | evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit); |
| 1287 | vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit); |
| 1288 | evpxorq(GH, GH, TMP2, Assembler::AVX_512bit); |
| 1289 | evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit); |
| 1290 | vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit); |
| 1291 | evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit); |
| 1292 | vpslldq(GH, GH, 4, Assembler::AVX_512bit); |
| 1293 | vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit); |
| 1294 | } |
| 1295 | |
| 1296 | void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) { |
| 1297 | const XMMRegister HK = xmm6; |
| 1298 | const XMMRegister ZT5 = xmm4; |
| 1299 | const XMMRegister ZT7 = xmm7; |
| 1300 | const XMMRegister ZT8 = xmm8; |
| 1301 | |
| 1302 | Label GFMUL_AVX512; |
| 1303 | |
| 1304 | movdqu(HK, Address(htbl, 0)); |
| 1305 | movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 1306 | vpshufb(HK, HK, xmm10, Assembler::AVX_128bit); |
| 1307 | |
| 1308 | movdqu(xmm11, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 64)); // Poly |
| 1309 | movdqu(xmm12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr() + 80)); // Twoone |
| 1310 | // Compute H ^ 2 from the input subkeyH |
| 1311 | movdqu(xmm2, xmm6); |
| 1312 | vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit); |
| 1313 | vpsrlq(xmm2, xmm2, 63, Assembler::AVX_128bit); |
| 1314 | movdqu(xmm1, xmm2); |
| 1315 | vpslldq(xmm2, xmm2, 8, Assembler::AVX_128bit); |
| 1316 | vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit); |
| 1317 | vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
| 1318 | |
| 1319 | vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit); |
| 1320 | vpcmpeqd(xmm2, xmm2, xmm12, Assembler::AVX_128bit);
| 1321 | vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit); |
| 1322 | vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); |
| 1323 | movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2 |
| 1324 | // Compute the remaining three powers of H using XMM registers and all following powers using ZMM |
| 1325 | movdqu(ZT5, HK); |
| 1326 | vinserti32x4(ZT7, ZT7, HK, 3); |
| 1327 | |
| 1328 | gfmul_avx512(ZT5, HK); |
| 1329 | movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2 |
| 1330 | vinserti32x4(ZT7, ZT7, ZT5, 2); |
| 1331 | |
| 1332 | gfmul_avx512(ZT5, HK); |
| 1333 | movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3 |
| 1334 | vinserti32x4(ZT7, ZT7, ZT5, 1); |
| 1335 | |
| 1336 | gfmul_avx512(ZT5, HK); |
| 1337 | movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4 |
| 1338 | vinserti32x4(ZT7, ZT7, ZT5, 0); |
| 1339 | |
| 1340 | evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); |
| 1341 | evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); |
| 1342 | gfmul_avx512(ZT7, ZT5); |
| 1343 | evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit); |
| 1344 | evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); |
| 1345 | gfmul_avx512(ZT8, ZT5); |
| 1346 | evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit); |
| 1347 | gfmul_avx512(ZT7, ZT5); |
| 1348 | evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit); |
| 1349 | gfmul_avx512(ZT8, ZT5); |
| 1350 | evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit); |
| 1351 | gfmul_avx512(ZT7, ZT5); |
| 1352 | evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit); |
| 1353 | gfmul_avx512(ZT8, ZT5); |
| 1354 | evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit); |
| 1355 | gfmul_avx512(ZT7, ZT5); |
| 1356 | evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit); |
| 1357 | gfmul_avx512(ZT8, ZT5); |
| 1358 | evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit); |
| 1359 | gfmul_avx512(ZT7, ZT5); |
| 1360 | evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit); |
| 1361 | gfmul_avx512(ZT8, ZT5); |
| 1362 | evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit); |
| 1363 | gfmul_avx512(ZT7, ZT5); |
| 1364 | evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit); |
| 1365 | ret(0); |
| 1366 | } |
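// The table generated above appears to hold successive powers of the hash subkey,
// four 128-bit entries per 512-bit slot, with higher powers at lower offsets, so
// that ghash16_encrypt16_parallel can fetch four consecutive powers with a single
// 512-bit load.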
| 1367 | |
| 1368 | #define vclmul_reduce(out, poly, hi128, lo128, tmp0, tmp1) \
| 1369 | evpclmulqdq(tmp0, poly, lo128, 0x01, Assembler::AVX_512bit); \ |
| 1370 | vpslldq(tmp0, tmp0, 8, Assembler::AVX_512bit); \ |
| 1371 | evpxorq(tmp0, lo128, tmp0, Assembler::AVX_512bit); \ |
| 1372 | evpclmulqdq(tmp1, poly, tmp0, 0x00, Assembler::AVX_512bit); \ |
| 1373 | vpsrldq(tmp1, tmp1, 4, Assembler::AVX_512bit); \ |
| 1374 | evpclmulqdq(out, poly, tmp0, 0x10, Assembler::AVX_512bit); \ |
| 1375 | vpslldq(out, out, 4, Assembler::AVX_512bit); \ |
| 1376 | vpternlogq(out, 0x96, tmp1, hi128, Assembler::AVX_512bit); \ |
| 1377 | |
| 1378 | #define vhpxori4x128(reg, tmp) \
| 1379 | vextracti64x4(tmp, reg, 1); \ |
| 1380 | evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \ |
| 1381 | vextracti32x4(tmp, reg, 1); \ |
| 1382 | evpxorq(reg, reg, tmp, Assembler::AVX_128bit); \ |
| 1383 | |
| 1384 | #define roundEncode(key, dst1, dst2, dst3, dst4) \
| 1385 | vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \ |
| 1386 | vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \ |
| 1387 | vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \ |
| 1388 | vaesenc(dst4, dst4, key, Assembler::AVX_512bit); \ |
| 1389 | |
| 1390 | #define lastroundEncode(key, dst1, dst2, dst3, dst4) \
| 1391 | vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \ |
| 1392 | vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \ |
| 1393 | vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \ |
| 1394 | vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \ |
| 1395 | |
| 1396 | #define storeData(dst, position, src1, src2, src3, src4) \
| 1397 | evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \ |
| 1398 | evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \ |
| 1399 | evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \ |
| 1400 | evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \ |
| 1401 | |
| 1402 | #define loadData(src, position, dst1, dst2, dst3, dst4) \
| 1403 | evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \ |
| 1404 | evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \ |
| 1405 | evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \ |
| 1406 | evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \ |
| 1407 | |
| 1408 | #define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey) \
| 1409 | evpclmulqdq(dst00, ghdata, hkey, 0x00, Assembler::AVX_512bit); \ |
| 1410 | evpclmulqdq(dst01, ghdata, hkey, 0x01, Assembler::AVX_512bit); \ |
| 1411 | evpclmulqdq(dst10, ghdata, hkey, 0x10, Assembler::AVX_512bit); \ |
| 1412 | evpclmulqdq(dst11, ghdata, hkey, 0x11, Assembler::AVX_512bit); \ |
| 1413 | |
| 1414 | #define shuffleExorRnd1Key(dst0, dst1, dst2, dst3, shufmask, rndkey) \
| 1415 | vpshufb(dst0, dst0, shufmask, Assembler::AVX_512bit); \ |
| 1416 | evpxorq(dst0, dst0, rndkey, Assembler::AVX_512bit); \ |
| 1417 | vpshufb(dst1, dst1, shufmask, Assembler::AVX_512bit); \ |
| 1418 | evpxorq(dst1, dst1, rndkey, Assembler::AVX_512bit); \ |
| 1419 | vpshufb(dst2, dst2, shufmask, Assembler::AVX_512bit); \ |
| 1420 | evpxorq(dst2, dst2, rndkey, Assembler::AVX_512bit); \ |
| 1421 | vpshufb(dst3, dst3, shufmask, Assembler::AVX_512bit); \ |
| 1422 | evpxorq(dst3, dst3, rndkey, Assembler::AVX_512bit); \ |
| 1423 | |
| 1424 | #define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
| 1425 | evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \ |
| 1426 | evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \ |
| 1427 | evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \ |
| 1428 | evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \ |
| 1429 | |
| 1430 | #define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
| 1431 | vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \ |
| 1432 | vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \ |
| 1433 | vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \ |
| 1434 | vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \ |
| 1435 | |
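// The helper macros above each expand to four parallel 512-bit operations; they are
// used below to interleave AES rounds (roundEncode / lastroundEncode) with GHASH
// carry-less multiplies (carrylessMultiply) so the two dependency chains overlap.
// Note that vpternlogq with immediate 0x96 computes a three-way XOR of its operands.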
| 1436 | void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx, |
| 1437 | Register in, Register out, Register data, Register pos, bool first_time_reduction, XMMRegister addmask, bool ghash_input, Register rounds, |
| 1438 | Register ghash_pos, bool final_reduction, int i, XMMRegister counter_inc_mask) { |
| 1439 | |
| 1440 | Label AES_192, AES_256, LAST_AES_RND; |
| 1441 | const XMMRegister ZTMP0 = xmm0; |
| 1442 | const XMMRegister ZTMP1 = xmm3; |
| 1443 | const XMMRegister ZTMP2 = xmm4; |
| 1444 | const XMMRegister ZTMP3 = xmm5; |
| 1445 | const XMMRegister ZTMP5 = xmm7; |
| 1446 | const XMMRegister ZTMP6 = xmm10; |
| 1447 | const XMMRegister ZTMP7 = xmm11; |
| 1448 | const XMMRegister ZTMP8 = xmm12; |
| 1449 | const XMMRegister ZTMP9 = xmm13; |
| 1450 | const XMMRegister ZTMP10 = xmm15; |
| 1451 | const XMMRegister ZTMP11 = xmm16; |
| 1452 | const XMMRegister ZTMP12 = xmm17; |
| 1453 | |
| 1454 | const XMMRegister ZTMP13 = xmm19; |
| 1455 | const XMMRegister ZTMP14 = xmm20; |
| 1456 | const XMMRegister ZTMP15 = xmm21; |
| 1457 | const XMMRegister ZTMP16 = xmm30; |
| 1458 | const XMMRegister ZTMP17 = xmm31; |
| 1459 | const XMMRegister ZTMP18 = xmm1; |
| 1460 | const XMMRegister ZTMP19 = xmm2; |
| 1461 | const XMMRegister ZTMP20 = xmm8; |
| 1462 | const XMMRegister ZTMP21 = xmm22; |
| 1463 | const XMMRegister ZTMP22 = xmm23; |
| 1464 | |
| 1465 | // Pre increment counters |
| 1466 | vpaddd(ZTMP0, ctr_blockx, counter_inc_mask, Assembler::AVX_512bit); |
| 1467 | vpaddd(ZTMP1, ZTMP0, counter_inc_mask, Assembler::AVX_512bit); |
| 1468 | vpaddd(ZTMP2, ZTMP1, counter_inc_mask, Assembler::AVX_512bit); |
| 1469 | vpaddd(ZTMP3, ZTMP2, counter_inc_mask, Assembler::AVX_512bit); |
| 1470 | // Save counter value |
| 1471 | evmovdquq(ctr_blockx, ZTMP3, Assembler::AVX_512bit); |
| 1472 | |
| 1473 | // Reuse ZTMP17 / ZTMP18 for loading AES Keys |
| 1474 | // Pre-load AES round keys |
| 1475 | ev_load_key(ZTMP17, key, 0, xmm29); |
| 1476 | ev_load_key(ZTMP18, key, 1 * 16, xmm29); |
| 1477 | |
| 1478 | // ZTMP19 & ZTMP20 used for loading hash key |
| 1479 | // Pre-load hash key |
| 1480 | evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit); |
| 1481 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1482 | // Load data for computing ghash |
| 1483 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1484 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
| 1485 | |
| 1486 | // Xor cipher block 0 with input ghash, if available |
| 1487 | if (ghash_input) { |
| 1488 | evpxorq(ZTMP21, ZTMP21, aad_hashx, Assembler::AVX_512bit); |
| 1489 | } |
| 1490 | // Load data for computing ghash |
| 1491 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1492 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
| 1493 | |
| 1494 | // stitch AES rounds with GHASH |
| 1495 | // AES round 0, xmm24 has shuffle mask |
| 1496 | shuffleExorRnd1Key(ZTMP0, ZTMP1, ZTMP2, ZTMP3, xmm24, ZTMP17);
| 1497 | // Reuse ZTMP17 / ZTMP18 for loading remaining AES Keys |
| 1498 | ev_load_key(ZTMP17, key, 2 * 16, xmm29); |
| 1499 | // GHASH 4 blocks |
| 1500 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19);
| 1501 | // Load the next hkey and Ghash data |
| 1502 | evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1503 | evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); |
| 1504 | vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); |
| 1505 | |
| 1506 | // AES round 1 |
| 1507 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1508 | ev_load_key(ZTMP18, key, 3 * 16, xmm29); |
| 1509 | |
| 1510 | // GHASH 4 blocks(11 to 8) |
| 1511 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
| 1512 | // Load the next hkey and GDATA |
| 1513 | evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); |
| 1514 | evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); |
| 1515 | vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); |
| 1516 | |
| 1517 | // AES round 2 |
| 1518 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1519 | ev_load_key(ZTMP17, key, 4 * 16, xmm29); |
| 1520 | |
| 1521 | // GHASH 4 blocks(7 to 4) |
| 1522 | carrylessMultiply(ZTMP14, ZTMP16, ZTMP15, ZTMP13, ZTMP21, ZTMP19);
| 1523 | // AES round 3
| 1524 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1525 | ev_load_key(ZTMP18, key, 5 * 16, xmm29); |
| 1526 | |
| 1527 | // Gather(XOR) GHASH for 12 blocks |
| 1528 | xorGHASH(ZTMP5, ZTMP6, ZTMP8, ZTMP7, ZTMP9, ZTMP13, ZTMP10, ZTMP14, ZTMP12, ZTMP16, ZTMP11, ZTMP15);
| 1529 | |
| 1530 | // AES round 4
| 1531 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1532 | ev_load_key(ZTMP17, key, 6 * 16, xmm29); |
| 1533 | |
| 1534 | // load plain / cipher text(recycle registers) |
| 1535 | loadData(in, pos, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
| 1536 | |
| 1537 | // AES round 5
| 1538 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1539 | ev_load_key(ZTMP18, key, 7 * 16, xmm29); |
| 1540 | // GHASH 4 blocks(3 to 0) |
| 1541 | carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
| 1542 | |
| 1543 | // AES round 6 |
| 1544 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1545 | ev_load_key(ZTMP17, key, 8 * 16, xmm29); |
| 1546 | |
| 1547 | // gather GHASH in ZTMP6(low) and ZTMP5(high) |
| 1548 | if (first_time_reduction) { |
| 1549 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
| 1550 | evpxorq(xmm25, ZTMP7, ZTMP11, Assembler::AVX_512bit); |
| 1551 | evpxorq(xmm27, ZTMP5, ZTMP9, Assembler::AVX_512bit); |
| 1552 | evpxorq(xmm26, ZTMP6, ZTMP10, Assembler::AVX_512bit); |
| 1553 | } |
| 1554 | else if (!first_time_reduction && !final_reduction) { |
| 1555 | xorGHASH(ZTMP7, xmm25, xmm27, xmm26, ZTMP8, ZTMP12, ZTMP7, ZTMP11, ZTMP5, ZTMP9, ZTMP6, ZTMP10);
| 1556 | } |
| 1557 | |
| 1558 | if (final_reduction) { |
| 1559 | // Phase one: Add mid products together |
| 1560 | // Also load polynomial constant for reduction |
| 1561 | vpternlogq(ZTMP7, 0x96, ZTMP8, ZTMP12, Assembler::AVX_512bit); |
| 1562 | vpternlogq(ZTMP7, 0x96, xmm25, ZTMP11, Assembler::AVX_512bit); |
| 1563 | vpsrldq(ZTMP11, ZTMP7, 8, Assembler::AVX_512bit); |
| 1564 | vpslldq(ZTMP7, ZTMP7, 8, Assembler::AVX_512bit); |
| 1565 | evmovdquq(ZTMP12, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
| 1566 | } |
| 1567 | // AES round 7 |
| 1568 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1569 | ev_load_key(ZTMP18, key, 9 * 16, xmm29); |
| 1570 | if (final_reduction) { |
| 1571 | vpternlogq(ZTMP5, 0x96, ZTMP9, ZTMP11, Assembler::AVX_512bit); |
| 1572 | evpxorq(ZTMP5, ZTMP5, xmm27, Assembler::AVX_512bit); |
| 1573 | vpternlogq(ZTMP6, 0x96, ZTMP10, ZTMP7, Assembler::AVX_512bit); |
| 1574 | evpxorq(ZTMP6, ZTMP6, xmm26, Assembler::AVX_512bit); |
| 1575 | } |
| 1576 | // AES round 8 |
| 1577 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1578 | ev_load_key(ZTMP17, key, 10 * 16, xmm29); |
| 1579 | |
| 1580 | // Horizontal xor of low and high 4*128 |
| 1581 | if (final_reduction) { |
| 1582 | vhpxori4x128(ZTMP5, ZTMP9);
| 1583 | vhpxori4x128(ZTMP6, ZTMP10);
| 1584 | } |
| 1585 | // AES round 9 |
| 1586 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1587 | // First phase of reduction |
| 1588 | if (final_reduction) { |
| 1589 | evpclmulqdq(ZTMP10, ZTMP12, ZTMP6, 0x01, Assembler::AVX_128bit); |
| 1590 | vpslldq(ZTMP10, ZTMP10, 8, Assembler::AVX_128bit); |
| 1591 | evpxorq(ZTMP10, ZTMP6, ZTMP10, Assembler::AVX_128bit); |
| 1592 | } |
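// The reduction started here is completed under LAST_AES_RND below; together the
// two clmul/shift phases fold the accumulated 256-bit GHASH product back to 128
// bits using the polynomial held in ZTMP12.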
| 1593 | cmpl(rounds, 52); |
| 1594 | jcc(Assembler::greaterEqual, AES_192); |
| 1595 | jmp(LAST_AES_RND); |
| 1596 | // AES rounds up to 11 (AES192) or 13 (AES256)
| 1597 | bind(AES_192); |
| 1598 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1599 | ev_load_key(ZTMP18, key, 11 * 16, xmm29); |
| 1600 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1601 | ev_load_key(ZTMP17, key, 12 * 16, xmm29); |
| 1602 | cmpl(rounds, 60); |
| 1603 | jcc(Assembler::aboveEqual, AES_256); |
| 1604 | jmp(LAST_AES_RND); |
| 1605 | |
| 1606 | bind(AES_256); |
| 1607 | roundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1608 | ev_load_key(ZTMP18, key, 13 * 16, xmm29); |
| 1609 | roundEncode(ZTMP18, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1610 | ev_load_key(ZTMP17, key, 14 * 16, xmm29); |
| 1611 | |
| 1612 | bind(LAST_AES_RND); |
| 1613 | // Second phase of reduction |
| 1614 | if (final_reduction) { |
| 1615 | evpclmulqdq(ZTMP9, ZTMP12, ZTMP10, 0x00, Assembler::AVX_128bit); |
| 1616 | vpsrldq(ZTMP9, ZTMP9, 4, Assembler::AVX_128bit); // Shift-R 1-DW to obtain 2-DWs shift-R |
| 1617 | evpclmulqdq(ZTMP11, ZTMP12, ZTMP10, 0x10, Assembler::AVX_128bit); |
| 1618 | vpslldq(ZTMP11, ZTMP11, 4, Assembler::AVX_128bit); // Shift-L 1-DW for result |
| 1619 | // ZTMP5 = ZTMP5 X ZTMP11 X ZTMP9 |
| 1620 | vpternlogq(ZTMP5, 0x96, ZTMP11, ZTMP9, Assembler::AVX_128bit); |
| 1621 | } |
| 1622 | // Last AES round |
| 1623 | lastroundEncode(ZTMP17, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1624 | // XOR against plain / cipher text |
| 1625 | xorBeforeStore(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP13, ZTMP14, ZTMP15, ZTMP16);
| 1626 | // store cipher / plain text |
| 1627 | storeData(out, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1628 | } |
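// Each invocation of ghash16_encrypt16_parallel consumes four 64-byte entries of
// the avx512 subkey table starting at index 'i' (loaded into ZTMP19/ZTMP20 above),
// which is why the caller advances its 'index' by 4 between calls.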
| 1629 | |
| 1630 | void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, |
| 1631 | Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) { |
| 1632 | Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32, |
| 1633 | AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16; |
| 1634 | const XMMRegister CTR_BLOCKx = xmm9; |
| 1635 | const XMMRegister AAD_HASHx = xmm14; |
| 1636 | const Register pos = rax; |
| 1637 | const Register rounds = r15; |
| 1638 | Register ghash_pos; |
| 1639 | #ifndef _WIN64 |
| 1640 | ghash_pos = r14; |
| 1641 | #else |
| 1642 | ghash_pos = r11; |
| 1643 | #endif // !_WIN64 |
| 1644 | const XMMRegister ZTMP0 = xmm0; |
| 1645 | const XMMRegister ZTMP1 = xmm3; |
| 1646 | const XMMRegister ZTMP2 = xmm4; |
| 1647 | const XMMRegister ZTMP3 = xmm5; |
| 1648 | const XMMRegister ZTMP4 = xmm6; |
| 1649 | const XMMRegister ZTMP5 = xmm7; |
| 1650 | const XMMRegister ZTMP6 = xmm10; |
| 1651 | const XMMRegister ZTMP7 = xmm11; |
| 1652 | const XMMRegister ZTMP8 = xmm12; |
| 1653 | const XMMRegister ZTMP9 = xmm13; |
| 1654 | const XMMRegister ZTMP10 = xmm15; |
| 1655 | const XMMRegister ZTMP11 = xmm16; |
| 1656 | const XMMRegister ZTMP12 = xmm17; |
| 1657 | const XMMRegister ZTMP13 = xmm19; |
| 1658 | const XMMRegister ZTMP14 = xmm20; |
| 1659 | const XMMRegister ZTMP15 = xmm21; |
| 1660 | const XMMRegister ZTMP16 = xmm30; |
| 1661 | const XMMRegister COUNTER_INC_MASK = xmm18; |
| 1662 | |
| 1663 | movl(pos, 0); // Total length processed |
| 1664 | // Min data size processed = 768 bytes |
| 1665 | cmpl(len, 768); |
| 1666 | jcc(Assembler::less, ENC_DEC_DONE); |
| 1667 | |
| 1668 | // Generate 48 constants for htbl |
| 1669 | call(GENERATE_HTBL_48_BLKS, relocInfo::none); |
| 1670 | int index = 0; // Index for choosing subkeyHtbl entry |
| 1671 | movl(ghash_pos, 0); // Pointer for ghash read and store operations |
| 1672 | |
| 1673 | // Move initial counter value and STATE value into variables |
| 1674 | movdqu(CTR_BLOCKx, Address(counter, 0)); |
| 1675 | movdqu(AAD_HASHx, Address(state, 0)); |
| 1676 | // Load lswap mask for ghash |
| 1677 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()), rbx); |
| 1678 | // Shuffle input state using lswap mask |
| 1679 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
| 1680 | |
| 1681 | // Compute #rounds for AES based on the length of the key array |
| 1682 | movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| 1683 | |
| 1684 | // Broadcast counter value to 512 bit register |
| 1685 | evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit); |
| 1686 | // Load counter shuffle mask |
| 1687 | evmovdquq(xmm24, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, rbx); |
| 1688 | // Shuffle counter |
| 1689 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
| 1690 | |
| 1691 | // Load mask for incrementing counter |
| 1692 | evmovdquq(COUNTER_INC_MASK, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, rbx); |
| 1693 | // Pre-increment counter |
| 1694 | vpaddd(ZTMP5, CTR_BLOCKx, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, rbx); |
| 1695 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1696 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1697 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
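// ZTMP5..ZTMP8 now hold 16 counter blocks (four 128-bit blocks per ZMM register),
// i.e. 256 bytes of keystream per pass through AES_32_BLOCKS below.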
| 1698 | |
| 1699 | // Begin 32 blocks of AES processing |
| 1700 | bind(AES_32_BLOCKS); |
| 1701 | // Save incremented counter before overwriting it with AES data |
| 1702 | evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit); |
| 1703 | |
| 1704 | // Move 256 bytes of data |
| 1705 | loadData(in, pos, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1706 | // Load key shuffle mask |
| 1707 | movdqu(xmm29, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); |
| 1708 | // Load 0th AES round key |
| 1709 | ev_load_key(ZTMP4, key, 0, xmm29); |
| 1710 | // AES-ROUND0, xmm24 has the shuffle mask |
| 1711 | shuffleExorRnd1Key(ZTMP5, ZTMP6, ZTMP7, ZTMP8, xmm24, ZTMP4);
| 1712 | |
| 1713 | for (int j = 1; j < 10; j++) { |
| 1714 | ev_load_key(ZTMP4, key, j * 16, xmm29); |
| 1715 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1716 | } |
| 1717 | ev_load_key(ZTMP4, key, 10 * 16, xmm29); |
| 1718 | // AES rounds up to 11 (AES192) or 13 (AES256)
| 1719 | cmpl(rounds, 52); |
| 1720 | jcc(Assembler::greaterEqual, AES_192); |
| 1721 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1722 | jmp(STORE_CT); |
| 1723 | |
| 1724 | bind(AES_192); |
| 1725 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1726 | ev_load_key(ZTMP4, key, 11 * 16, xmm29); |
| 1727 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1728 | cmpl(rounds, 60); |
| 1729 | jcc(Assembler::aboveEqual, AES_256); |
| 1730 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
| 1731 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1732 | jmp(STORE_CT); |
| 1733 | |
| 1734 | bind(AES_256); |
| 1735 | ev_load_key(ZTMP4, key, 12 * 16, xmm29); |
| 1736 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1737 | ev_load_key(ZTMP4, key, 13 * 16, xmm29); |
| 1738 | roundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1739 | ev_load_key(ZTMP4, key, 14 * 16, xmm29); |
| 1740 | // Last AES round |
| 1741 | lastroundEncode(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1742 | |
| 1743 | bind(STORE_CT); |
| 1744 | // Xor the encrypted key with PT to obtain CT |
| 1745 | xorBeforeStore(ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP0, ZTMP1, ZTMP2, ZTMP3);
| 1746 | storeData(out, pos, ZTMP5, ZTMP6, ZTMP7, ZTMP8);
| 1747 | // 16 blocks encryption completed |
| 1748 | addl(pos, 256); |
| 1749 | cmpl(pos, 512); |
| 1750 | jcc(Assembler::aboveEqual, GHASH_AES_PARALLEL); |
| 1751 | vpaddd(ZTMP5, CTR_BLOCKx, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1752 | vpaddd(ZTMP6, ZTMP5, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1753 | vpaddd(ZTMP7, ZTMP6, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1754 | vpaddd(ZTMP8, ZTMP7, COUNTER_INC_MASK, Assembler::AVX_512bit); |
| 1755 | jmp(AES_32_BLOCKS); |
| 1756 | |
| 1757 | bind(GHASH_AES_PARALLEL); |
| 1758 | // ghash16_encrypt16_parallel is invoked with one of three reduction modes:
| 1759 | // 1) First time -> the first cipher block is XORed with the input ghash
| 1760 | // 2) No reduction -> carryless multiplication results are accumulated
| 1761 | // 3) Final reduction after 48 blocks -> a new ghash value is computed for the next round
| 1762 | // Reduction value = first time |
| 1763 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1764 | addl(pos, 256); |
| 1765 | addl(ghash_pos, 256); |
| 1766 | index += 4; |
| 1767 | |
| 1768 | // At this point we have processed 768 bytes of AES and 256 bytes of GHASH. |
| 1769 | // If the remaining length is less than 768, process remaining 512 bytes of ghash in GHASH_LAST_32 code |
| 1770 | subl(len, 768); |
| 1771 | cmpl(len, 768); |
| 1772 | jcc(Assembler::less, GHASH_LAST_32); |
| 1773 | |
| 1774 | // AES 16 blocks and GHASH 16 blocks in parallel |
| 1775 | // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times |
| 1776 | // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations |
| 1777 | // Each call uses 4 subkeyHtbl values, so increment the index by 4. |
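// Note that ghash_pos stays 512 bytes behind pos throughout this pipeline, which
// is why GHASH_LAST_32 below still has 512 bytes of ciphertext left to hash once
// the AES processing is finished.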
| 1778 | bind(GHASH_16_AES_16); |
| 1779 | // Reduction value = no reduction |
| 1780 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1781 | addl(pos, 256); |
| 1782 | addl(ghash_pos, 256); |
| 1783 | index += 4; |
| 1784 | // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash |
| 1785 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); |
| 1786 | addl(pos, 256); |
| 1787 | addl(ghash_pos, 256); |
| 1788 | // The calculated ghash value needs to be moved to AAD_HASHx so that we can restart the ghash16-aes16 pipeline
| 1789 | movdqu(AAD_HASHx, ZTMP5); |
| 1790 | index = 0; // Reset subkeyHtbl index |
| 1791 | |
| 1792 | // Restart the pipeline |
| 1793 | // Reduction value = first time |
| 1794 | ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); |
| 1795 | addl(pos, 256); |
| 1796 | addl(ghash_pos, 256); |
| 1797 | index += 4; |
Value stored to 'index' is never read | |
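// Note on the analyzer finding above: 'index' is a C++ variable that is only live
// while this stub is being generated; the table offsets are baked into the emitted
// instructions. The emitted code loops back to GHASH_16_AES_16 at run time, but the
// generator does not, so the final 'index += 4' is indeed a dead store (harmless,
// though it could be dropped to silence the warning).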
| 1798 | |
| 1799 | subl(len, 768); |
| 1800 | cmpl(len, 768); |
| 1801 | jcc(Assembler::greaterEqual, GHASH_16_AES_16); |
| 1802 | |
| 1803 | // GHASH last 32 blocks processed here |
| 1804 | // GHASH products accumulated in ZMM27, ZMM25 and ZMM26 during the GHASH16-AES16 operation are used here
| 1805 | bind(GHASH_LAST_32); |
| 1806 | // Use rbx as a pointer to the htbl; For last 32 blocks of GHASH, use key# 4-11 entry in subkeyHtbl |
| 1807 | movl(rbx, 256); |
| 1808 | // Load cipher blocks |
| 1809 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1810 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1811 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
| 1812 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
| 1813 | // Load ghash keys |
| 1814 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1815 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1816 | |
| 1817 | // Ghash blocks 0 - 3 |
| 1818 | carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15);
| 1819 | // Ghash blocks 4 - 7 |
| 1820 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP14, ZTMP16);
| 1821 | |
| 1822 | vpternlogq(ZTMP1, 0x96, ZTMP5, xmm27, Assembler::AVX_512bit); // ZTMP1 = ZTMP1 + ZTMP5 + zmm27 |
| 1823 | vpternlogq(ZTMP2, 0x96, ZTMP6, xmm26, Assembler::AVX_512bit); // ZTMP2 = ZTMP2 + ZTMP6 + zmm26 |
| 1824 | vpternlogq(ZTMP3, 0x96, ZTMP7, xmm25, Assembler::AVX_512bit); // ZTMP3 = ZTMP3 + ZTMP7 + zmm25 |
| 1825 | evpxorq(ZTMP4, ZTMP4, ZTMP8, Assembler::AVX_512bit); // ZTMP4 = ZTMP4 + ZTMP8 |
| 1826 | |
| 1827 | addl(ghash_pos, 128); |
| 1828 | addl(rbx, 128); |
| 1829 | |
| 1830 | // Ghash remaining blocks |
| 1831 | bind(LOOP); |
| 1832 | cmpl(ghash_pos, pos); |
| 1833 | jcc(Assembler::aboveEqual, ACCUMULATE); |
| 1834 | // Load next cipher blocks and corresponding ghash keys |
| 1835 | evmovdquq(ZTMP13, Address(ct, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1836 | evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1837 | vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); |
| 1838 | vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); |
| 1839 | evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); |
| 1840 | evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); |
| 1841 | |
| 1842 | // ghash blocks 0 - 3 |
| 1843 | carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15);
| 1844 | |
| 1845 | // ghash blocks 4 - 7 |
| 1846 | carrylessMultiply(ZTMP10, ZTMP11, ZTMP12, ZTMP9, ZTMP14, ZTMP16);
| 1847 | |
| 1848 | // update sums |
| 1849 | // ZTMP1 = ZTMP1 + ZTMP5 + ZTMP9 |
| 1850 | // ZTMP2 = ZTMP2 + ZTMP6 + ZTMP10 |
| 1851 | // ZTMP3 = ZTMP3 + ZTMP7 xor ZTMP11 |
| 1852 | // ZTMP4 = ZTMP4 + ZTMP8 xor ZTMP12 |
| 1853 | xorGHASH(ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP9, ZTMP6, ZTMP10, ZTMP7, ZTMP11, ZTMP8, ZTMP12);
| 1854 | addl(ghash_pos, 128); |
| 1855 | addl(rbx, 128); |
| 1856 | jmp(LOOP); |
| 1857 | |
| 1858 | // Integrate ZTMP3/ZTMP4 into ZTMP1 and ZTMP2 |
| 1859 | bind(ACCUMULATE); |
| 1860 | evpxorq(ZTMP3, ZTMP3, ZTMP4, Assembler::AVX_512bit); |
| 1861 | vpsrldq(ZTMP7, ZTMP3, 8, Assembler::AVX_512bit); |
| 1862 | vpslldq(ZTMP8, ZTMP3, 8, Assembler::AVX_512bit); |
| 1863 | evpxorq(ZTMP1, ZTMP1, ZTMP7, Assembler::AVX_512bit); |
| 1864 | evpxorq(ZTMP2, ZTMP2, ZTMP8, Assembler::AVX_512bit); |
| 1865 | |
| 1866 | // Add the 128-bit words of ZTMP1 and ZTMP2 horizontally
| 1867 | vhpxori4x128(ZTMP1, ZTMP11);
| 1868 | vhpxori4x128(ZTMP2, ZTMP12);
| 1869 | // Load reduction polynomial and compute final reduction |
| 1870 | evmovdquq(ZTMP15, ExternalAddress(StubRoutines::x86::ghash_polynomial512_addr()), Assembler::AVX_512bit, rbx); |
| 1871 | vclmul_reduce(AAD_HASHx, ZTMP15, ZTMP1, ZTMP2, ZTMP3, ZTMP4);
| 1872 | |
| 1873 | // Pre-increment counter for next operation |
| 1874 | vpaddd(CTR_BLOCKx, CTR_BLOCKx, xmm18, Assembler::AVX_128bit); |
| 1875 | // Shuffle counter and save the updated value |
| 1876 | vpshufb(CTR_BLOCKx, CTR_BLOCKx, xmm24, Assembler::AVX_512bit); |
| 1877 | movdqu(Address(counter, 0), CTR_BLOCKx); |
| 1878 | // Load ghash lswap mask |
| 1879 | movdqu(xmm24, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); |
| 1880 | // Shuffle ghash using lbswap_mask and store it |
| 1881 | vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit); |
| 1882 | movdqu(Address(state, 0), AAD_HASHx); |
| 1883 | jmp(ENC_DEC_DONE); |
| 1884 | |
| 1885 | bind(GENERATE_HTBL_48_BLKS); |
| 1886 | generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl); |
| 1887 | |
| 1888 | bind(ENC_DEC_DONE); |
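// The total number of bytes processed is returned in rax; any remaining tail
// (less than the 768-byte minimum) is presumably handled by the caller outside
// this intrinsic.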
| 1889 | movq(rax, pos); |
| 1890 | } |
| 1891 | |
| 1892 | #endif // _LP64 |