1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2018, Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
39 ; This code was derived and highly optimized from the code described in paper:
40 ; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
41 ; The details of the implementation is explained in:
42 ; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
53 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
54 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
55 ; | Salt (From the SA) |
56 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
57 ; | Initialization Vector |
58 ; | (This is the sequence number from IPSec header) |
59 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
61 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
66 ; AAD will be padded with 0 to the next 16byte multiple
67 ; for example, assume AAD is a u32 vector
71 ; padded AAD in xmm register = {A1 A0 0 0}
74 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
75 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
77 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
78 ; | 32-bit Sequence Number (A0) |
79 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ; AAD Format with 32-bit Sequence Number
86 ; AAD[3] = {A0, A1, A2};
87 ; padded AAD in xmm register = {A2 A1 A0 0}
90 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
91 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
94 ; | 64-bit Extended Sequence Number {A1,A0} |
96 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ; AAD Format with 64-bit Extended Sequence Number
104 ; Must be a multiple of 4 bytes and from the definition of the spec.
105 ; The code additionally supports any aadLen length.
108 ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
110 ; poly = x^128 + x^127 + x^126 + x^121 + 1
111 ; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
115 %include "reg_sizes.asm"
116 %include "gcm_defines.asm"
117 %include "mb_mgr_datastruct.asm"
118 %include "job_aes_hmac.asm"
119 %include "memcpy.asm"
124 %error "No GCM mode selected for gcm_avx512.asm!"
129 ;; Decide on AES-GCM key size to compile for
;; FN_NAME builds the exported symbol name, e.g. aes_gcm_enc_128_..._vaes_avx512.
;; NOTE(review): the %ifdef guards selecting exactly one of the three variants
;; are not visible in this excerpt — confirm only one FN_NAME is active per build.
132 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512
137 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512
142 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512
148 ; need to push 4 registers into stack to maintain
149 %define STACK_OFFSET 8*4
;; On Windows x64 the xmm6-xmm15 registers are callee-saved and need 10*16 bytes
;; of spill space; on other ABIs no xmm spill area is reserved.
151 %ifidn __OUTPUT_FORMAT__, win64
152 %define XMM_STORAGE 16*10
154 %define XMM_STORAGE 0
;; Stack-frame slots used as temporary storage for AES counter-block states 2..8
;; (state 1 stays in an xmm register); offsets are relative to the local area.
157 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
158 %define TMP3 16*1 ; Temporary storage for AES State 3
159 %define TMP4 16*2 ; Temporary storage for AES State 4
160 %define TMP5 16*3 ; Temporary storage for AES State 5
161 %define TMP6 16*4 ; Temporary storage for AES State 6
162 %define TMP7 16*5 ; Temporary storage for AES State 7
163 %define TMP8 16*6 ; Temporary storage for AES State 8
164 %define LOCAL_STORAGE 16*7
165 %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
;; AVX512 code path uses a smaller local area plus room for up to 10 GP registers.
167 %define LOCAL_STORAGE_AVX512 2*8 ; temporary storage
168 %define STACK_SIZE_GP_AVX512 10*8 ; up to 10 GP registers (5 GP + 3 reserve places for the algorithmic code)
169 %define STACK_OFFSET_AVX512 (LOCAL_STORAGE_AVX512 + XMM_STORAGE)
170 %define VARIABLE_OFFSET_AVX512 (LOCAL_STORAGE_AVX512 + XMM_STORAGE + STACK_SIZE_GP_AVX512)
172 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
176 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
177 ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
178 ; Input: A and B (128-bits each, bit-reflected)
179 ; Output: C = A*B*x mod poly, (i.e. >>1 )
180 ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
181 ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; NOTE(review): the %macro opener and the T1..T5 %define lines are not visible in
; this excerpt; %%T1..%%T3 below are presumably further macro parameters — confirm.
184 %define %%GH %1 ; 16 Bytes
185 %define %%HK %2 ; 16 Bytes
191 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Schoolbook 128x128 carry-less multiply: four VPCLMULQDQ partial products.
193 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
194 vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
195 vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
196 vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
197 vpxor %%GH, %%GH, %%T3 ; middle term = a1*b0 + a0*b1
; Split the 128-bit middle term across the high (T1) and low (GH) halves
; of the 256-bit product.
200 vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
201 vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
203 vpxor %%T1, %%T1, %%T3 ; high 128 bits of the product
204 vpxor %%GH, %%GH, %%T2 ; low 128 bits of the product
206 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
207 ;first phase of the reduction
; Modular reduction by the GHASH polynomial, folded via the POLY2 constant.
208 vmovdqu %%T3, [rel POLY2]
210 vpclmulqdq %%T2, %%T3, %%GH, 0x01
211 vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
213 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
214 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
215 ;second phase of the reduction
216 vpclmulqdq %%T2, %%T3, %%GH, 0x00
217 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
219 vpclmulqdq %%GH, %%T3, %%GH, 0x10
220 vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
222 vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
223 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Fold the reduced high half back into the low half: final 128-bit result.
224 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
228 ; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
229 ; functions, but are kept to allow users to switch cpu architectures between calls
230 ; of pre, init, update, and finalize.
; Precompute HashKey^i (i = 2..8) via repeated GHASH_MUL, storing each power in
; the key data structure so GHASH of multiple blocks can be parallelized.
; HashKey_i_k stores the XOR of the low and high qwords of HashKey_i, used by
; Karatsuba-style multiply variants on older CPU paths.
241 ; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
; vpshufd 01001110b swaps the two 64-bit halves; NOTE(review): the vpxor that
; combines halves before the store is not visible in this excerpt — confirm.
244 vpshufd %%T1, %%T5, 01001110b
246 vmovdqu [%%GDATA + HashKey_k], %%T1
248 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
249 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
250 vpshufd %%T1, %%T5, 01001110b
252 vmovdqu [%%GDATA + HashKey_2_k], %%T1
254 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
255 vmovdqu [%%GDATA + HashKey_3], %%T5
256 vpshufd %%T1, %%T5, 01001110b
258 vmovdqu [%%GDATA + HashKey_3_k], %%T1
260 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
261 vmovdqu [%%GDATA + HashKey_4], %%T5
262 vpshufd %%T1, %%T5, 01001110b
264 vmovdqu [%%GDATA + HashKey_4_k], %%T1
266 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
267 vmovdqu [%%GDATA + HashKey_5], %%T5
268 vpshufd %%T1, %%T5, 01001110b
270 vmovdqu [%%GDATA + HashKey_5_k], %%T1
272 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
273 vmovdqu [%%GDATA + HashKey_6], %%T5
274 vpshufd %%T1, %%T5, 01001110b
276 vmovdqu [%%GDATA + HashKey_6_k], %%T1
278 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
279 vmovdqu [%%GDATA + HashKey_7], %%T5
280 vpshufd %%T1, %%T5, 01001110b
282 vmovdqu [%%GDATA + HashKey_7_k], %%T1
284 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
285 vmovdqu [%%GDATA + HashKey_8], %%T5
286 vpshufd %%T1, %%T5, 01001110b
288 vmovdqu [%%GDATA + HashKey_8_k], %%T1
292 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293 ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
294 ; Returns 0 if data has length 0.
295 ; Input: The input data (INPUT), that data's length (LENGTH).
296 ; Output: The packed xmm register (OUTPUT).
; Uses AVX512 byte masking: LENGTH indexes a lookup table of bit masks, and a
; masked, zero-filling vmovdqu8 load reads exactly LENGTH bytes (no over-read).
; Clobbers k1 and the temporary register supplied by the caller.
297 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
298 %macro READ_SMALL_DATA_INPUT 4
299 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register
; NOTE(review): %defines for %%INPUT/%%LENGTH/%%TMP1 (params %2..%4) are not
; visible in this excerpt — confirm against the full source.
304 lea %%TMP1, [rel byte_len_to_mask_table]
305 %ifidn __OUTPUT_FORMAT__, win64
; Table entries are 16-bit masks, hence the *2 scaling on the byte length.
310 kmovw k1, [%%TMP1 + %%LENGTH*2]
312 vmovdqu8 XWORD(%%OUTPUT){k1}{z}, [%%INPUT] ; masked load, upper bytes zeroed
314 %endmacro ; READ_SMALL_DATA_INPUT
317 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
318 ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
319 ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
320 ; Output: The hash of the data (AAD_HASH).
; Strategy: consume AAD in 128-byte strides using precomputed HashKey_8..HashKey_1
; powers (8-way parallel GHASH), then a variable 16-byte-block tail using the
; matching smaller powers, then a final sub-16-byte block via READ_SMALL_DATA_INPUT.
321 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
322 %macro CALC_AAD_HASH 13
; NOTE(review): %defines for params %1/%2 (A_IN/A_LEN), %7..%9 (XTMP2..XTMP4) and
; %12/%13 (T2/T3) are not visible in this excerpt.
325 %define %%AAD_HASH %3
326 %define %%GDATA_KEY %4
327 %define %%XTMP0 %5 ; xmm temp reg 0
328 %define %%XTMP1 %6 ; xmm temp reg 1
332 %define %%XTMP5 %10 ; xmm temp reg 5
333 %define %%T1 %11 ; temp reg 1
338 mov %%T1, %%A_IN ; T1 = AAD
339 mov %%T2, %%A_LEN ; T2 = aadLen
340 vpxor %%AAD_HASH, %%AAD_HASH ; start hash accumulator at zero
; --- 128-byte main loop: 8 blocks hashed per iteration ---
344 jl %%_exit_AAD_loop128
346 vmovdqu %%XTMP0, [%%T1 + 16*0]
347 vpshufb %%XTMP0, [rel SHUF_MASK] ; byte-reflect for GHASH bit ordering
; Fold running hash into the first (oldest) block so one combined multiply chain
; covers hash and data.
349 vpxor %%XTMP0, %%AAD_HASH
351 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
352 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
353 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
354 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
355 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
356 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
; Remaining 7 blocks of the stride (preprocessor loop body; %rep/%assign lines
; for i and j are not visible in this excerpt): block i is multiplied by
; HashKey_j and accumulated into the three partial-product lanes.
361 vmovdqu %%XTMP0, [%%T1 + 16*i]
362 vpshufb %%XTMP0, [rel SHUF_MASK]
364 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
365 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
366 vpxor %%XTMP1, %%XTMP1, %%XTMP4
368 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
369 vpxor %%XTMP2, %%XTMP2, %%XTMP4
371 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
372 vpxor %%XTMP3, %%XTMP3, %%XTMP4
373 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
374 vpxor %%XTMP3, %%XTMP3, %%XTMP4
; Distribute the middle term across the high/low product halves.
379 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
380 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
381 vpxor %%XTMP2, %%XTMP2, %%XTMP4
382 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 ;first phase of the reduction
386 vmovdqa %%XTMP5, [rel POLY2]
387 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
388 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
389 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
392 ;second phase of the reduction
393 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
394 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
396 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
397 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
399 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
400 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
401 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
407 jmp %%_get_AAD_loop128
; --- tail: fewer than 128 bytes remain ---
411 jl %%_get_small_AAD_block
413 ;; calculate hash_key position to start with
415 and %%T3, -16 ; 1 to 7 blocks possible here
417 add %%T3, HashKey_1 + 16
418 lea %%T3, [%%GDATA_KEY + %%T3] ; %%T3 -> HashKey power matching remaining block count
420 vmovdqu %%XTMP0, [%%T1]
421 vpshufb %%XTMP0, [rel SHUF_MASK]
423 vpxor %%XTMP0, %%AAD_HASH ; fold running hash into first tail block
425 vmovdqu %%XTMP5, [%%T3]
426 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
427 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
428 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
429 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
430 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
432 add %%T3, 16 ; move to next hashkey
433 add %%T1, 16 ; move to next data block
; Loop body for each further full 16-byte tail block (loop control lines not
; visible in this excerpt): accumulate into the same three lanes.
439 vmovdqu %%XTMP0, [%%T1]
440 vpshufb %%XTMP0, [rel SHUF_MASK]
442 vmovdqu %%XTMP5, [%%T3]
443 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
444 vpxor %%XTMP1, %%XTMP1, %%XTMP4
446 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
447 vpxor %%XTMP2, %%XTMP2, %%XTMP4
449 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
450 vpxor %%XTMP3, %%XTMP3, %%XTMP4
451 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
452 vpxor %%XTMP3, %%XTMP3, %%XTMP4
454 add %%T3, 16 ; move to next hashkey
; Same split/reduce sequence as the 128-byte path, applied to the tail blocks.
462 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
463 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
464 vpxor %%XTMP2, %%XTMP2, %%XTMP4
465 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
467 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
468 ;first phase of the reduction
469 vmovdqa %%XTMP5, [rel POLY2]
470 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
471 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
472 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
474 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
475 ;second phase of the reduction
476 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
477 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
479 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
480 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
482 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
483 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
484 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
; --- final sub-16-byte block (if any) ---
489 %%_get_small_AAD_block:
490 vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
491 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3
492 ;byte-reflect the AAD data
493 vpshufb %%XTMP1, [rel SHUF_MASK]
494 vpxor %%AAD_HASH, %%XTMP1
495 GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
499 %endmacro ; CALC_AAD_HASH
501 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
502 ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
503 ; Requires the input data be at least 1 byte long.
504 ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
505 ; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
506 ; and whether encoding or decoding (ENC_DEC)
507 ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
508 ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
509 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
510 %macro PARTIAL_BLOCK 8
511 %define %%GDATA_KEY %1
512 %define %%GDATA_CTX %2
513 %define %%CYPH_PLAIN_OUT %3
514 %define %%PLAIN_CYPH_IN %4
515 %define %%PLAIN_CYPH_LEN %5
516 %define %%DATA_OFFSET %6
517 %define %%AAD_HASH %7
; r13 = number of bytes already buffered in the context's partial block.
520 mov r13, [%%GDATA_CTX + PBlockLen]
522 je %%_partial_block_done ;Leave Macro if no partial blocks
524 cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
525 jl %%_fewer_than_16_bytes
526 VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
529 %%_fewer_than_16_bytes:
530 lea r10, [%%PLAIN_CYPH_IN]
531 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax
533 %%_data_read: ;Finished reading in data
; xmm9 = saved E(K, Yn) keystream for the partial block; xmm13 = HashKey.
535 vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
536 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
; Rotate the keystream so the unused portion lines up with the new input bytes.
538 lea r12, [rel SHIFT_MASK]
540 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
541 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
542 vpshufb xmm9, xmm2 ;shift right r13 bytes
544 %ifidn %%ENC_DEC, DEC
547 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
549 mov r15, %%PLAIN_CYPH_LEN
551 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
552 jge %%_no_extra_mask ;Determine if partial block is not being filled and shift mask accordingly
; Partial block still not full: mask off the bytes beyond the input length.
556 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
557 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
559 %ifidn %%ENC_DEC, DEC
561 vpshufb xmm3, [rel SHUF_MASK] ; byte-reflect ciphertext for GHASH
563 vpxor %%AAD_HASH, xmm3
565 vpshufb xmm9, [rel SHUF_MASK] ; byte-reflect ciphertext for GHASH
567 vpxor %%AAD_HASH, xmm9
570 jl %%_partial_incomplete
; Block is now full: run GHASH and clear the buffered-length counter.
572 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
574 mov [%%GDATA_CTX + PBlockLen], rax
576 %%_partial_incomplete:
; win64 path avoids a memory-to-memory form; %%PLAIN_CYPH_LEN may be a stack arg there.
577 %ifidn __OUTPUT_FORMAT__, win64
578 mov rax, %%PLAIN_CYPH_LEN
579 add [%%GDATA_CTX + PBlockLen], rax
581 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
584 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; persist updated hash in context
586 %ifidn %%ENC_DEC, ENC
587 vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
591 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
592 ; output encrypted Bytes
597 sub r13, r12 ; Set r13 to be the number of bytes to write out
600 mov r13, %%PLAIN_CYPH_LEN
; Masked store writes exactly r13 bytes of output.
602 lea rax, [rel byte_len_to_mask_table]
603 kmovw k1, [rax + r13*2]
604 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9
605 add %%DATA_OFFSET, r13
606 %%_partial_block_done:
607 %endmacro ; PARTIAL_BLOCK
; GHASH_SINGLE_MUL: one 128x128 carry-less multiply of CIPHER by a stored
; HashKey power, accumulated (unreduced) into three running lanes:
; STATE_11 (a1*b1), STATE_00 (a0*b0), STATE_MID (a1*b0 + a0*b1).
; The 'first' variant initializes the lanes; 'not_first' XOR-accumulates.
; Reduction is performed later by the caller.
610 %macro GHASH_SINGLE_MUL 9
; NOTE(review): %defines for params %1..%3 (GDATA/HASHKEY/CIPHER) and %7..%9
; (T1/T2/FIRST) are not visible in this excerpt.
614 %define %%STATE_11 %4
615 %define %%STATE_00 %5
616 %define %%STATE_MID %6
621 vmovdqu %%T1, [%%GDATA + %%HASHKEY] ; load the HashKey power
622 %ifidn %%FIRST, first
623 vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
624 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
625 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
626 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
627 vpxor %%STATE_MID, %%STATE_MID, %%T2
; not_first: same four products, XORed into the existing accumulators.
629 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
630 vpxor %%STATE_11, %%STATE_11, %%T2
632 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
633 vpxor %%STATE_00, %%STATE_00, %%T2
635 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
636 vpxor %%STATE_MID, %%STATE_MID, %%T2
638 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
639 vpxor %%STATE_MID, %%STATE_MID, %%T2
644 ; if a = number of total plaintext bytes
646 ; %%num_initial_blocks = b mod 8;
647 ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
648 ; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
649 ; Updated AAD_HASH is returned in %%T3
651 %macro INITIAL_BLOCKS 23
652 %define %%GDATA_KEY %1
653 %define %%CYPH_PLAIN_OUT %2
654 %define %%PLAIN_CYPH_IN %3
656 %define %%DATA_OFFSET %5
657 %define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
674 %define %%ENC_DEC %23
676 %assign i (8-%%num_initial_blocks)
677 ;; Move AAD_HASH to temp reg
679 ;; Start AES for %%num_initial_blocks blocks
680 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
682 %assign i (9-%%num_initial_blocks)
683 %rep %%num_initial_blocks
684 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
685 vmovdqa reg(i), %%CTR
686 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
690 %if(%%num_initial_blocks>0)
691 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
692 %assign i (9-%%num_initial_blocks)
693 %rep %%num_initial_blocks
694 vpxor reg(i),reg(i),%%T_key
700 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
701 %assign i (9-%%num_initial_blocks)
702 %rep %%num_initial_blocks
703 vaesenc reg(i),%%T_key
711 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
712 %assign i (9-%%num_initial_blocks)
713 %rep %%num_initial_blocks
714 vaesenclast reg(i),%%T_key
718 %endif ; %if(%%num_initial_blocks>0)
722 %assign i (9-%%num_initial_blocks)
723 %rep %%num_initial_blocks
724 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
725 vpxor reg(i), reg(i), %%T1
726 ;; Write back ciphertext for %%num_initial_blocks blocks
727 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
728 add %%DATA_OFFSET, 16
729 %ifidn %%ENC_DEC, DEC
732 ;; Prepare ciphertext for GHASH computations
733 vpshufb reg(i), [rel SHUF_MASK]
738 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
740 %assign i (9-%%num_initial_blocks)
741 %if(%%num_initial_blocks>0)
745 %if %%num_initial_blocks>1
746 %rep %%num_initial_blocks-1
747 vmovdqu [rsp + TMP %+ i], reg(i)
751 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
752 ;; Haskey_i_k holds XORed values of the low and high parts of
754 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
755 vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
756 vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
757 vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
758 vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
759 vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
760 vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
761 vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
762 vmovdqa %%CTR, %%XMM8
764 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
765 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
766 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
767 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
768 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
769 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
770 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
771 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
773 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
774 vpxor %%XMM1, %%XMM1, %%T_key
775 vpxor %%XMM2, %%XMM2, %%T_key
776 vpxor %%XMM3, %%XMM3, %%T_key
777 vpxor %%XMM4, %%XMM4, %%T_key
778 vpxor %%XMM5, %%XMM5, %%T_key
779 vpxor %%XMM6, %%XMM6, %%T_key
780 vpxor %%XMM7, %%XMM7, %%T_key
781 vpxor %%XMM8, %%XMM8, %%T_key
783 %assign i (8-%%num_initial_blocks)
784 %assign j (9-%%num_initial_blocks)
785 %assign k (%%num_initial_blocks)
788 %if(%%num_initial_blocks>0)
790 ;; T2 - incoming AAD hash
793 ;; GDATA, HASHKEY, CIPHER,
794 ;; STATE_11, STATE_00, STATE_MID, T1, T2
795 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
796 %%T1, %%T4, %%T6, %%T5, %%T3, first
799 vmovdqu %%T_key, [%%GDATA_KEY+16*1]
800 vaesenc %%XMM1, %%T_key
801 vaesenc %%XMM2, %%T_key
802 vaesenc %%XMM3, %%T_key
803 vaesenc %%XMM4, %%T_key
804 vaesenc %%XMM5, %%T_key
805 vaesenc %%XMM6, %%T_key
806 vaesenc %%XMM7, %%T_key
807 vaesenc %%XMM8, %%T_key
809 vmovdqu %%T_key, [%%GDATA_KEY+16*2]
810 vaesenc %%XMM1, %%T_key
811 vaesenc %%XMM2, %%T_key
812 vaesenc %%XMM3, %%T_key
813 vaesenc %%XMM4, %%T_key
814 vaesenc %%XMM5, %%T_key
815 vaesenc %%XMM6, %%T_key
816 vaesenc %%XMM7, %%T_key
817 vaesenc %%XMM8, %%T_key
822 %if(%%num_initial_blocks>1)
823 ;; GDATA, HASHKEY, CIPHER,
824 ;; STATE_11, STATE_00, STATE_MID, T1, T2
825 vmovdqu %%T2, [rsp + TMP %+ j]
826 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
827 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
830 vmovdqu %%T_key, [%%GDATA_KEY+16*3]
831 vaesenc %%XMM1, %%T_key
832 vaesenc %%XMM2, %%T_key
833 vaesenc %%XMM3, %%T_key
834 vaesenc %%XMM4, %%T_key
835 vaesenc %%XMM5, %%T_key
836 vaesenc %%XMM6, %%T_key
837 vaesenc %%XMM7, %%T_key
838 vaesenc %%XMM8, %%T_key
840 vmovdqu %%T_key, [%%GDATA_KEY+16*4]
841 vaesenc %%XMM1, %%T_key
842 vaesenc %%XMM2, %%T_key
843 vaesenc %%XMM3, %%T_key
844 vaesenc %%XMM4, %%T_key
845 vaesenc %%XMM5, %%T_key
846 vaesenc %%XMM6, %%T_key
847 vaesenc %%XMM7, %%T_key
848 vaesenc %%XMM8, %%T_key
853 %if(%%num_initial_blocks>2)
854 ;; GDATA, HASHKEY, CIPHER,
855 ;; STATE_11, STATE_00, STATE_MID, T1, T2
856 vmovdqu %%T2, [rsp + TMP %+ j]
857 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
858 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
864 %if(%%num_initial_blocks>3)
865 ;; GDATA, HASHKEY, CIPHER,
866 ;; STATE_11, STATE_00, STATE_MID, T1, T2
867 vmovdqu %%T2, [rsp + TMP %+ j]
868 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
869 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
872 vmovdqu %%T_key, [%%GDATA_KEY+16*5]
873 vaesenc %%XMM1, %%T_key
874 vaesenc %%XMM2, %%T_key
875 vaesenc %%XMM3, %%T_key
876 vaesenc %%XMM4, %%T_key
877 vaesenc %%XMM5, %%T_key
878 vaesenc %%XMM6, %%T_key
879 vaesenc %%XMM7, %%T_key
880 vaesenc %%XMM8, %%T_key
882 vmovdqu %%T_key, [%%GDATA_KEY+16*6]
883 vaesenc %%XMM1, %%T_key
884 vaesenc %%XMM2, %%T_key
885 vaesenc %%XMM3, %%T_key
886 vaesenc %%XMM4, %%T_key
887 vaesenc %%XMM5, %%T_key
888 vaesenc %%XMM6, %%T_key
889 vaesenc %%XMM7, %%T_key
890 vaesenc %%XMM8, %%T_key
895 %if(%%num_initial_blocks>4)
896 ;; GDATA, HASHKEY, CIPHER,
897 ;; STATE_11, STATE_00, STATE_MID, T1, T2
898 vmovdqu %%T2, [rsp + TMP %+ j]
899 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
900 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
903 vmovdqu %%T_key, [%%GDATA_KEY+16*7]
904 vaesenc %%XMM1, %%T_key
905 vaesenc %%XMM2, %%T_key
906 vaesenc %%XMM3, %%T_key
907 vaesenc %%XMM4, %%T_key
908 vaesenc %%XMM5, %%T_key
909 vaesenc %%XMM6, %%T_key
910 vaesenc %%XMM7, %%T_key
911 vaesenc %%XMM8, %%T_key
913 vmovdqu %%T_key, [%%GDATA_KEY+16*8]
914 vaesenc %%XMM1, %%T_key
915 vaesenc %%XMM2, %%T_key
916 vaesenc %%XMM3, %%T_key
917 vaesenc %%XMM4, %%T_key
918 vaesenc %%XMM5, %%T_key
919 vaesenc %%XMM6, %%T_key
920 vaesenc %%XMM7, %%T_key
921 vaesenc %%XMM8, %%T_key
926 %if(%%num_initial_blocks>5)
927 ;; GDATA, HASHKEY, CIPHER,
928 ;; STATE_11, STATE_00, STATE_MID, T1, T2
929 vmovdqu %%T2, [rsp + TMP %+ j]
930 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
931 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
934 vmovdqu %%T_key, [%%GDATA_KEY+16*9]
935 vaesenc %%XMM1, %%T_key
936 vaesenc %%XMM2, %%T_key
937 vaesenc %%XMM3, %%T_key
938 vaesenc %%XMM4, %%T_key
939 vaesenc %%XMM5, %%T_key
940 vaesenc %%XMM6, %%T_key
941 vaesenc %%XMM7, %%T_key
942 vaesenc %%XMM8, %%T_key
945 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
946 vaesenc %%XMM1, %%T_key
947 vaesenc %%XMM2, %%T_key
948 vaesenc %%XMM3, %%T_key
949 vaesenc %%XMM4, %%T_key
950 vaesenc %%XMM5, %%T_key
951 vaesenc %%XMM6, %%T_key
952 vaesenc %%XMM7, %%T_key
953 vaesenc %%XMM8, %%T_key
959 %if(%%num_initial_blocks>6)
960 ;; GDATA, HASHKEY, CIPHER,
961 ;; STATE_11, STATE_00, STATE_MID, T1, T2
962 vmovdqu %%T2, [rsp + TMP %+ j]
963 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
964 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
968 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
969 vaesenclast %%XMM1, %%T_key
970 vaesenclast %%XMM2, %%T_key
971 vaesenclast %%XMM3, %%T_key
972 vaesenclast %%XMM4, %%T_key
973 vaesenclast %%XMM5, %%T_key
974 vaesenclast %%XMM6, %%T_key
975 vaesenclast %%XMM7, %%T_key
976 vaesenclast %%XMM8, %%T_key
980 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
981 vaesenc %%XMM1, %%T_key
982 vaesenc %%XMM2, %%T_key
983 vaesenc %%XMM3, %%T_key
984 vaesenc %%XMM4, %%T_key
985 vaesenc %%XMM5, %%T_key
986 vaesenc %%XMM6, %%T_key
987 vaesenc %%XMM7, %%T_key
988 vaesenc %%XMM8, %%T_key
990 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
991 vaesenclast %%XMM1, %%T_key
992 vaesenclast %%XMM2, %%T_key
993 vaesenclast %%XMM3, %%T_key
994 vaesenclast %%XMM4, %%T_key
995 vaesenclast %%XMM5, %%T_key
996 vaesenclast %%XMM6, %%T_key
997 vaesenclast %%XMM7, %%T_key
998 vaesenclast %%XMM8, %%T_key
1001 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
1002 vaesenc %%XMM1, %%T_key
1003 vaesenc %%XMM2, %%T_key
1004 vaesenc %%XMM3, %%T_key
1005 vaesenc %%XMM4, %%T_key
1006 vaesenc %%XMM5, %%T_key
1007 vaesenc %%XMM6, %%T_key
1008 vaesenc %%XMM7, %%T_key
1009 vaesenc %%XMM8, %%T_key
1011 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
1012 vaesenc %%XMM1, %%T_key
1013 vaesenc %%XMM2, %%T_key
1014 vaesenc %%XMM3, %%T_key
1015 vaesenc %%XMM4, %%T_key
1016 vaesenc %%XMM5, %%T_key
1017 vaesenc %%XMM6, %%T_key
1018 vaesenc %%XMM7, %%T_key
1019 vaesenc %%XMM8, %%T_key
1025 %if(%%num_initial_blocks>7)
1026 ;; GDATA, HASHKEY, CIPHER,
1027 ;; STATE_11, STATE_00, STATE_MID, T1, T2
1028 vmovdqu %%T2, [rsp + TMP %+ j]
1029 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1030 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
1033 %ifdef GCM256_MODE ; GCM256
1034 vmovdqu %%T_key, [%%GDATA_KEY+16*13]
1035 vaesenc %%XMM1, %%T_key
1036 vaesenc %%XMM2, %%T_key
1037 vaesenc %%XMM3, %%T_key
1038 vaesenc %%XMM4, %%T_key
1039 vaesenc %%XMM5, %%T_key
1040 vaesenc %%XMM6, %%T_key
1041 vaesenc %%XMM7, %%T_key
1042 vaesenc %%XMM8, %%T_key
1044 vmovdqu %%T_key, [%%GDATA_KEY+16*14]
1045 vaesenclast %%XMM1, %%T_key
1046 vaesenclast %%XMM2, %%T_key
1047 vaesenclast %%XMM3, %%T_key
1048 vaesenclast %%XMM4, %%T_key
1049 vaesenclast %%XMM5, %%T_key
1050 vaesenclast %%XMM6, %%T_key
1051 vaesenclast %%XMM7, %%T_key
1052 vaesenclast %%XMM8, %%T_key
1053 %endif ; GCM256 mode
1055 %if(%%num_initial_blocks>0)
1056 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
1057 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
1058 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1059 vpxor %%T4, %%T6, %%T4
1061 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1062 ; First phase of the reduction
1063 vmovdqu %%T3, [rel POLY2]
1065 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1066 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
1068 ;; First phase of the reduction complete
1069 vpxor %%T4, %%T4, %%T2
1071 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1072 ; Second phase of the reduction
1073 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1074 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1075 vpsrldq %%T2, %%T2, 4
1077 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1078 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1079 vpslldq %%T4, %%T4, 4
1080 ;; Second phase of the reduction complete
1081 vpxor %%T4, %%T4, %%T2
1082 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1083 ; The result is in %%T3
1084 vpxor %%T3, %%T1, %%T4
1086 ;; The hash should end up in T3
1090 ;; Final hash is now in T3
1091 %if %%num_initial_blocks > 0
1092 ;; NOTE: obsolete in case %%num_initial_blocks = 0
1093 sub %%LENGTH, 16*%%num_initial_blocks
1096 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
1097 vpxor %%XMM1, %%XMM1, %%T1
1098 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
1099 %ifidn %%ENC_DEC, DEC
1100 vmovdqa %%XMM1, %%T1
1103 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
1104 vpxor %%XMM2, %%XMM2, %%T1
1105 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
1106 %ifidn %%ENC_DEC, DEC
1107 vmovdqa %%XMM2, %%T1
1110 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
1111 vpxor %%XMM3, %%XMM3, %%T1
1112 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
1113 %ifidn %%ENC_DEC, DEC
1114 vmovdqa %%XMM3, %%T1
1117 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
1118 vpxor %%XMM4, %%XMM4, %%T1
1119 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
1120 %ifidn %%ENC_DEC, DEC
1121 vmovdqa %%XMM4, %%T1
1124 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
1125 vpxor %%XMM5, %%XMM5, %%T1
1126 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
1127 %ifidn %%ENC_DEC, DEC
1128 vmovdqa %%XMM5, %%T1
1131 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
1132 vpxor %%XMM6, %%XMM6, %%T1
1133 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
1134 %ifidn %%ENC_DEC, DEC
1135 vmovdqa %%XMM6, %%T1
1138 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
1139 vpxor %%XMM7, %%XMM7, %%T1
1140 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
1141 %ifidn %%ENC_DEC, DEC
1142 vmovdqa %%XMM7, %%T1
1145 %if %%num_initial_blocks > 0
1146 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
1147 ;; This macro is executed for length 128 and up,
1148 ;; zero length is checked in GCM_ENC_DEC.
1149 ;; If the last block is partial then the xor will be done later
1150 ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
1151 ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
1153 jl %%_initial_skip_last_word_write
1155 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
1156 vpxor %%XMM8, %%XMM8, %%T1
1157 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
1158 %ifidn %%ENC_DEC, DEC
1159 vmovdqa %%XMM8, %%T1
1162 ;; Update %%LENGTH with the number of blocks processed
1164 add %%DATA_OFFSET, 16
1165 %%_initial_skip_last_word_write:
1166 sub %%LENGTH, 128-16
1167 add %%DATA_OFFSET, 128-16
1169 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
1170 ;; Combine GHASHed value with the corresponding ciphertext
1171 vpxor %%XMM1, %%XMM1, %%T3
1172 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
1173 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
1174 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
1175 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
1176 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
1177 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
1178 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
1180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1182 %%_initial_blocks_done:
1187 ;;; INITIAL_BLOCKS macro with support for a partial final block.
1188 ;;; num_initial_blocks is expected to include the partial final block
1190 %macro INITIAL_BLOCKS_PARTIAL 25
1191 %define %%GDATA_KEY %1
1192 %define %%GDATA_CTX %2
1193 %define %%CYPH_PLAIN_OUT %3
1194 %define %%PLAIN_CYPH_IN %4
1196 %define %%DATA_OFFSET %6
1197 %define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
1200 %define %%T3 %10 ; [out] hash value
1211 %define %%XMM8 %21 ; [in] hash value
1214 %define %%ENC_DEC %24
1215 %define %%INSTANCE_TYPE %25
;; NOTE(review): the remaining parameter %defines (%5, %8, %9, %11..%20,
;; %22, %23 - presumably %%LENGTH, %%CTR, the %%T1..%%T6 temporaries and
;; %%T_key) are not visible in this extract - confirm order against the
;; full source before editing.
;;
;; AES-CTR encrypt/decrypt %%num_initial_blocks blocks where the final
;; block may be partial (<16B), write the output, and GHASH the processed
;; ciphertext together with the incoming AAD hash (%%XMM8).
;; The resulting (possibly still pending) hash is left in %%T3.
1217 ;; Move AAD_HASH to temp reg
1218 vmovdqu %%T2, %%XMM8
1220 %assign i (9-%%num_initial_blocks)
1221 %rep %%num_initial_blocks
1222 ;; Compute AES counters
1223 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
1224 vmovdqa reg(i), %%CTR
1225 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
;; AES rounds applied across all counter blocks: round-0 key whitening,
;; middle rounds (vaesenc), final round (vaesenclast).
1229 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
1230 %assign i (9-%%num_initial_blocks)
1231 %rep %%num_initial_blocks
1232 ; Start AES for %%num_initial_blocks blocks
1233 vpxor reg(i),reg(i),%%T_key
1239 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1240 %assign i (9-%%num_initial_blocks)
1241 %rep %%num_initial_blocks
1242 vaesenc reg(i),%%T_key
1250 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1251 %assign i (9-%%num_initial_blocks)
1252 %rep %%num_initial_blocks
1253 vaesenclast reg(i),%%T_key
1257 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1258 ;;; Hash all but the last block of data
1259 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1261 %assign i (9-%%num_initial_blocks)
1262 %rep %%num_initial_blocks-1
1263 ;; Encrypt the message for all but the last block
1264 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1265 vpxor reg(i), reg(i), %%T1
1266 ;; write back ciphertext for %%num_initial_blocks blocks
1267 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1268 add %%DATA_OFFSET, 16
1269 %ifidn %%ENC_DEC, DEC
;; decrypt path: GHASH runs over the ciphertext, i.e. the input data
1270 vmovdqa reg(i), %%T1
1272 ;; Prepare ciphertext for GHASH computations
1273 vpshufb reg(i), [rel SHUF_MASK]
1277 %if %%num_initial_blocks > 1
1278 ;; The final block of data may be <16B
1279 sub %%LENGTH, 16*(%%num_initial_blocks-1)
1282 %if %%num_initial_blocks < 8
1283 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
1284 ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
1286 jl %%_small_initial_partial_block
1288 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1289 ;;; Handle a full length final block - encrypt and hash all blocks
1290 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1293 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1295 ;; Encrypt the message
1296 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1297 vpxor reg(i), reg(i), %%T1
1298 ;; write back ciphertext for %%num_initial_blocks blocks
1299 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1300 add %%DATA_OFFSET, 16
1301 %ifidn %%ENC_DEC, DEC
1302 vmovdqa reg(i), %%T1
1304 ;; Prepare ciphertext for GHASH computations
1305 vpshufb reg(i), [rel SHUF_MASK]
1307 ;; Hash all of the data
1308 %assign i (8-%%num_initial_blocks)
1309 %assign j (9-%%num_initial_blocks)
1310 %assign k (%%num_initial_blocks)
1311 %assign last_block_to_hash 0
1313 %if(%%num_initial_blocks>last_block_to_hash)
1314 ;; Hash in AES state
1317 ;; T2 - incoming AAD hash
1318 ;; reg(i) holds ciphertext
1321 ;; reg(1)/xmm1 should now be available for tmp use
;; Schoolbook 128x128 carry-less multiply of the AAD hash by HashKey^k:
;; T1 = high product (a1*b1), T4 = low product (a0*b0),
;; T6 = XOR of the two cross products (no reduction yet).
1322 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1323 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1324 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1325 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1326 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1327 vpxor %%T6, %%T6, %%T5
1333 %assign rep_count (%%num_initial_blocks-1)
;; Accumulate the remaining blocks' partial products into T1/T4/T6,
;; each block multiplied by its matching power of the hash key.
1336 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1337 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1338 vpxor %%T1, %%T1, %%T3
1340 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1341 vpxor %%T4, %%T4, %%T3
1343 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1344 vpxor %%T6, %%T6, %%T3
1346 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1347 vpxor %%T6, %%T6, %%T3
1354 ;; Record that a reduction is needed
1357 jmp %%_small_initial_compute_hash
1360 %endif ; %if %%num_initial_blocks < 8
1362 %%_small_initial_partial_block:
1364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1365 ;;; Handle ghash for a <16B final block
1366 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1368 ;; In this case if it's a single call to encrypt we can
1369 ;; hash all of the data but if it's an init / update / finalize
1370 ;; series of call we need to leave the last block if it's
1371 ;; less than a full block of data.
1373 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1374 vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
1375 ;; Handle a partial final block
1376 ;; GDATA, KEY, T1, T2
1378 ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
1379 ;; NOTE: could be replaced with %%LENGTH but at this point
1380 ;; %%LENGTH is always less than 16.
1381 ;; No PLAIN_CYPH_LEN argument available in this macro.
1382 ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
1383 vpshufb reg(i), [rel SHUF_MASK]
;; multi_call: leave the trailing partial block out of the hash (it is
;; hashed in finalize), hence k and last_block_to_hash differ by one
;; from the single-call setup below.
1385 %ifidn %%INSTANCE_TYPE, multi_call
1386 %assign i (8-%%num_initial_blocks)
1387 %assign j (9-%%num_initial_blocks)
1388 %assign k (%%num_initial_blocks-1)
1389 %assign last_block_to_hash 1
1391 %assign i (8-%%num_initial_blocks)
1392 %assign j (9-%%num_initial_blocks)
1393 %assign k (%%num_initial_blocks)
1394 %assign last_block_to_hash 0
1397 %if(%%num_initial_blocks>last_block_to_hash)
1398 ;; Record that a reduction is needed
1400 ;; Hash in AES state
1403 ;; T2 - incoming AAD hash
1404 ;; reg(i) holds ciphertext
1407 ;; reg(1)/xmm1 should now be available for tmp use
;; Same schoolbook multiply as above: T1 = high, T4 = low, T6 = middle.
1408 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1409 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1410 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1411 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1412 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1413 vpxor %%T6, %%T6, %%T5
1415 ;; Record that a reduction is not needed -
1416 ;; In this case no hashes are computed because there
1417 ;; is only one initial block and it is < 16B in length.
1424 %ifidn %%INSTANCE_TYPE, multi_call
1425 %assign rep_count (%%num_initial_blocks-2)
1428 %assign rep_count (%%num_initial_blocks-1)
1432 ;; fix for negative rep_count
1438 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1439 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1440 vpxor %%T1, %%T1, %%T3
1442 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1443 vpxor %%T4, %%T4, %%T3
1445 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1446 vpxor %%T6, %%T6, %%T3
1448 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1449 vpxor %%T6, %%T6, %%T3
1456 %%_small_initial_compute_hash:
1458 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1460 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1462 %if(%%num_initial_blocks=1)
1463 %ifidn %%INSTANCE_TYPE, multi_call
1464 ;; We only need to check if a reduction is needed if
1465 ;; initial_blocks == 1 and init/update/final is being used.
1466 ;; In this case we may just have a partial block, and that
1467 ;; gets hashed in finalize.
1470 je %%_no_reduction_needed
;; Fold the middle term T6 into the high:low pair, leaving the 256-bit
;; unreduced product in %%T1:%%T4.
1474 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
1475 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
1476 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1477 vpxor %%T4, %%T6, %%T4
1479 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1480 ;; First phase of the reduction
1481 vmovdqu %%T3, [rel POLY2]
1483 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1484 ;; shift-L xmm2 2 DWs
1485 vpslldq %%T2, %%T2, 8
1486 vpxor %%T4, %%T4, %%T2
1488 ;; First phase of the reduction complete
1489 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1490 ;; Second phase of the reduction
1492 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1493 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1494 vpsrldq %%T2, %%T2, 4
1496 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1497 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1498 vpslldq %%T4, %%T4, 4
1500 vpxor %%T4, %%T4, %%T2
1501 ;; Second phase of the reduction complete
1502 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1503 vpxor %%T3, %%T1, %%T4
1505 %ifidn %%INSTANCE_TYPE, multi_call
1506 ;; If using init/update/finalize, we need to xor any partial block data
1508 %if %%num_initial_blocks > 1
1509 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
1510 %if %%num_initial_blocks != 8
1511 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
1512 cmp qword [%%GDATA_CTX + PBlockLen], 0
1513 je %%_no_partial_block_xor
1514 %endif ; %%num_initial_blocks != 8
1515 vpxor %%T3, %%T3, reg(8)
1516 %%_no_partial_block_xor:
1517 %endif ; %%num_initial_blocks > 1
1518 %endif ; %%INSTANCE_TYPE, multi_call
1520 %if(%%num_initial_blocks=1)
1521 %ifidn %%INSTANCE_TYPE, multi_call
1522 ;; NOTE: %%_no_reduction_needed case only valid for
1523 ;; multi_call with initial_blocks = 1.
1524 ;; Look for comment above around '_no_reduction_needed'
1525 ;; The jmp below is obsolete as the code will fall through.
1527 ;; The result is in %%T3
1528 jmp %%_after_reduction
1530 %%_no_reduction_needed:
1531 ;; The hash should end up in T3. The only way we should get here is if
1532 ;; there is a partial block of data, so xor that into the hash.
1533 vpxor %%T3, %%T2, reg(8)
1534 %endif ; %%INSTANCE_TYPE = multi_call
1535 %endif ; %%num_initial_blocks=1
1538 ;; Final hash is now in T3
1540 %endmacro ; INITIAL_BLOCKS_PARTIAL
1544 ; encrypt 8 blocks at a time
1545 ; ghash the 8 previously encrypted ciphertext blocks
1546 ; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
1547 ; %%DATA_OFFSET is the data offset value
1548 %macro GHASH_8_ENCRYPT_8_PARALLEL 23
1550 %define %%CYPH_PLAIN_OUT %2
1551 %define %%PLAIN_CYPH_IN %3
1552 %define %%DATA_OFFSET %4
1569 %define %%loop_idx %21
1570 %define %%ENC_DEC %22
1571 %define %%FULL_PARTIAL %23
;; NOTE(review): the remaining parameter %defines (%1 = %%GDATA plus the
;; %%T1..%%T7, %%CTR and %%XMM1..%%XMM8 registers) are not visible in
;; this extract - confirm order against the full source before editing.
;;
;; Save the previous 8 ciphertext blocks: XMM1 is kept in T2 and
;; XMM2..XMM8 are spilled to stack slots TMP2..TMP8 so the XMM registers
;; can hold the next 8 counter blocks. AES rounds for the new blocks are
;; then interleaved with the GHASH of the saved blocks to hide latency.
1573 vmovdqa %%T2, %%XMM1
1574 vmovdqu [rsp + TMP2], %%XMM2
1575 vmovdqu [rsp + TMP3], %%XMM3
1576 vmovdqu [rsp + TMP4], %%XMM4
1577 vmovdqu [rsp + TMP5], %%XMM5
1578 vmovdqu [rsp + TMP6], %%XMM6
1579 vmovdqu [rsp + TMP7], %%XMM7
1580 vmovdqu [rsp + TMP8], %%XMM8
1582 %ifidn %%loop_idx, in_order
;; in_order: counters kept in big-endian block order, so byte-swap after
;; incrementing (the alternative path below uses pre-swapped ONEf/TWOf).
1583 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
1584 vmovdqu %%T5, [rel TWO]
1585 vpaddd %%XMM2, %%CTR, %%T5
1586 vpaddd %%XMM3, %%XMM1, %%T5
1587 vpaddd %%XMM4, %%XMM2, %%T5
1588 vpaddd %%XMM5, %%XMM3, %%T5
1589 vpaddd %%XMM6, %%XMM4, %%T5
1590 vpaddd %%XMM7, %%XMM5, %%T5
1591 vpaddd %%XMM8, %%XMM6, %%T5
1592 vmovdqa %%CTR, %%XMM8
1594 vmovdqu %%T5, [rel SHUF_MASK]
1595 vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
1596 vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
1597 vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
1598 vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
1599 vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
1600 vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
1601 vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
1602 vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
1604 vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
1605 vmovdqu %%T5, [rel TWOf]
1606 vpaddd %%XMM2, %%CTR, %%T5
1607 vpaddd %%XMM3, %%XMM1, %%T5
1608 vpaddd %%XMM4, %%XMM2, %%T5
1609 vpaddd %%XMM5, %%XMM3, %%T5
1610 vpaddd %%XMM6, %%XMM4, %%T5
1611 vpaddd %%XMM7, %%XMM5, %%T5
1612 vpaddd %%XMM8, %%XMM6, %%T5
1613 vmovdqa %%CTR, %%XMM8
1618 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AES round 0: whiten all 8 counter blocks with the first round key.
1620 vmovdqu %%T1, [%%GDATA + 16*0]
1621 vpxor %%XMM1, %%XMM1, %%T1
1622 vpxor %%XMM2, %%XMM2, %%T1
1623 vpxor %%XMM3, %%XMM3, %%T1
1624 vpxor %%XMM4, %%XMM4, %%T1
1625 vpxor %%XMM5, %%XMM5, %%T1
1626 vpxor %%XMM6, %%XMM6, %%T1
1627 vpxor %%XMM7, %%XMM7, %%T1
1628 vpxor %%XMM8, %%XMM8, %%T1
1630 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1632 vmovdqu %%T1, [%%GDATA + 16*1]
1633 vaesenc %%XMM1, %%T1
1634 vaesenc %%XMM2, %%T1
1635 vaesenc %%XMM3, %%T1
1636 vaesenc %%XMM4, %%T1
1637 vaesenc %%XMM5, %%T1
1638 vaesenc %%XMM6, %%T1
1639 vaesenc %%XMM7, %%T1
1640 vaesenc %%XMM8, %%T1
1643 vmovdqu %%T1, [%%GDATA + 16*2]
1644 vaesenc %%XMM1, %%T1
1645 vaesenc %%XMM2, %%T1
1646 vaesenc %%XMM3, %%T1
1647 vaesenc %%XMM4, %%T1
1648 vaesenc %%XMM5, %%T1
1649 vaesenc %%XMM6, %%T1
1650 vaesenc %%XMM7, %%T1
1651 vaesenc %%XMM8, %%T1
1653 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; GHASH previous block 1 (T2) by HashKey^8, schoolbook multiply:
;; T4 = high products, T7 = low products, T6 = accumulated cross terms.
1655 vmovdqu %%T5, [%%GDATA + HashKey_8]
1656 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1657 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
1658 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1659 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1660 vpxor %%T6, %%T6, %%T5
1662 vmovdqu %%T1, [%%GDATA + 16*3]
1663 vaesenc %%XMM1, %%T1
1664 vaesenc %%XMM2, %%T1
1665 vaesenc %%XMM3, %%T1
1666 vaesenc %%XMM4, %%T1
1667 vaesenc %%XMM5, %%T1
1668 vaesenc %%XMM6, %%T1
1669 vaesenc %%XMM7, %%T1
1670 vaesenc %%XMM8, %%T1
;; GHASH previous block 2 (from stack) by HashKey^7, accumulate into
;; T4/T7/T6; same pattern repeats for blocks 3..7 below.
1672 vmovdqu %%T1, [rsp + TMP2]
1673 vmovdqu %%T5, [%%GDATA + HashKey_7]
1674 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1675 vpxor %%T4, %%T4, %%T3
1677 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1678 vpxor %%T7, %%T7, %%T3
1680 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1681 vpxor %%T6, %%T6, %%T3
1683 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1684 vpxor %%T6, %%T6, %%T3
1686 vmovdqu %%T1, [%%GDATA + 16*4]
1687 vaesenc %%XMM1, %%T1
1688 vaesenc %%XMM2, %%T1
1689 vaesenc %%XMM3, %%T1
1690 vaesenc %%XMM4, %%T1
1691 vaesenc %%XMM5, %%T1
1692 vaesenc %%XMM6, %%T1
1693 vaesenc %%XMM7, %%T1
1694 vaesenc %%XMM8, %%T1
1696 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1697 vmovdqu %%T1, [rsp + TMP3]
1698 vmovdqu %%T5, [%%GDATA + HashKey_6]
1699 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1700 vpxor %%T4, %%T4, %%T3
1702 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1703 vpxor %%T7, %%T7, %%T3
1705 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1706 vpxor %%T6, %%T6, %%T3
1708 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1709 vpxor %%T6, %%T6, %%T3
1711 vmovdqu %%T1, [%%GDATA + 16*5]
1712 vaesenc %%XMM1, %%T1
1713 vaesenc %%XMM2, %%T1
1714 vaesenc %%XMM3, %%T1
1715 vaesenc %%XMM4, %%T1
1716 vaesenc %%XMM5, %%T1
1717 vaesenc %%XMM6, %%T1
1718 vaesenc %%XMM7, %%T1
1719 vaesenc %%XMM8, %%T1
1722 vmovdqu %%T1, [rsp + TMP4]
1723 vmovdqu %%T5, [%%GDATA + HashKey_5]
1724 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1725 vpxor %%T4, %%T4, %%T3
1727 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1728 vpxor %%T7, %%T7, %%T3
1730 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1731 vpxor %%T6, %%T6, %%T3
1733 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1734 vpxor %%T6, %%T6, %%T3
1736 vmovdqu %%T1, [%%GDATA + 16*6]
1737 vaesenc %%XMM1, %%T1
1738 vaesenc %%XMM2, %%T1
1739 vaesenc %%XMM3, %%T1
1740 vaesenc %%XMM4, %%T1
1741 vaesenc %%XMM5, %%T1
1742 vaesenc %%XMM6, %%T1
1743 vaesenc %%XMM7, %%T1
1744 vaesenc %%XMM8, %%T1
1746 vmovdqu %%T1, [rsp + TMP5]
1747 vmovdqu %%T5, [%%GDATA + HashKey_4]
1748 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1749 vpxor %%T4, %%T4, %%T3
1751 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1752 vpxor %%T7, %%T7, %%T3
1754 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1755 vpxor %%T6, %%T6, %%T3
1757 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1758 vpxor %%T6, %%T6, %%T3
1760 vmovdqu %%T1, [%%GDATA + 16*7]
1761 vaesenc %%XMM1, %%T1
1762 vaesenc %%XMM2, %%T1
1763 vaesenc %%XMM3, %%T1
1764 vaesenc %%XMM4, %%T1
1765 vaesenc %%XMM5, %%T1
1766 vaesenc %%XMM6, %%T1
1767 vaesenc %%XMM7, %%T1
1768 vaesenc %%XMM8, %%T1
1770 vmovdqu %%T1, [rsp + TMP6]
1771 vmovdqu %%T5, [%%GDATA + HashKey_3]
1772 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1773 vpxor %%T4, %%T4, %%T3
1775 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1776 vpxor %%T7, %%T7, %%T3
1778 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1779 vpxor %%T6, %%T6, %%T3
1781 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1782 vpxor %%T6, %%T6, %%T3
1784 vmovdqu %%T1, [%%GDATA + 16*8]
1785 vaesenc %%XMM1, %%T1
1786 vaesenc %%XMM2, %%T1
1787 vaesenc %%XMM3, %%T1
1788 vaesenc %%XMM4, %%T1
1789 vaesenc %%XMM5, %%T1
1790 vaesenc %%XMM6, %%T1
1791 vaesenc %%XMM7, %%T1
1792 vaesenc %%XMM8, %%T1
1794 vmovdqu %%T1, [rsp + TMP7]
1795 vmovdqu %%T5, [%%GDATA + HashKey_2]
1796 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1797 vpxor %%T4, %%T4, %%T3
1799 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1800 vpxor %%T7, %%T7, %%T3
1802 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1803 vpxor %%T6, %%T6, %%T3
1805 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1806 vpxor %%T6, %%T6, %%T3
1808 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1810 vmovdqu %%T5, [%%GDATA + 16*9]
1811 vaesenc %%XMM1, %%T5
1812 vaesenc %%XMM2, %%T5
1813 vaesenc %%XMM3, %%T5
1814 vaesenc %%XMM4, %%T5
1815 vaesenc %%XMM5, %%T5
1816 vaesenc %%XMM6, %%T5
1817 vaesenc %%XMM7, %%T5
1818 vaesenc %%XMM8, %%T5
;; GHASH previous block 8 by HashKey^1; the high product lands in T1
;; (combined with the running high accumulator T4).
1820 vmovdqu %%T1, [rsp + TMP8]
1821 vmovdqu %%T5, [%%GDATA + HashKey]
1824 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1825 vpxor %%T7, %%T7, %%T3
1827 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1828 vpxor %%T6, %%T6, %%T3
1830 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1831 vpxor %%T6, %%T6, %%T3
1833 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1834 vpxor %%T1, %%T4, %%T3
;; Remaining AES rounds depend on key size: GCM128 stops at 16*10,
;; larger keys run the extra rounds below before the last round key.
1837 vmovdqu %%T5, [%%GDATA + 16*10]
1838 %ifndef GCM128_MODE ; GCM192 or GCM256
1839 vaesenc %%XMM1, %%T5
1840 vaesenc %%XMM2, %%T5
1841 vaesenc %%XMM3, %%T5
1842 vaesenc %%XMM4, %%T5
1843 vaesenc %%XMM5, %%T5
1844 vaesenc %%XMM6, %%T5
1845 vaesenc %%XMM7, %%T5
1846 vaesenc %%XMM8, %%T5
1848 vmovdqu %%T5, [%%GDATA + 16*11]
1849 vaesenc %%XMM1, %%T5
1850 vaesenc %%XMM2, %%T5
1851 vaesenc %%XMM3, %%T5
1852 vaesenc %%XMM4, %%T5
1853 vaesenc %%XMM5, %%T5
1854 vaesenc %%XMM6, %%T5
1855 vaesenc %%XMM7, %%T5
1856 vaesenc %%XMM8, %%T5
1858 vmovdqu %%T5, [%%GDATA + 16*12]
1861 vaesenc %%XMM1, %%T5
1862 vaesenc %%XMM2, %%T5
1863 vaesenc %%XMM3, %%T5
1864 vaesenc %%XMM4, %%T5
1865 vaesenc %%XMM5, %%T5
1866 vaesenc %%XMM6, %%T5
1867 vaesenc %%XMM7, %%T5
1868 vaesenc %%XMM8, %%T5
1870 vmovdqu %%T5, [%%GDATA + 16*13]
1871 vaesenc %%XMM1, %%T5
1872 vaesenc %%XMM2, %%T5
1873 vaesenc %%XMM3, %%T5
1874 vaesenc %%XMM4, %%T5
1875 vaesenc %%XMM5, %%T5
1876 vaesenc %%XMM6, %%T5
1877 vaesenc %%XMM7, %%T5
1878 vaesenc %%XMM8, %%T5
1880 vmovdqu %%T5, [%%GDATA + 16*14]
1887 ;; SNP TBD: This is pretty ugly - consider whether just XORing the
1888 ;; data in after vaesenclast is simpler and performant. Would
1889 ;; also have to ripple it through partial block and ghash_mul_8.
;; Last AES round fused with the data XOR: the plaintext (or, for DEC,
;; ciphertext) is XORed with the last round key in T2, then fed to
;; vaesenclast per block.
1890 %ifidn %%FULL_PARTIAL, full
1892 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1893 vpxor %%T2, %%T2, %%T5
1895 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1898 %ifidn %%ENC_DEC, ENC
1899 vaesenclast reg(j), reg(j), %%T2
1901 vaesenclast %%T3, reg(j), %%T2
1902 vpxor reg(j), %%T2, %%T5
1903 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1907 ; Don't read the final data during partial block processing
1910 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1911 vpxor %%T2, %%T2, %%T5
1913 ;; Stage the key directly in T2 rather than hash it with plaintext
1918 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1920 ;; Stage the key directly in T2 rather than hash it with plaintext
1925 %ifidn %%ENC_DEC, ENC
1926 vaesenclast reg(j), reg(j), %%T2
1929 vaesenclast %%T3, reg(j), %%T2
1930 vpxor reg(j), %%T2, %%T5
1931 ;; Do not read the data since it could fault
1932 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1934 vaesenclast reg(j), reg(j), %%T2
1944 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fold the GHASH middle term T6 into the high:low pair <T1:T7>.
1947 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
1948 vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
1949 vpxor %%T7, %%T7, %%T3
1950 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
1954 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1955 ;first phase of the reduction
1956 vmovdqu %%T3, [rel POLY2]
1958 vpclmulqdq %%T2, %%T3, %%T7, 0x01
1959 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
1961 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
1962 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1964 %ifidn %%ENC_DEC, ENC
1965 ; Write to the Ciphertext buffer
1966 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
1967 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
1968 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
1969 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
1970 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
1971 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
1972 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
1973 %ifidn %%FULL_PARTIAL, full
1974 ;; Avoid writing past the buffer if handling a partial block
1975 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
1980 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1981 ;second phase of the reduction
1982 vpclmulqdq %%T2, %%T3, %%T7, 0x00
1983 vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1985 vpclmulqdq %%T4, %%T3, %%T7, 0x10
1986 vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1988 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
1989 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1990 vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
;; Byte-swap the fresh ciphertext blocks so they are ready to be
;; GHASHed on the next iteration of this macro.
1992 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
1993 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
1994 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
1995 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
1996 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
1997 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
1998 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
1999 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
2005 %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
2008 ; GHASH the last 8 ciphertext blocks.
2009 %macro GHASH_LAST_8 16
;; NOTE(review): the parameter %defines (presumably %%GDATA, the
;; %%T1..%%T7 temporaries and %%XMM1..%%XMM8 data blocks) are not
;; visible in this extract - confirm against the full source.
;;
;; GHASH the final 8 ciphertext blocks (%%XMM1..%%XMM8) into %%T6.
;; Each block uses a Karatsuba multiply by its power of the hash key
;; (HashKey_8 down to HashKey): the vpshufd/vpxor pair forms (a1^a0)
;; and (b1^b0); high products accumulate in %%T6, low products in
;; %%T7, and the middle (a1^a0)*(b1^b0) terms in %%XMM1.
2029 vmovdqu %%T5, [%%GDATA + HashKey_8]
2031 vpshufd %%T2, %%XMM1, 01001110b
2032 vpshufd %%T3, %%T5, 01001110b
2033 vpxor %%T2, %%T2, %%XMM1
2034 vpxor %%T3, %%T3, %%T5
2036 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2037 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2039 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2041 ;;;;;;;;;;;;;;;;;;;;;;
2043 vmovdqu %%T5, [%%GDATA + HashKey_7]
2044 vpshufd %%T2, %%XMM2, 01001110b
2045 vpshufd %%T3, %%T5, 01001110b
2046 vpxor %%T2, %%T2, %%XMM2
2047 vpxor %%T3, %%T3, %%T5
2049 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2050 vpxor %%T6, %%T6, %%T4
2052 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2053 vpxor %%T7, %%T7, %%T4
2055 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2057 vpxor %%XMM1, %%XMM1, %%T2
2059 ;;;;;;;;;;;;;;;;;;;;;;
2061 vmovdqu %%T5, [%%GDATA + HashKey_6]
2062 vpshufd %%T2, %%XMM3, 01001110b
2063 vpshufd %%T3, %%T5, 01001110b
2064 vpxor %%T2, %%T2, %%XMM3
2065 vpxor %%T3, %%T3, %%T5
2067 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2068 vpxor %%T6, %%T6, %%T4
2070 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2071 vpxor %%T7, %%T7, %%T4
2073 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2075 vpxor %%XMM1, %%XMM1, %%T2
2077 ;;;;;;;;;;;;;;;;;;;;;;
2079 vmovdqu %%T5, [%%GDATA + HashKey_5]
2080 vpshufd %%T2, %%XMM4, 01001110b
2081 vpshufd %%T3, %%T5, 01001110b
2082 vpxor %%T2, %%T2, %%XMM4
2083 vpxor %%T3, %%T3, %%T5
2085 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2086 vpxor %%T6, %%T6, %%T4
2088 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2089 vpxor %%T7, %%T7, %%T4
2091 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2093 vpxor %%XMM1, %%XMM1, %%T2
2095 ;;;;;;;;;;;;;;;;;;;;;;
2097 vmovdqu %%T5, [%%GDATA + HashKey_4]
2098 vpshufd %%T2, %%XMM5, 01001110b
2099 vpshufd %%T3, %%T5, 01001110b
2100 vpxor %%T2, %%T2, %%XMM5
2101 vpxor %%T3, %%T3, %%T5
2103 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2104 vpxor %%T6, %%T6, %%T4
2106 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2107 vpxor %%T7, %%T7, %%T4
2109 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2111 vpxor %%XMM1, %%XMM1, %%T2
2113 ;;;;;;;;;;;;;;;;;;;;;;
2115 vmovdqu %%T5, [%%GDATA + HashKey_3]
2116 vpshufd %%T2, %%XMM6, 01001110b
2117 vpshufd %%T3, %%T5, 01001110b
2118 vpxor %%T2, %%T2, %%XMM6
2119 vpxor %%T3, %%T3, %%T5
2121 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2122 vpxor %%T6, %%T6, %%T4
2124 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2125 vpxor %%T7, %%T7, %%T4
2127 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2129 vpxor %%XMM1, %%XMM1, %%T2
2131 ;;;;;;;;;;;;;;;;;;;;;;
2133 vmovdqu %%T5, [%%GDATA + HashKey_2]
2134 vpshufd %%T2, %%XMM7, 01001110b
2135 vpshufd %%T3, %%T5, 01001110b
2136 vpxor %%T2, %%T2, %%XMM7
2137 vpxor %%T3, %%T3, %%T5
2139 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2140 vpxor %%T6, %%T6, %%T4
2142 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2143 vpxor %%T7, %%T7, %%T4
2145 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2147 vpxor %%XMM1, %%XMM1, %%T2
2149 ;;;;;;;;;;;;;;;;;;;;;;
2151 vmovdqu %%T5, [%%GDATA + HashKey]
2152 vpshufd %%T2, %%XMM8, 01001110b
2153 vpshufd %%T3, %%T5, 01001110b
2154 vpxor %%T2, %%T2, %%XMM8
2155 vpxor %%T3, %%T3, %%T5
2157 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
2158 vpxor %%T6, %%T6, %%T4
2160 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
2161 vpxor %%T7, %%T7, %%T4
2163 vpclmulqdq %%T2, %%T2, %%T3, 0x00
;; Karatsuba combine: recover the true middle term in %%T2 by XORing
;; the accumulated middle products with the high (T6) and low (T7) sums.
2165 vpxor %%XMM1, %%XMM1, %%T2
2166 vpxor %%XMM1, %%XMM1, %%T6
2167 vpxor %%T2, %%XMM1, %%T7
2172 vpslldq %%T4, %%T2, 8
2173 vpsrldq %%T2, %%T2, 8
2175 vpxor %%T7, %%T7, %%T4
2176 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2178 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2179 ;first phase of the reduction
2180 vmovdqu %%T3, [rel POLY2]
2182 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2183 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
2185 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2186 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2189 ;second phase of the reduction
2190 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2191 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2193 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2194 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2196 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2197 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2198 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2202 ; GHASH the last 7 ciphertext blocks.
2203 %macro GHASH_LAST_7 15
;; NOTE(review): the parameter %defines (presumably %%GDATA, the
;; %%T1..%%T7 temporaries and %%XMM1..%%XMM7 data blocks) are not
;; visible in this extract - confirm against the full source.
;;
;; GHASH the final 7 ciphertext blocks (%%XMM1..%%XMM7) into %%T6.
;; Same Karatsuba scheme as GHASH_LAST_8, but starting at HashKey_7:
;; high products accumulate in %%T6, low products in %%T7, and the
;; (a1^a0)*(b1^b0) middle terms in %%XMM1.
2222 vmovdqu %%T5, [%%GDATA + HashKey_7]
2224 vpshufd %%T2, %%XMM1, 01001110b
2225 vpshufd %%T3, %%T5, 01001110b
2226 vpxor %%T2, %%T2, %%XMM1
2227 vpxor %%T3, %%T3, %%T5
2229 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2230 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2232 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2234 ;;;;;;;;;;;;;;;;;;;;;;
2236 vmovdqu %%T5, [%%GDATA + HashKey_6]
2237 vpshufd %%T2, %%XMM2, 01001110b
2238 vpshufd %%T3, %%T5, 01001110b
2239 vpxor %%T2, %%T2, %%XMM2
2240 vpxor %%T3, %%T3, %%T5
2242 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2243 vpxor %%T6, %%T6, %%T4
2245 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2246 vpxor %%T7, %%T7, %%T4
2248 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2250 vpxor %%XMM1, %%XMM1, %%T2
2252 ;;;;;;;;;;;;;;;;;;;;;;
2254 vmovdqu %%T5, [%%GDATA + HashKey_5]
2255 vpshufd %%T2, %%XMM3, 01001110b
2256 vpshufd %%T3, %%T5, 01001110b
2257 vpxor %%T2, %%T2, %%XMM3
2258 vpxor %%T3, %%T3, %%T5
2260 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2261 vpxor %%T6, %%T6, %%T4
2263 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2264 vpxor %%T7, %%T7, %%T4
2266 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2268 vpxor %%XMM1, %%XMM1, %%T2
2270 ;;;;;;;;;;;;;;;;;;;;;;
2272 vmovdqu %%T5, [%%GDATA + HashKey_4]
2273 vpshufd %%T2, %%XMM4, 01001110b
2274 vpshufd %%T3, %%T5, 01001110b
2275 vpxor %%T2, %%T2, %%XMM4
2276 vpxor %%T3, %%T3, %%T5
2278 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2279 vpxor %%T6, %%T6, %%T4
2281 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2282 vpxor %%T7, %%T7, %%T4
2284 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2286 vpxor %%XMM1, %%XMM1, %%T2
2288 ;;;;;;;;;;;;;;;;;;;;;;
2290 vmovdqu %%T5, [%%GDATA + HashKey_3]
2291 vpshufd %%T2, %%XMM5, 01001110b
2292 vpshufd %%T3, %%T5, 01001110b
2293 vpxor %%T2, %%T2, %%XMM5
2294 vpxor %%T3, %%T3, %%T5
2296 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2297 vpxor %%T6, %%T6, %%T4
2299 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2300 vpxor %%T7, %%T7, %%T4
2302 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2304 vpxor %%XMM1, %%XMM1, %%T2
2306 ;;;;;;;;;;;;;;;;;;;;;;
2308 vmovdqu %%T5, [%%GDATA + HashKey_2]
2309 vpshufd %%T2, %%XMM6, 01001110b
2310 vpshufd %%T3, %%T5, 01001110b
2311 vpxor %%T2, %%T2, %%XMM6
2312 vpxor %%T3, %%T3, %%T5
2314 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2315 vpxor %%T6, %%T6, %%T4
2317 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2318 vpxor %%T7, %%T7, %%T4
2320 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2322 vpxor %%XMM1, %%XMM1, %%T2
2324 ;;;;;;;;;;;;;;;;;;;;;;
2326 vmovdqu %%T5, [%%GDATA + HashKey_1]
2327 vpshufd %%T2, %%XMM7, 01001110b
2328 vpshufd %%T3, %%T5, 01001110b
2329 vpxor %%T2, %%T2, %%XMM7
2330 vpxor %%T3, %%T3, %%T5
2332 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2333 vpxor %%T6, %%T6, %%T4
2335 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2336 vpxor %%T7, %%T7, %%T4
2338 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2340 vpxor %%XMM1, %%XMM1, %%T2
2342 ;;;;;;;;;;;;;;;;;;;;;;
;; Karatsuba combine: recover the true middle term in %%T2 by XORing
;; the accumulated middle products with the high (T6) and low (T7) sums.
2344 vpxor %%XMM1, %%XMM1, %%T6
2345 vpxor %%T2, %%XMM1, %%T7
2350 vpslldq %%T4, %%T2, 8
2351 vpsrldq %%T2, %%T2, 8
2353 vpxor %%T7, %%T7, %%T4
2354 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2356 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2357 ;first phase of the reduction
2358 vmovdqu %%T3, [rel POLY2]
2360 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2361 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
2363 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2367 ;second phase of the reduction
2368 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2369 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2371 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2372 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2374 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2375 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2376 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2381 ;;; Handle encryption of the final partial block
2383 ;;; r13 - Number of bytes to read
2385 ;;; KEY - Key for encrypting the partial block
2386 ;;; HASH - Current hash value
2388 ;;; r10, r12, r15, rax
2391 ;;; PLAIN_CYPH_LEN, %6, is passed only to determine
2392 ;;; if buffer is big enough to do a 16 byte read & shift.
2393 ;;; 'LT16' is passed here only if buffer is known to be smaller
2395 ;;; Any other value passed here will result in 16 byte read
2397 ;;; TBD: Remove HASH from the instantiation
2398 %macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
; Encrypt/decrypt the final sub-16-byte block using the pre-computed
; counter-block keystream in %%KEY, writing only the valid bytes (mask k1).
; NOTE(review): the %define lines for %1..%3 (%%KEY, %%T1, %%T2 presumably)
; are not visible in this view.
2402 %define %%CYPH_PLAIN_OUT %4
2403 %define %%PLAIN_CYPH_IN %5
2404 %define %%PLAIN_CYPH_LEN %6
2405 %define %%ENC_DEC %7
2406 %define %%DATA_OFFSET %8
; r10 = address of the final partial block in the input buffer
2408 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
2410 ;; T1 - packed output
2411 ;; r10 - input data address
2412 ;; r13 - input data length
2413 ;; rax - temp registers
2415 ;; k1 - valid byte mask
2416 READ_SMALL_DATA_INPUT %%T1, r10, r13, rax
2418 ;; At this point T1 contains the partial block data
2419 %ifidn %%ENC_DEC, DEC
2420 ;; Plaintext XOR E(K, Yn)
2421 ;; Set aside the ciphertext
2422 ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
2423 vmovdqu8 %%T2{k1}{z}, %%T1 ; masked copy of the ciphertext, kept for GHASH
2424 vpxor %%KEY, %%KEY, %%T1
; NOTE(review): the %else marker for the ENC branch below is not visible here
2426 ;; Plaintext XOR E(K, Yn)
2427 ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
2428 vpxor %%KEY, %%KEY, %%T1
2430 vmovdqu8 %%KEY{k1}{z}, %%KEY ; zero everything beyond the r13 valid bytes
2432 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; masked store: only the r13 valid output bytes are written
2434 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY
2436 %ifidn %%ENC_DEC, DEC
2437 ;; If decrypt, restore the ciphertext into %%KEY
; NOTE(review): the restoring move (from %%T2) and the closing %endif
; directives are not visible in this view
2440 %endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
2444 ; Encryption of a single block
; Raw AES encryption of %%XMM0 with the expanded key schedule at %%GDATA:
; round-0 whitening xor, middle rounds via vaesenc, final vaesenclast.
2445 %macro ENCRYPT_SINGLE_BLOCK 2
; NOTE(review): the %define lines and the %assign/%rep loop that drives the
; assemble-time round index `i` are not visible in this view.
2449 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] ; round-0 key whitening
2452 vaesenc %%XMM0, [%%GDATA+16*i] ; middle rounds (loop directives elided here)
2455 vaesenclast %%XMM0, [%%GDATA+16*i] ; final AES round
2459 ;; Start of Stack Setup
2462 ;; Required for Update/GMC_ENC
2463 ;the number of pushes must equal STACK_OFFSET
; NOTE(review): the %macro FUNC_SAVE line and the preceding GP-register
; pushes are not visible in this view.  This section allocates the local
; frame and, for win64, preserves the callee-saved xmm6-xmm15 registers
; (required by the Microsoft x64 ABI).
2470 sub rsp, VARIABLE_OFFSET
2473 %ifidn __OUTPUT_FORMAT__, win64
2474 ; xmm6:xmm15 need to be maintained for Windows
2475 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2476 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
2477 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
2478 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
2479 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
2480 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
2481 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
2482 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
2483 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
2484 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
; Restores the registers preserved by FUNC_SAVE and releases its frame.
; NOTE(review): the GP-register restores and %endmacro follow below the
; lines visible in this view.
2489 %macro FUNC_RESTORE 0
2491 %ifidn __OUTPUT_FORMAT__, win64
; win64: restore callee-saved xmm6-xmm15 in reverse order of FUNC_SAVE
2492 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
2493 vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
2494 vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
2495 vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
2496 vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
2497 vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
2498 vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
2499 vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
2500 vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
2501 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2503 ;; Required for Update/GMC_ENC
; AVX-512 variant prologue: allocates the local frame, saves callee-saved
; GP registers into it (rather than pushing) and, for win64, the
; callee-saved xmm6-xmm15.  FUNC_RESTORE_AVX512 undoes this.
2511 %macro FUNC_SAVE_AVX512 0
2512 ;; Required for Update/GMC_ENC
2513 ;the number of pushes must equal STACK_OFFSET
; NOTE(review): instructions ahead of this point (which leave the original
; stack pointer in rax) are not visible in this view — confirm against the
; full file.
2516 sub rsp, VARIABLE_OFFSET_AVX512
; preserve callee-saved GP registers in the local frame
2519 mov [rsp + STACK_OFFSET_AVX512 + 0*8], r12
2520 mov [rsp + STACK_OFFSET_AVX512 + 1*8], r13
2521 mov [rsp + STACK_OFFSET_AVX512 + 2*8], r14
2522 mov [rsp + STACK_OFFSET_AVX512 + 3*8], r15
2523 mov [rsp + STACK_OFFSET_AVX512 + 4*8], rax ; stack (reloaded into rsp by FUNC_RESTORE_AVX512)
2524 mov r14, rax ; r14 is used to retrieve stack args
2525 mov [rsp + STACK_OFFSET_AVX512 + 5*8], rbp
2526 mov [rsp + STACK_OFFSET_AVX512 + 6*8], rbx
2527 %ifidn __OUTPUT_FORMAT__, win64
; rdi/rsi are callee-saved on Windows only
2528 mov [rsp + STACK_OFFSET_AVX512 + 7*8], rdi
2529 mov [rsp + STACK_OFFSET_AVX512 + 8*8], rsi
; NOTE(review): the %endif closing the GP-save block is not visible here
2532 %ifidn __OUTPUT_FORMAT__, win64
2533 ; xmm6:xmm15 need to be maintained for Windows
2534 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 0*16], xmm6
2535 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 1*16], xmm7
2536 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 2*16], xmm8
2537 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 3*16], xmm9
2538 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 4*16], xmm10
2539 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 5*16], xmm11
2540 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 6*16], xmm12
2541 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 7*16], xmm13
2542 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 8*16], xmm14
2543 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 9*16], xmm15
; Epilogue matching FUNC_SAVE_AVX512: restores xmm6-xmm15 (win64 only),
; then the callee-saved GP registers, then the original stack pointer.
2548 %macro FUNC_RESTORE_AVX512 0
2550 %ifidn __OUTPUT_FORMAT__, win64
; restore in reverse order of FUNC_SAVE_AVX512
2551 vmovdqu xmm15, [rsp + LOCAL_STORAGE_AVX512 + 9*16]
2552 vmovdqu xmm14, [rsp + LOCAL_STORAGE_AVX512 + 8*16]
2553 vmovdqu xmm13, [rsp + LOCAL_STORAGE_AVX512 + 7*16]
2554 vmovdqu xmm12, [rsp + LOCAL_STORAGE_AVX512 + 6*16]
2555 vmovdqu xmm11, [rsp + LOCAL_STORAGE_AVX512 + 5*16]
2556 vmovdqu xmm10, [rsp + LOCAL_STORAGE_AVX512 + 4*16]
2557 vmovdqu xmm9, [rsp + LOCAL_STORAGE_AVX512 + 3*16]
2558 vmovdqu xmm8, [rsp + LOCAL_STORAGE_AVX512 + 2*16]
2559 vmovdqu xmm7, [rsp + LOCAL_STORAGE_AVX512 + 1*16]
2560 vmovdqu xmm6, [rsp + LOCAL_STORAGE_AVX512 + 0*16]
; NOTE(review): the %endif closing the win64 block is not visible here
2563 ;; Required for Update/GMC_ENC
2564 mov rbp, [rsp + STACK_OFFSET_AVX512 + 5*8]
2565 mov rbx, [rsp + STACK_OFFSET_AVX512 + 6*8]
2566 %ifidn __OUTPUT_FORMAT__, win64
2567 mov rdi, [rsp + STACK_OFFSET_AVX512 + 7*8]
2568 mov rsi, [rsp + STACK_OFFSET_AVX512 + 8*8]
2570 mov r12, [rsp + STACK_OFFSET_AVX512 + 0*8]
2571 mov r13, [rsp + STACK_OFFSET_AVX512 + 1*8]
2572 mov r14, [rsp + STACK_OFFSET_AVX512 + 2*8]
2573 mov r15, [rsp + STACK_OFFSET_AVX512 + 3*8]
; releasing the frame: slot 4 holds the pre-prologue stack pointer
2574 mov rsp, [rsp + STACK_OFFSET_AVX512 + 4*8] ; stack
2578 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2579 ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
2580 ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
2581 ; Additional Authentication data (A_IN), Additional Data length (A_LEN).
2582 ; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
2583 ; Clobbers rax, r10-r13, and xmm0-xmm6, xmm14
2584 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT body — hashes the AAD, zeroes the running lengths and derives
; the initial counter block from the 12-byte IV.
; NOTE(review): the %macro GCM_INIT line above these defines is not
; visible in this view.
2586 %define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
2587 %define %%GDATA_CTX %2 ; [in] GCM context pointer
2588 %define %%IV %3 ; [in] IV pointer
2589 %define %%A_IN %4 ; [in] AAD pointer
2590 %define %%A_LEN %5 ; [in] AAD length in bytes
2591 %define %%GPR1 %6 ; temp GPR
2592 %define %%GPR2 %7 ; temp GPR
2593 %define %%GPR3 %8 ; temp GPR
2595 %define %%AAD_HASH xmm14
; GHASH the AAD into %%AAD_HASH using the key's hash-key powers
2597 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
2600 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
; NOTE(review): %%GPR1 is presumably loaded with %%A_LEN and then zeroed
; between these stores — the loads are not visible in this view
2601 mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length
2604 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0
2605 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0
2607 ;; read 12 IV bytes and pad with 0x00000001
; NOTE(review): %%GPR2 is assumed to hold %%IV here, and xmm3 is presumably
; shifted into final position — neither line is visible in this view
2609 vmovd xmm3, [%%GPR2 + 8]
2611 vmovq xmm2, [%%GPR2]
2612 vmovdqa xmm4, [rel ONEf]
2613 vpternlogq xmm2, xmm3, xmm4, 0xfe ; xmm2 = xmm2 or xmm3 or xmm4
2615 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
2617 ;; store IV as counter in LE format
2618 vpshufb xmm2, [rel SHUF_MASK]
2619 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
; Small-message path: dispatches on %%NUM_BLOCKS (1..8) to a single
; INITIAL_BLOCKS_PARTIAL invocation that encrypts/decrypts and hashes the
; whole message.  NOTE(review): the cmp instructions paired with the je's
; below are not visible in this view.
2622 %macro GCM_ENC_DEC_SMALL 12
2623 %define %%GDATA_KEY %1
2624 %define %%GDATA_CTX %2
2625 %define %%CYPH_PLAIN_OUT %3
2626 %define %%PLAIN_CYPH_IN %4
2627 %define %%PLAIN_CYPH_LEN %5
2628 %define %%ENC_DEC %6
2629 %define %%DATA_OFFSET %7
2630 %define %%LENGTH %8 ; assumed r13
2631 %define %%NUM_BLOCKS %9
2632 %define %%CTR %10 ; assumed xmm9
2633 %define %%HASH_OUT %11 ; assumed xmm14
2634 %define %%INSTANCE_TYPE %12
2636 ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
2637 ;; cmp %%NUM_BLOCKS, 0
2638 ;; je %%_small_initial_blocks_encrypted
2640 je %%_small_initial_num_blocks_is_8
2642 je %%_small_initial_num_blocks_is_7
2644 je %%_small_initial_num_blocks_is_6
2646 je %%_small_initial_num_blocks_is_5
2648 je %%_small_initial_num_blocks_is_4
2650 je %%_small_initial_num_blocks_is_3
2652 je %%_small_initial_num_blocks_is_2
2654 jmp %%_small_initial_num_blocks_is_1
2657 %%_small_initial_num_blocks_is_8:
2661 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2665 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2672 ;; xmm8 - XMM8 - AAD HASH IN
2675 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2676 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \
2677 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2678 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2679 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2680 jmp %%_small_initial_blocks_encrypted
2682 %%_small_initial_num_blocks_is_7:
; same register assignment as the 8-block case; only the block count differs
2683 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2684 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \
2685 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2686 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2687 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2688 jmp %%_small_initial_blocks_encrypted
2690 %%_small_initial_num_blocks_is_6:
2691 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2692 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \
2693 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2694 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2695 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2696 jmp %%_small_initial_blocks_encrypted
2698 %%_small_initial_num_blocks_is_5:
2699 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2700 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \
2701 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2702 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2703 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2704 jmp %%_small_initial_blocks_encrypted
2706 %%_small_initial_num_blocks_is_4:
2707 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2708 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \
2709 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2710 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2711 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2712 jmp %%_small_initial_blocks_encrypted
2714 %%_small_initial_num_blocks_is_3:
2715 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2716 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \
2717 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2718 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2719 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2720 jmp %%_small_initial_blocks_encrypted
2722 %%_small_initial_num_blocks_is_2:
2723 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2724 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \
2725 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2726 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2727 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2728 jmp %%_small_initial_blocks_encrypted
2730 %%_small_initial_num_blocks_is_1:
2731 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2732 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \
2733 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2734 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2735 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2736 %%_small_initial_blocks_encrypted:
2738 %endmacro ; GCM_ENC_DEC_SMALL
2740 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2741 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
2742 ; has been initialized by GCM_INIT
2743 ; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
2744 ; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
2745 ; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
2746 ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
2747 ; Clobbers rax, r10-r15, and xmm0-xmm15
2748 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2749 %macro GCM_ENC_DEC 7
; Main encrypt/decrypt driver: resumes a partial block (multi_call), runs
; 0..7 INITIAL blocks, then the 8-blocks-at-a-time parallel loop, the final
; partial block, and the closing GHASH.  NOTE(review): this view elides
; some original lines (cmp/jz partners of several branches and %else/%endif
; markers); the code lines below are unchanged.
2750 %define %%GDATA_KEY %1
2751 %define %%GDATA_CTX %2
2752 %define %%CYPH_PLAIN_OUT %3
2753 %define %%PLAIN_CYPH_IN %4
2754 %define %%PLAIN_CYPH_LEN %5
2755 %define %%ENC_DEC %6
2756 %define %%INSTANCE_TYPE %7
2757 %define %%DATA_OFFSET r11
2760 ; calculate the number of 16byte blocks in the message
2761 ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
2762 ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
2763 ; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
; zero-length check (the conditional jump out is not visible in this view)
2765 %ifidn __OUTPUT_FORMAT__, win64
2766 cmp %%PLAIN_CYPH_LEN, 0
2768 or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
2772 xor %%DATA_OFFSET, %%DATA_OFFSET
2773 ;; Update length of data processed
2774 %ifidn __OUTPUT_FORMAT__, win64
2775 mov rax, %%PLAIN_CYPH_LEN
2776 add [%%GDATA_CTX + InLen], rax
2778 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
; xmm13 = hash key H, xmm8 = running GHASH value
2780 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2781 vmovdqu xmm8, [%%GDATA_CTX + AadHash]
2783 %ifidn %%INSTANCE_TYPE, multi_call
2784 ;; NOTE: partial block processing makes only sense for multi_call here.
2785 ;; Used for the update flow - if there was a previous partial
2786 ;; block fill the remaining bytes here.
2787 PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
2790 ;; lift CTR set from initial_blocks to here
2791 %ifidn %%INSTANCE_TYPE, single_call
2794 vmovdqu xmm9, [%%GDATA_CTX + CurCount]
2797 ;; Save the amount of data left to process in r10
2798 mov r13, %%PLAIN_CYPH_LEN
2799 %ifidn %%INSTANCE_TYPE, multi_call
2800 ;; NOTE: %%DATA_OFFSET is zero in single_call case.
2801 ;; Consequently PLAIN_CYPH_LEN will never be zero after
2802 ;; %%DATA_OFFSET subtraction below.
2803 sub r13, %%DATA_OFFSET
2805 ;; There may be no more data if it was consumed in the partial block.
2808 %endif ; %%INSTANCE_TYPE, multi_call
2811 ;; Determine how many blocks to process in INITIAL
2816 ;; Process one additional block in INITIAL if there is a partial block
2818 blsmsk r10, r10 ; Set CF if zero
2820 adc r12, 0x0 ; Process an additional INITIAL block if CF set
2822 ;; Less than 127B will be handled by the small message code, which
2823 ;; can process up to 7 16B blocks.
2825 jge %%_large_message_path
2827 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
2830 %%_large_message_path:
2831 and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will
2832 ; can be handled by the x8 partial loop.
; dispatch on r12 = number of INITIAL blocks (the cmp instructions paired
; with these je's are not visible in this view)
2835 je %%_initial_num_blocks_is_0
2837 je %%_initial_num_blocks_is_7
2839 je %%_initial_num_blocks_is_6
2841 je %%_initial_num_blocks_is_5
2843 je %%_initial_num_blocks_is_4
2845 je %%_initial_num_blocks_is_3
2847 je %%_initial_num_blocks_is_2
2849 jmp %%_initial_num_blocks_is_1
2851 %%_initial_num_blocks_is_7:
2855 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2859 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2866 ;; xmm8 - XMM8 - AAD HASH IN
2869 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2870 jmp %%_initial_blocks_encrypted
2872 %%_initial_num_blocks_is_6:
2873 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2874 jmp %%_initial_blocks_encrypted
2876 %%_initial_num_blocks_is_5:
2877 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2878 jmp %%_initial_blocks_encrypted
2880 %%_initial_num_blocks_is_4:
2881 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2882 jmp %%_initial_blocks_encrypted
2884 %%_initial_num_blocks_is_3:
2885 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2886 jmp %%_initial_blocks_encrypted
2888 %%_initial_num_blocks_is_2:
2889 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2890 jmp %%_initial_blocks_encrypted
2892 %%_initial_num_blocks_is_1:
2893 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2894 jmp %%_initial_blocks_encrypted
2896 %%_initial_num_blocks_is_0:
2897 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2900 %%_initial_blocks_encrypted:
2901 ;; The entire message may have been processed in INITIAL and now needs to be hashed
2905 ;; Encrypt the final <16 byte (partial) block, then hash
2907 jl %%_encrypt_final_partial
2909 ;; Process 7 full blocks plus a partial block
2911 jl %%_encrypt_by_8_partial
2914 %%_encrypt_by_8_parallel:
2915 ;; in_order vs. out_order is an optimization to increment the counter without shuffling
2916 ;; it back into little endian. r15d keeps track of when we need to increment in order so
2917 ;; that the carry is handled correctly.
2920 vpshufb xmm9, [rel SHUF_MASK]
2923 %%_encrypt_by_8_new:
; main loop body: 8 blocks of AES-CTR interleaved with GHASH of the
; previous 8 ciphertext blocks
2946 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
2947 add %%DATA_OFFSET, 128
2950 jge %%_encrypt_by_8_new
2952 vpshufb xmm9, [rel SHUF_MASK]
2953 jmp %%_encrypt_by_8_parallel_done
; in-order variant: counter shuffled to LE so the increment carries properly
2956 vpshufb xmm9, [rel SHUF_MASK]
2958 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
2959 vpshufb xmm9, [rel SHUF_MASK]
2960 add %%DATA_OFFSET, 128
2963 jge %%_encrypt_by_8_new
2964 vpshufb xmm9, [rel SHUF_MASK]
2967 %%_encrypt_by_8_parallel_done:
2968 ;; Test to see if we need a by 8 with partial block. At this point
2969 ;; bytes remaining should be either zero or between 113-127.
2973 %%_encrypt_by_8_partial:
2974 ;; Shuffle needed to align key for partial block xor. out_order
2975 ;; is a little faster because it avoids extra shuffles.
2976 ;; TBD: Might need to account for when we don't have room to increment the counter.
2979 ;; Process parallel buffers with a final partial block.
2980 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
2983 add %%DATA_OFFSET, 128-16
2986 %%_encrypt_final_partial:
; r13 = remaining (<16) byte count; save state for a later update call
2988 vpshufb xmm8, [rel SHUF_MASK]
2989 mov [%%GDATA_CTX + PBlockLen], r13
2990 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
2992 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
2993 ;; GDATA, KEY, T1, T2
2994 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
2996 vpshufb xmm8, [rel SHUF_MASK]
3001 ;; Mapping to macro parameters
3003 ;; xmm9 contains the counter
3004 ;; xmm1-xmm8 contain the xor'd ciphertext
3006 ;; xmm14 contains the final hash
3007 ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3008 %ifidn %%INSTANCE_TYPE, multi_call
3009 mov r13, [%%GDATA_CTX + PBlockLen]
; multi_call: hash only 7 full blocks, then fold the partial block in
3012 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
3013 ;; XOR the partial word into the hash
3014 vpxor xmm14, xmm14, xmm8
3018 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
; persist counter and running hash so a later update/finalize can resume
3021 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
3022 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
3027 %endmacro ; GCM_ENC_DEC
3029 ;;; ===========================================================================
3030 ;;; AESROUND4x128 macro
3031 ;;; - 4 lanes, 8 blocks per lane
3032 ;;; - it handles special cases: the last and zero rounds
3033 ;;; Uses NROUNDS macro defined at the top of the file to check the last round
3034 %macro AESROUND4x128 25
; One AES round across 4 lanes x 8 blocks (2 zmm per lane).  Round 0 does
; key whitening, rounds 1..NROUNDS use vaesenc, the last round uses
; vaesenclast fused with the plaintext/ciphertext xor.
; NOTE(review): the %define lines for %5/%6 (%%L2B03/%%L2B47), %9..%12
; (%%TMP0..%%TMP3) and %22/%23 (%%D2L/%%D2H), plus the %if/%else markers
; separating the three round cases, are not visible in this view.
3035 %define %%L0B03 %1 ; [in/out] lane 0, blocks 0 to 3
3036 %define %%L0B47 %2 ; [in/out] lane 0, blocks 4 to 7
3037 %define %%L1B03 %3 ; [in/out] lane 1, blocks 0 to 3
3038 %define %%L1B47 %4 ; ...
3041 %define %%L3B03 %7 ; ...
3042 %define %%L3B47 %8 ; [in/out] lane 3, blocks 4 to 7
3047 %define %%KP0 %13 ; [in] expanded key pointer lane 0
3048 %define %%KP1 %14 ; [in] expanded key pointer lane 1
3049 %define %%KP2 %15 ; [in] expanded key pointer lane 2
3050 %define %%KP3 %16 ; [in] expanded key pointer lane 3
3051 %define %%ROUND %17 ; [in] round number
3052 %define %%D0L %18 ; [in] plain/cipher text blocks 0-3 lane 0 - NEEDED FOR THE LAST ROUND ONLY (CAN BE EMPTY OTHERWISE)
3053 %define %%D0H %19 ; [in] plain/cipher text blocks 4-7 lane 0
3054 %define %%D1L %20 ; [in] plain/cipher text blocks 0-3 lane 1
3055 %define %%D1H %21 ; ...
3058 %define %%D3L %24 ; ...
3059 %define %%D3H %25 ; [in] plain/cipher text blocks 4-7 lane 3
; broadcast this round's 128-bit key to all 4 blocks of each zmm, per lane
3061 vbroadcastf64x2 %%TMP0, [%%KP0 + 16*(%%ROUND)]
3062 vbroadcastf64x2 %%TMP1, [%%KP1 + 16*(%%ROUND)]
3063 vbroadcastf64x2 %%TMP2, [%%KP2 + 16*(%%ROUND)]
3064 vbroadcastf64x2 %%TMP3, [%%KP3 + 16*(%%ROUND)]
; round 0: key whitening (the %if %%ROUND < 1 guard is not visible here)
3067 vpxorq %%L0B03, %%L0B03, %%TMP0
3068 vpxorq %%L0B47, %%L0B47, %%TMP0
3069 vpxorq %%L1B03, %%L1B03, %%TMP1
3070 vpxorq %%L1B47, %%L1B47, %%TMP1
3071 vpxorq %%L2B03, %%L2B03, %%TMP2
3072 vpxorq %%L2B47, %%L2B47, %%TMP2
3073 vpxorq %%L3B03, %%L3B03, %%TMP3
3074 vpxorq %%L3B47, %%L3B47, %%TMP3
3076 %if %%ROUND <= NROUNDS
3077 ;; rounds 1 to 9/11/13
3078 vaesenc %%L0B03, %%L0B03, %%TMP0
3079 vaesenc %%L0B47, %%L0B47, %%TMP0
3080 vaesenc %%L1B03, %%L1B03, %%TMP1
3081 vaesenc %%L1B47, %%L1B47, %%TMP1
3082 vaesenc %%L2B03, %%L2B03, %%TMP2
3083 vaesenc %%L2B47, %%L2B47, %%TMP2
3084 vaesenc %%L3B03, %%L3B03, %%TMP3
3085 vaesenc %%L3B47, %%L3B47, %%TMP3
3087 ;; the last round - mix enclast with text xor's
3088 vaesenclast %%L0B03, %%L0B03, %%TMP0
3089 vpxorq %%L0B03, %%L0B03, %%D0L
3090 vaesenclast %%L0B47, %%L0B47, %%TMP0
3091 vpxorq %%L0B47, %%L0B47, %%D0H
3092 vaesenclast %%L1B03, %%L1B03, %%TMP1
3093 vpxorq %%L1B03, %%L1B03, %%D1L
3094 vaesenclast %%L1B47, %%L1B47, %%TMP1
3095 vpxorq %%L1B47, %%L1B47, %%D1H
3096 vaesenclast %%L2B03, %%L2B03, %%TMP2
3097 vpxorq %%L2B03, %%L2B03, %%D2L
3098 vaesenclast %%L2B47, %%L2B47, %%TMP2
3099 vpxorq %%L2B47, %%L2B47, %%D2H
3100 vaesenclast %%L3B03, %%L3B03, %%TMP3
3101 vpxorq %%L3B03, %%L3B03, %%D3L
3102 vaesenclast %%L3B47, %%L3B47, %%TMP3
3103 vpxorq %%L3B47, %%L3B47, %%D3H
3106 %endmacro ; AESROUND4x128
3108 ;;; ===========================================================================
3109 ;;; ===========================================================================
3110 ;;; Horizontal XOR - 4 x 128bits xored together
3111 %macro VHPXORI4x128 2
3112 %define %%REG %1 ; [in/out] zmm512 4x128bits to xor; i128 on output
3113 %define %%TMP %2 ; temporary register
; fold the upper 256 bits onto the lower 256 bits
3114 vextracti64x4 YWORD(%%TMP), %%REG, 1
3115 vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
; fold the upper 128 bits of the ymm onto the final 128-bit result
3116 vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
3117 vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
3118 %endmacro ; VHPXORI4x128
3120 ;;; ===========================================================================
3121 ;;; ===========================================================================
3122 ;;; schoolbook multiply - 1st step
; Schoolbook multiply, 1st step: multiply the high 4 blocks (%%HI) by
; HashKey_4, producing partial products TH/TM/TL for CLMUL_STEP to fold.
; NOTE(review): the %macro line and the %%TMP (%3) define above this view
; are not visible here.
3124 %define %%KP %1 ; [in] key pointer
3125 %define %%HI %2 ; [in] previous blocks 4 to 7
3127 %define %%TH %4 ; [out] tmp high
3128 %define %%TM %5 ; [out] tmp medium
3129 %define %%TL %6 ; [out] tmp low
3130 vmovdqu64 %%TMP, [%%KP + HashKey_4]
3131 vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%TH = a1*b1
3132 vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%TL = a0*b0
3133 vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%TM = a1*b0
3134 vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%TMP = a0*b1
3135 vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL]
3136 %endmacro ; CLMUL_INIT
3138 ;;; ===========================================================================
3139 ;;; ===========================================================================
3140 ;;; schoolbook multiply - 2nd step
; Schoolbook multiply, 2nd step: multiply the low 4 blocks (%%LO) by
; HashKey_8, fold in TH/TM/TL from CLMUL_INIT, and horizontally xor the
; four 128-bit lanes down to a single 256-bit value in %%HI:%%LO.
; NOTE(review): the %macro line and the %%TMP0..%%TMP2 (%4..%6) defines
; are not visible in this view.
3142 %define %%KP %1 ; [in] key pointer
3143 %define %%HI %2 ; [out] high 128b of hash to reduce
3144 %define %%LO %3 ; [in/out] previous blocks 0 to 3; low 128b of hash to reduce
3148 %define %%TH %7 ; [in] tmp high
3149 %define %%TM %8 ; [in] tmp medium
3150 %define %%TL %9 ; [in] tmp low
3152 vmovdqu64 %%TMP0, [%%KP + HashKey_8]
3153 vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
3154 vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
3155 vpxorq %%TH, %%TH, %%TMP2
3156 vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
3157 vpxorq %%TL, %%TL, %%TMP2
3158 vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
3159 vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
3161 ;; finish multiplications
; split the middle term across the high and low halves of the product
3162 vpsrldq %%TMP2, %%TM, 8
3163 vpxorq %%HI, %%TH, %%TMP2
3164 vpslldq %%TMP2, %%TM, 8
3165 vpxorq %%LO, %%TL, %%TMP2
3167 ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... + ((X1+Y0)*H8)]
3168 ;; note: (X1+Y0) handled elsewhere
3169 VHPXORI4x128 %%HI, %%TMP2
3170 VHPXORI4x128 %%LO, %%TMP1
3171 ;; HIx holds top 128 bits
3172 ;; LOx holds low 128 bits
3173 ;; - further reductions to follow
3174 %endmacro ; CLMUL_STEP
3176 ;;; ===========================================================================
3177 ;;; ===========================================================================
3178 ;;; Encrypt the initial 8 blocks from 4 lanes and apply ghash on the ciphertext
3179 %macro INITIAL_BLOCKS_x4 33
; Encrypt the initial 8 counter blocks on each of 4 lanes (AES-CTR) and
; xor with the input text; GHASH of the ciphertext follows.
; NOTE(review): this macro continues past the end of this view (remaining
; output stores and the GHASH tail are below); the counter-block setup for
; %%LxB* and the NROUNDS %if guards around rounds 11-14 are not visible
; here either.
3180 %define %%IN %1 ; pointer to array of pointers to input text
3181 %define %%OUT %2 ; pointer to array of pointers to output text
3182 %define %%KEYP0 %3 ; pointer to expanded keys, lane 0
3183 %define %%KEYP1 %4 ; pointer to expanded keys, lane 1
3184 %define %%KEYP2 %5 ; pointer to expanded keys, lane 2
3185 %define %%KEYP3 %6 ; pointer to expanded keys, lane 3
3186 %define %%TPTR0 %7 ; temporary GP register
3187 %define %%TPTR1 %8 ; temporary GP register
3188 %define %%TPTR2 %9 ; temporary GP register
3189 %define %%TPTR3 %10 ; temporary GP register
3190 %define %%L0B03 %11 ; [out] cipher text blocks 0 to 3, lane 0
3191 %define %%L0B47 %12 ; [out] cipher text blocks 4 to 7, lane 0
3192 %define %%L1B03 %13 ; [out] cipher text blocks 0 to 3, lane 1
3193 %define %%L1B47 %14 ; ...
3196 %define %%L3B03 %17 ; ...
3197 %define %%L3B47 %18 ; [out] cipher text blocks 4 to 7, lane 3
3198 %define %%GHASH %19 ; [in] AAD lane 0, 1, 2 and 3
3199 %define %%T0 %20 ; temporary AVX512 register
3200 %define %%T1 %21 ; temporary AVX512 register
3201 %define %%T2 %22 ; temporary AVX512 register
3202 %define %%T3 %23 ; temporary AVX512 register
3203 %define %%T4 %24 ; temporary AVX512 register
3204 %define %%T5 %25 ; temporary AVX512 register
3205 %define %%T6 %26 ; temporary AVX512 register
3206 %define %%T7 %27 ; temporary AVX512 register
3207 %define %%T8 %28 ; temporary AVX512 register
3208 %define %%T9 %29 ; temporary AVX512 register
3209 %define %%T10 %30 ; temporary AVX512 register
3210 %define %%T11 %31 ; temporary AVX512 register
3211 %define %%ZMM_SHFMASK %32 ; [in] shuffle mask changing byte order in 4 128bit words
3212 %define %%ENC_DEC %33 ; [in] ENC (encrypt) or DEC (decrypt) selector
; the temp GP registers double as input then output pointers
3214 %define %%INP0 %%TPTR0
3215 %define %%INP1 %%TPTR1
3216 %define %%INP2 %%TPTR2
3217 %define %%INP3 %%TPTR3
3219 %define %%OUTP0 %%TPTR0
3220 %define %%OUTP1 %%TPTR1
3221 %define %%OUTP2 %%TPTR2
3222 %define %%OUTP3 %%TPTR3
; fetch the four lanes' input pointers
3225 mov %%INP0, [%%IN + 8*0]
3226 mov %%INP1, [%%IN + 8*1]
3227 mov %%INP2, [%%IN + 8*2]
3228 mov %%INP3, [%%IN + 8*3]
; load 8 blocks of plain/cipher text per lane into T4..T11 (xor inputs
; for the last AES round)
3230 VX512LDR %%T4, [%%INP0 + (16*0)]
3231 VX512LDR %%T5, [%%INP0 + (16*4)]
3232 VX512LDR %%T6, [%%INP1 + (16*0)]
3233 VX512LDR %%T7, [%%INP1 + (16*4)]
3234 VX512LDR %%T8, [%%INP2 + (16*0)]
3235 VX512LDR %%T9, [%%INP2 + (16*4)]
3236 VX512LDR %%T10,[%%INP3 + (16*0)]
3237 VX512LDR %%T11,[%%INP3 + (16*4)]
; byte-swap the counter blocks for AES processing
; NOTE(review): the code that initializes %%LxB* with the counter values
; is not visible in this view
3240 vpshufb %%L0B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3241 vpshufb %%L0B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3242 vpshufb %%L1B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3243 vpshufb %%L1B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3244 vpshufb %%L2B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3245 vpshufb %%L2B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3246 vpshufb %%L3B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3247 vpshufb %%L3B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3249 ;; move to AES encryption rounds
; round 0 = key whitening; AESROUND4x128 dispatches on the round number
3250 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3251 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3252 %%T0, %%T1, %%T2, %%T3, \
3253 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 0, \
3254 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3256 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3257 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3258 %%T0, %%T1, %%T2, %%T3, \
3259 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 1, \
3260 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3262 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3263 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3264 %%T0, %%T1, %%T2, %%T3, \
3265 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 2, \
3266 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3268 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3269 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3270 %%T0, %%T1, %%T2, %%T3, \
3271 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 3, \
3272 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3274 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3275 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3276 %%T0, %%T1, %%T2, %%T3, \
3277 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 4, \
3278 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3280 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3281 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3282 %%T0, %%T1, %%T2, %%T3, \
3283 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 5, \
3284 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3286 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3287 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3288 %%T0, %%T1, %%T2, %%T3, \
3289 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 6, \
3290 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3292 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3293 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3294 %%T0, %%T1, %%T2, %%T3, \
3295 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 7, \
3296 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3298 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3299 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3300 %%T0, %%T1, %%T2, %%T3, \
3301 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 8, \
3302 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3304 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3305 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3306 %%T0, %%T1, %%T2, %%T3, \
3307 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 9, \
3308 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3310 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3311 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3312 %%T0, %%T1, %%T2, %%T3, \
3313 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 10, \
3314 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
; NOTE(review): rounds 11-14 below are presumably guarded by NROUNDS %if
; directives (AES-192/AES-256 key sizes) that are not visible in this view
3317 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3318 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3319 %%T0, %%T1, %%T2, %%T3, \
3320 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 11, \
3321 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3323 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3324 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3325 %%T0, %%T1, %%T2, %%T3, \
3326 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 12, \
3327 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3330 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3331 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3332 %%T0, %%T1, %%T2, %%T3, \
3333 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 13, \
3334 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3336 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3337 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3338 %%T0, %%T1, %%T2, %%T3, \
3339 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 14, \
3340 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
; store the resulting ciphertext: fetch each lane's output pointer
3345 mov %%OUTP0, [%%OUT + 8*0]
3346 mov %%OUTP1, [%%OUT + 8*1]
3347 mov %%OUTP2, [%%OUT + 8*2]
3348 mov %%OUTP3, [%%OUT + 8*3]
3350 VX512STR [%%OUTP0 + (16*0)], %%L0B03
3351 VX512STR [%%OUTP0 + (16*4)], %%L0B47
3352 VX512STR [%%OUTP1 + (16*0)], %%L1B03
3353 VX512STR [%%OUTP1 + (16*4)], %%L1B47
3354 VX512STR [%%OUTP2 + (16*0)], %%L2B03
3355 VX512STR [%%OUTP2 + (16*4)], %%L2B47
3356 VX512STR [%%OUTP3 + (16*0)], %%L3B03
3357 VX512STR [%%OUTP3 + (16*4)], %%L3B47
3359 %ifidn %%ENC_DEC, DEC
3360 ;; decryption - cipher text needs to go to GHASH phase
3361 vpshufb %%L0B03, %%T4, %%ZMM_SHFMASK
3362 vpshufb %%L0B47, %%T5, %%ZMM_SHFMASK
3363 vpshufb %%L1B03, %%T6, %%ZMM_SHFMASK
3364 vpshufb %%L1B47, %%T7, %%ZMM_SHFMASK
3365 vpshufb %%L2B03, %%T8, %%ZMM_SHFMASK
3366 vpshufb %%L2B47, %%T9, %%ZMM_SHFMASK
3367 vpshufb %%L3B03, %%T10, %%ZMM_SHFMASK
3368 vpshufb %%L3B47, %%T11, %%ZMM_SHFMASK
3371 vpshufb %%L0B03, %%L0B03, %%ZMM_SHFMASK
3372 vpshufb %%L0B47, %%L0B47, %%ZMM_SHFMASK
3373 vpshufb %%L1B03, %%L1B03, %%ZMM_SHFMASK
3374 vpshufb %%L1B47, %%L1B47, %%ZMM_SHFMASK
3375 vpshufb %%L2B03, %%L2B03, %%ZMM_SHFMASK
3376 vpshufb %%L2B47, %%L2B47, %%ZMM_SHFMASK
3377 vpshufb %%L3B03, %%L3B03, %%ZMM_SHFMASK
3378 vpshufb %%L3B47, %%L3B47, %%ZMM_SHFMASK
3381 ;; xor encrypted block 0 with GHASH for the next GHASH round
3382 vmovdqa64 XWORD(%%T1), XWORD(%%GHASH)
3383 vextracti32x4 XWORD(%%T2), %%GHASH, 1
3384 vextracti32x4 XWORD(%%T3), %%GHASH, 2
3385 vextracti32x4 XWORD(%%T4), %%GHASH, 3
3387 vpxorq %%L0B03, %%T1
3388 vpxorq %%L1B03, %%T2
3389 vpxorq %%L2B03, %%T3
3390 vpxorq %%L3B03, %%T4
3391 %endmacro ;INITIAL_BLOCKS_x4
3393 ;;; ===========================================================================
3394 ;;; ===========================================================================
3395 ;;; Encrypt 8 blocks at a time on 4 lanes
3396 ;;; GHASH the 8 previously encrypted ciphertext blocks (4 lanes)
;;; ===========================================================================
;;; GHASH_8_ENCRYPT_8_PARALLEL_x4
;;; - AES-CTR encrypts 8 counter blocks per lane on 4 lanes (rounds 0 to 14)
;;; - in parallel, GHASH-es the 8 previously encrypted cipher text blocks of
;;;   each lane; CLMUL_INIT/CLMUL_STEP are interleaved with AESROUND4x128
;;;   calls to hide instruction latencies
;;; - NOTE(review): %%T0 to %%T9 temporary ZMM registers (macro params %25 to
;;;   %34) and the %%TH/%%TM/%%TL aliases are referenced below but their
;;;   %define lines are not visible in this chunk - confirm against full file
3397 %macro GHASH_8_ENCRYPT_8_PARALLEL_x4 44
3398 %define %%IN %1 ; pointer to array of pointers to plain/cipher text
3399 %define %%OUT %2 ; pointer to array of pointers to cipher/plain text
3400 %define %%KEYP0 %3 ; pointer to expanded keys, lane 0
3401 %define %%KEYP1 %4 ; pointer to expanded keys, lane 1
3402 %define %%KEYP2 %5 ; pointer to expanded keys, lane 2
3403 %define %%KEYP3 %6 ; pointer to expanded keys, lane 3
3404 %define %%TPTR0 %7 ; temporary GP register (used as pointer)
3405 %define %%TPTR1 %8 ; temporary GP register (used as pointer)
3406 %define %%TPTR2 %9 ; temporary GP register (used as pointer)
3407 %define %%TPTR3 %10 ; temporary GP register (used as pointer)
3408 %define %%DATA_OFFSET %11 ; current data offset (used with text loads and stores)
3409 %define %%CTRL0 %12 ; counter blocks 4 to 7 for lane 0
3410 %define %%CTRL1 %13 ; counter blocks 4 to 7 for lane 1
3411 %define %%CTRL2 %14 ; counter blocks 4 to 7 for lane 2
3412 %define %%CTRL3 %15 ; counter blocks 4 to 7 for lane 3
3413 %define %%L0B03 %16 ; lane 0 blocks 0 to 3
3414 %define %%L0B47 %17 ; lane 0 blocks 4 to 7
3415 %define %%L1B03 %18 ; lane 1 blocks 0 to 3
3416 %define %%L1B47 %19 ; lane 1 blocks 4 to 7
3417 %define %%L2B03 %20 ; lane 2 blocks 0 to 3
3418 %define %%L2B47 %21 ; lane 2 blocks 4 to 7
3419 %define %%L3B03 %22 ; lane 3 blocks 0 to 3
3420 %define %%L3B47 %23 ; lane 3 blocks 4 to 7
3421 %define %%GHASH %24 ; [in/out] GHASH for 4 lanes
3432 %define %%PREVLO0 %35 ; [in] 4 lanes x 8 blocks of cipher text for GHASH
3433 %define %%PREVHI0 %36
3434 %define %%PREVLO1 %37
3435 %define %%PREVHI1 %38
3436 %define %%PREVLO2 %39
3437 %define %%PREVHI2 %40
3438 %define %%PREVLO3 %41
3439 %define %%PREVHI3 %42
3440 %define %%ZMM_SHFMASK %43 ; [in] byte swap shuffle mask for 128 bits
3441 %define %%ENC_DEC %44 ; [in] ENC (encryption) or DEC (decryption)
3443 ;;; ============================================================================
3444 ;;; a few virtual register mappings
;; input and output pointer aliases share the same temporary GP registers;
;; inputs are consumed (loads) before the outputs are loaded below
3445 %define %%INP0 %%TPTR0
3446 %define %%INP1 %%TPTR1
3447 %define %%INP2 %%TPTR2
3448 %define %%INP3 %%TPTR3
3450 %define %%OUTP0 %%TPTR0
3451 %define %%OUTP1 %%TPTR1
3452 %define %%OUTP2 %%TPTR2
3453 %define %%OUTP3 %%TPTR3
;; text registers overlap PREVLO1/PREVHI1.. etc; the loads into them must
;; only happen once GHASH accumulation of the PREV registers is done
3459 %define %%TEXTL0B03 %%T8
3460 %define %%TEXTL0B47 %%T9
3461 %define %%TEXTL1B03 %%PREVLO1 ; GHASH needs to be complete before using these
3462 %define %%TEXTL1B47 %%PREVHI1
3463 %define %%TEXTL2B03 %%PREVLO2
3464 %define %%TEXTL2B47 %%PREVHI2
3465 %define %%TEXTL3B03 %%PREVLO3
3466 %define %%TEXTL3B47 %%PREVHI3
3467 ;;; ============================================================================
;; AES round 0 (ARK) on all 4 lanes x 8 blocks
3469 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3470 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3471 %%T0, %%T1, %%T2, %%T3, \
3472 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 0, \
3473 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3474 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
;; load input text pointers for the 4 lanes
3476 mov %%INP0, [%%IN + 8*0]
3477 mov %%INP1, [%%IN + 8*1]
3478 mov %%INP2, [%%IN + 8*2]
3479 mov %%INP3, [%%IN + 8*3]
3481 ;; =====================================================================
;; GHASH lane 0 (previous cipher text), interleaved with AES rounds 1-2
3482 CLMUL_INIT %%KEYP0, %%PREVHI0, %%T4, %%TH, %%TM, %%TL
3483 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3484 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3485 %%T0, %%T1, %%T2, %%T3, \
3486 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 1, \
3487 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3488 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3490 CLMUL_STEP %%KEYP0, %%PREVHI0, %%PREVLO0, %%T4, %%T8, %%T9, %%TH, %%TM, %%TL
3492 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3493 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3494 %%T0, %%T1, %%T2, %%T3, \
3495 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 2, \
3496 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3497 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3499 ;; =====================================================================
;; GHASH lane 1, interleaved with AES rounds 3-4
3501 CLMUL_INIT %%KEYP1, %%PREVHI1, %%T4, %%TH, %%TM, %%TL
3503 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3504 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3505 %%T0, %%T1, %%T2, %%T3, \
3506 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 3, \
3507 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3508 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3510 CLMUL_STEP %%KEYP1, %%PREVHI1, %%PREVLO1, %%T4, %%T8, %%T9, %%TH, %%TM, %%TL
3512 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3513 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3514 %%T0, %%T1, %%T2, %%T3, \
3515 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 4, \
3516 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3517 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3519 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3520 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO1), 1
3521 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI1), 1
3523 ;; =====================================================================
;; GHASH lane 2, interleaved with AES rounds 5-6
3525 CLMUL_INIT %%KEYP2, %%PREVHI2, %%T4, %%T5, %%T6, %%T7
3527 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3528 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3529 %%T0, %%T1, %%T2, %%T3, \
3530 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 5, \
3531 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3532 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3534 CLMUL_STEP %%KEYP2, %%PREVHI2, %%PREVLO2, %%T4, %%T8, %%T9, %%T5, %%T6, %%T7
3536 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3537 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3538 %%T0, %%T1, %%T2, %%T3, \
3539 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 6, \
3540 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3541 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3543 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3544 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO2), 2
3545 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI2), 2
3547 ;; =====================================================================
;; GHASH lane 3, interleaved with AES rounds 7-8
3549 CLMUL_INIT %%KEYP3, %%PREVHI3, %%T4, %%T5, %%T6, %%T7
3551 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3552 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3553 %%T0, %%T1, %%T2, %%T3, \
3554 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 7, \
3555 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3556 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3558 CLMUL_STEP %%KEYP3, %%PREVHI3, %%PREVLO3, %%T4, %%T8, %%T9, %%T5, %%T6, %%T7
3560 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3561 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3562 %%T0, %%T1, %%T2, %%T3, \
3563 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 8, \
3564 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3565 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3567 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3568 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO3), 3
3569 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI3), 3
3571 ;; =====================================================================
3572 ;; load plain/cipher text
3573 ;; - this cannot be done before GHASH is complete (reuses same registers)
3575 VX512LDR %%TEXTL0B03, [%%INP0 + %%DATA_OFFSET + 64*0]
3576 VX512LDR %%TEXTL0B47, [%%INP0 + %%DATA_OFFSET + 64*1]
3577 VX512LDR %%TEXTL1B03, [%%INP1 + %%DATA_OFFSET + 64*0]
3578 VX512LDR %%TEXTL1B47, [%%INP1 + %%DATA_OFFSET + 64*1]
3579 VX512LDR %%TEXTL2B03, [%%INP2 + %%DATA_OFFSET + 64*0]
3580 VX512LDR %%TEXTL2B47, [%%INP2 + %%DATA_OFFSET + 64*1]
3581 VX512LDR %%TEXTL3B03, [%%INP3 + %%DATA_OFFSET + 64*0]
3582 VX512LDR %%TEXTL3B47, [%%INP3 + %%DATA_OFFSET + 64*1]
;; output pointers reuse the GP registers that held the input pointers
3584 mov %%OUTP0, [%%OUT + 8*0]
3585 mov %%OUTP1, [%%OUT + 8*1]
3586 mov %%OUTP2, [%%OUT + 8*2]
3587 mov %%OUTP3, [%%OUT + 8*3]
3589 ;; =====================================================================
;; remaining AES rounds 9 to 14 (round 14 performs the final text XOR)
3590 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3591 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3592 %%T0, %%T1, %%T2, %%T3, \
3593 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 9, \
3594 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3595 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3597 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3598 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3599 %%T0, %%T1, %%T2, %%T3, \
3600 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 10, \
3601 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3602 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3605 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3606 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3607 %%T0, %%T1, %%T2, %%T3, \
3608 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 11, \
3609 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3610 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3611 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3612 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3613 %%T0, %%T1, %%T2, %%T3, \
3614 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 12, \
3615 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3616 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3618 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3619 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3620 %%T0, %%T1, %%T2, %%T3, \
3621 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 13, \
3622 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3623 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3624 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3625 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3626 %%T0, %%T1, %%T2, %%T3, \
3627 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 14, \
3628 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3629 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3633 ;; =====================================================================
3634 ;; =====================================================================
3635 ;; =====================================================================
3637 ;; =====================================================================
3638 ;; first phase of the reduction (barret)
3639 ;; - because of bit ordering, LSB 128 bit word is reduced rather than MSB
3640 ;; - accumulated GHASH in [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3642 vmovdqu64 %%T3, [rel POLY2]
3644 vpclmulqdq %%T4, %%T3, %%PREVLO0, 0x01
3645 vpslldq %%T4, %%T4, 8 ; shift-L 2 DWs
3646 vpxorq %%PREVLO0, %%PREVLO0, %%T4 ; first phase of the reduction complete
3648 ;; =====================================================================
3649 ;; store cipher/plain text
3651 VX512STR [%%OUTP0 + %%DATA_OFFSET + 64*0], %%L0B03
3652 VX512STR [%%OUTP0 + %%DATA_OFFSET + 64*1], %%L0B47
3653 VX512STR [%%OUTP1 + %%DATA_OFFSET + 64*0], %%L1B03
3654 VX512STR [%%OUTP1 + %%DATA_OFFSET + 64*1], %%L1B47
3655 VX512STR [%%OUTP2 + %%DATA_OFFSET + 64*0], %%L2B03
3656 VX512STR [%%OUTP2 + %%DATA_OFFSET + 64*1], %%L2B47
3657 VX512STR [%%OUTP3 + %%DATA_OFFSET + 64*0], %%L3B03
3658 VX512STR [%%OUTP3 + %%DATA_OFFSET + 64*1], %%L3B47
3660 ;; =====================================================================
3661 ;; second phase of the reduction
3662 vpclmulqdq %%T4, %%T3, %%PREVLO0, 0x00
3663 vpsrldq %%T4, %%T4, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
3665 vpclmulqdq %%GHASH, %%T3, %%PREVLO0, 0x10
3666 vpslldq %%GHASH, %%GHASH, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
3668 ;; second phase of the reduction complete
3669 vpternlogq %%GHASH, %%T4, %%PREVHI0, 0x96 ; GHASH = GHASH xor T4 xor PREVHI0
3671 ;; =====================================================================
3673 ;; prepare cipher blocks for the next GHASH round
3674 %ifidn %%ENC_DEC, ENC
;; encryption - cipher text is in the L registers (just stored above)
3675 vpshufb %%L0B03, %%L0B03, %%ZMM_SHFMASK
3676 vpshufb %%L0B47, %%L0B47, %%ZMM_SHFMASK
3677 vpshufb %%L1B03, %%L1B03, %%ZMM_SHFMASK
3678 vpshufb %%L1B47, %%L1B47, %%ZMM_SHFMASK
3679 vpshufb %%L2B03, %%L2B03, %%ZMM_SHFMASK
3680 vpshufb %%L2B47, %%L2B47, %%ZMM_SHFMASK
3681 vpshufb %%L3B03, %%L3B03, %%ZMM_SHFMASK
3682 vpshufb %%L3B47, %%L3B47, %%ZMM_SHFMASK
%else
;; decryption - cipher text is the loaded input text
3684 ;; GHASH is computed over cipher text (use text)
3685 vpshufb %%L0B03, %%TEXTL0B03, %%ZMM_SHFMASK
3686 vpshufb %%L0B47, %%TEXTL0B47, %%ZMM_SHFMASK
3687 vpshufb %%L1B03, %%TEXTL1B03, %%ZMM_SHFMASK
3688 vpshufb %%L1B47, %%TEXTL1B47, %%ZMM_SHFMASK
3689 vpshufb %%L2B03, %%TEXTL2B03, %%ZMM_SHFMASK
3690 vpshufb %%L2B47, %%TEXTL2B47, %%ZMM_SHFMASK
3691 vpshufb %%L3B03, %%TEXTL3B03, %%ZMM_SHFMASK
3692 vpshufb %%L3B47, %%TEXTL3B47, %%ZMM_SHFMASK
%endif
3695 ;; xor encrypted block 0 with GHASH for the next round
3696 vmovdqa64 XWORD(%%T1), XWORD(%%GHASH)
3697 vextracti32x4 XWORD(%%T2), %%GHASH, 1
3698 vextracti32x4 XWORD(%%T3), %%GHASH, 2
3699 vextracti32x4 XWORD(%%T4), %%GHASH, 3
3701 vpxorq %%L0B03, %%T1
3702 vpxorq %%L1B03, %%T2
3703 vpxorq %%L2B03, %%T3
3704 vpxorq %%L3B03, %%T4
3705 %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL_x4
3707 ;;; ===========================================================================
3708 ;;; ===========================================================================
3709 ;;; GHASH the last 8 ciphertext blocks on 4 lanes
;;; ===========================================================================
;;; GHASH_LAST_8x4
;;; - GHASH-es the last 8 cipher text blocks of each of the 4 lanes and
;;;   performs the final polynomial reduction, producing one GHASH per lane
;;;   packed into the %%GHASH ZMM (one 128-bit word per lane)
;;; - NOTE(review): %define lines for params %8 to %10, the %%T/TH/TM/TL
;;;   temporaries, %%L/%%H accumulators, the CLMUL_STEP continuation
;;;   arguments, and the closing %endmacro are not visible in this chunk -
;;;   confirm against the full file
3710 %macro GHASH_LAST_8x4 25
3711 %define %%KEYP0 %1 ; [in] pointer to expanded keys, lane 0
3712 %define %%KEYP1 %2 ; [in] pointer to expanded keys, lane 1
3713 %define %%KEYP2 %3 ; [in] pointer to expanded keys, lane 2
3714 %define %%KEYP3 %4 ; [in] pointer to expanded keys, lane 3
3715 %define %%L0B03 %5 ; [in] clobbered, cipher text, lane 0, blocks 0 to 3 (Y0 already XOR'ed on X1)
3716 %define %%L0B47 %6 ; [in] clobbered, cipher text, lane 0, blocks 4 to 7
3717 %define %%L1B03 %7 ; ...
3721 %define %%L3B03 %11 ; ...
3722 %define %%L3B47 %12 ; [in] clobbered, cipher text, lane 3, blocks 4 to 7
3723 %define %%GHASH %13 ; [out] ghash output
3744 ;; =====================================================================
;; lane 0: GHASH 8 blocks; result lands in L0B03 (low) / L0B47 (high)
3747 CLMUL_INIT %%KEYP0, %%L0B47, %%T4, %%TH, %%TM, %%TL
3748 CLMUL_STEP %%KEYP0, %%L0B47, %%L0B03, \
3752 vmovdqa64 XWORD(%%L), XWORD(%%L0B03)
3753 vmovdqa64 XWORD(%%H), XWORD(%%L0B47)
3755 ;; =====================================================================
;; lane 1
3758 CLMUL_INIT %%KEYP1, %%L1B47, %%T4, %%TH, %%TM, %%TL
3759 CLMUL_STEP %%KEYP1, %%L1B47, %%L1B03, \
3763 vinserti64x2 %%L, XWORD(%%L1B03), 1
3764 vinserti64x2 %%H, XWORD(%%L1B47), 1
3766 ;; =====================================================================
;; lane 2
3769 CLMUL_INIT %%KEYP2, %%L2B47, %%T4, %%TH, %%TM, %%TL
3770 CLMUL_STEP %%KEYP2, %%L2B47, %%L2B03, \
3774 vinserti64x2 %%L, XWORD(%%L2B03), 2
3775 vinserti64x2 %%H, XWORD(%%L2B47), 2
3777 ;; =====================================================================
;; lane 3
3780 CLMUL_INIT %%KEYP3, %%L3B47, %%T4, %%TH, %%TM, %%TL
3781 CLMUL_STEP %%KEYP3, %%L3B47, %%L3B03, \
3785 vinserti64x2 %%L, XWORD(%%L3B03), 3
3786 vinserti64x2 %%H, XWORD(%%L3B47), 3
3788 ;; =====================================================================
3789 ;; =====================================================================
3790 ;; first phase of the reduction <H(hi):L(low)>
3791 ;; - reducing L, rather than H, due to bit ordering
3793 vmovdqu64 %%T3, [rel POLY2]
3795 vpclmulqdq %%T4, %%T3, %%L, 0x01
3796 vpslldq %%T4, %%T4, 8 ; shift-L xmm2 2 DWs
3798 vpxorq %%L, %%L, %%T4 ; first phase of the reduction complete
3800 ;; =====================================================================
3801 ;; second phase of the reduction
3802 vpclmulqdq %%T4, %%T3, %%L, 0x00
3803 vpsrldq %%T4, %%T4, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
3805 vpclmulqdq %%GHASH, %%T3, %%L, 0x10
3806 vpslldq %%GHASH, %%GHASH, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
3808 ;; second phase of the reduction complete
3809 vpternlogq %%GHASH, %%T4, %%H, 0x96 ; GHASH = GHASH xor T4 xor H
3810 ;; =====================================================================
3813 ;;; ===========================================================================
3814 ;;; ===========================================================================
3815 ;;; GCM_ENC_DEC_4x128 Encodes/Decodes given data
3816 ;;; - 4 lanes, 8 blocks at a time (hence 4x128 bytes or 4x8 blocks)
3817 ;;; - assumes that the passed gcm_context_data struct has been initialized by GCM_INIT
3818 ;;; - requires the input data be multiple of 128 bytes
3819 ;;; Input: gcm_key_data struct *GDATA_KEY[4]
3820 ;;; gcm_context_data *GDATA_CTX[4]
3821 ;;; input text PLAIN_CYPH_IN[4]
3822 ;;; input text length (PLAIN_CYPH_LEN) and
3823 ;;; whether encoding or decoding (ENC_DEC).
3824 ;;; Output: A cipher of the given plain text CYPH_PLAIN_OUT[4]
3825 ;;; updated GDATA_CTX[4]
3826 ;;; Linux clobbers: rax, rbx, rcx, rdx, rbp, r8-r15, zmm0-zmm31
3827 ;;; Windows clobbers: rax, rbx, rdi ,rsi, rbp, r8-r15, zmm0-zmm31
3828 ;;; ===========================================================================
;;; ===========================================================================
;;; GCM_ENC_DEC_4x128
;;; - top level 4-lane, 8-blocks-at-a-time GCM encrypt/decrypt update
;;; - NOTE(review): %define lines for %%TPTR0-3, %%KPTR0-3, %%GPR0-3,
;;;   %%T1-%%T18, the non-win64 L register mappings, several cmp/sub
;;;   instructions ahead of the cmova/ja/cmovb/jnz below, and the label
;;;   definitions for %%_encrypt_by_8_x4, %%_encrypt_done_x4 and
;;;   %%_enc_dec_done_x4 are not visible in this chunk - confirm against
;;;   the full file
3829 %macro GCM_ENC_DEC_4x128 3
3830 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded keys
3831 %define %%PLAIN_CYPH_LEN %2 ; [in] length of the text to process (multiple of 128 bytes)
3832 %define %%ENC_DEC %3 ; [in] ENC (encrypt) or DEC (decrypt) selector
3834 %define %%GDATA_KEY %%STATE + _gcm_args_keys
3835 %define %%GDATA_CTX %%STATE + _gcm_args_ctx
3836 %define %%CYPH_PLAIN_OUT %%STATE + _gcm_args_out
3837 %define %%PLAIN_CYPH_IN %%STATE + _gcm_args_in
3839 %define %%LEN_REG %%PLAIN_CYPH_LEN
3840 %define %%DATA_OFFSET r14 ;; @note: on windows this reg is used to retrieve stack args
3842 ;;; ===========================================================================
3843 ;;; register mappings within the macro
3855 %ifidn __OUTPUT_FORMAT__, win64
3867 %define %%L0B03 zmm0
3868 %define %%L0B47 zmm1
3869 %define %%L1B03 zmm2
3870 %define %%L1B47 zmm3
3871 %define %%L2B03 zmm4
3872 %define %%L2B47 zmm5
3873 %define %%L3B03 zmm6
3874 %define %%L3B47 zmm7
3895 %define %%GHASH zmm26
3897 %define %%CTRL0 zmm27
3898 %define %%CTRL1 zmm28
3899 %define %%CTRL2 zmm29
3900 %define %%CTRL3 zmm30
3902 %define %%ZMM_SHUF_MASK zmm31
3904 ;;; ===========================================================================
3905 ;;; virtual register mappings
3907 %define %%PREVLO0 %%T11 ; 4 lanes x 8 blocks of cipher text for GHASH
3908 %define %%PREVHI0 %%T12
3909 %define %%PREVLO1 %%T13
3910 %define %%PREVHI1 %%T14
3911 %define %%PREVLO2 %%T15
3912 %define %%PREVHI2 %%T16
3913 %define %%PREVLO3 %%T17
3914 %define %%PREVHI3 %%T18
3916 ;;; ===========================================================================
;; nothing to do for zero length
3918 or %%LEN_REG, %%LEN_REG
3919 jz %%_enc_dec_done_x4
;; INITIAL_BLOCKS_x4 below consumes the first 128 bytes per lane
3921 mov %%DATA_OFFSET, 128
3923 ;; load GCM CTX pointers for 4 lanes
3924 mov %%TPTR0, [%%GDATA_CTX + (0*8)]
3925 mov %%TPTR1, [%%GDATA_CTX + (1*8)]
3926 mov %%TPTR2, [%%GDATA_CTX + (2*8)]
3927 mov %%TPTR3, [%%GDATA_CTX + (3*8)]
3929 ;; load common constants used in the code
3930 vmovdqa64 %%ZMM_SHUF_MASK, [rel SHUF_MASK]
3932 ;; Update length of data processed
3933 add [%%TPTR0 + InLen], %%LEN_REG
3934 add [%%TPTR1 + InLen], %%LEN_REG
3935 add [%%TPTR2 + InLen], %%LEN_REG
3936 add [%%TPTR3 + InLen], %%LEN_REG
3938 ;; extract current hash values from 4 lanes
3939 vmovdqu64 XWORD(%%GHASH), [%%TPTR0 + AadHash]
3940 vinserti64x2 %%GHASH, [%%TPTR1 + AadHash], 1
3941 vinserti64x2 %%GHASH, [%%TPTR2 + AadHash], 2
3942 vinserti64x2 %%GHASH, [%%TPTR3 + AadHash], 3
3944 ;; lift CTR set from initial_blocks to here
;; broadcast each lane's current counter to all 4 lanes of a ZMM, then add
;; 1..4 / 5..8 to form 8 consecutive counter blocks per lane (LE)
3945 vmovdqa64 %%T1, [rel ddq_add_1234]
3946 vmovdqa64 %%T2, [rel ddq_add_5678]
3947 vbroadcastf64x2 %%CTRL0, [%%TPTR0 + CurCount]
3948 vbroadcastf64x2 %%CTRL1, [%%TPTR1 + CurCount]
3949 vbroadcastf64x2 %%CTRL2, [%%TPTR2 + CurCount]
3950 vbroadcastf64x2 %%CTRL3, [%%TPTR3 + CurCount]
3951 vpaddd %%L0B03, %%CTRL0, %%T1
3952 vpaddd %%L1B03, %%CTRL1, %%T1
3953 vpaddd %%L2B03, %%CTRL2, %%T1
3954 vpaddd %%L3B03, %%CTRL3, %%T1
3955 vpaddd %%L0B47, %%CTRL0, %%T2
3956 vpaddd %%L1B47, %%CTRL1, %%T2
3957 vpaddd %%L2B47, %%CTRL2, %%T2
3958 vpaddd %%L3B47, %%CTRL3, %%T2
;; CTRL now tracks the highest 4 counter blocks per lane
3959 vmovdqa64 %%CTRL0, %%L0B47
3960 vmovdqa64 %%CTRL1, %%L1B47
3961 vmovdqa64 %%CTRL2, %%L2B47
3962 vmovdqa64 %%CTRL3, %%L3B47
3964 ;; load GCM key pointers for 4 lanes
3965 mov %%KPTR0, [%%GDATA_KEY + (0*8)]
3966 mov %%KPTR1, [%%GDATA_KEY + (1*8)]
3967 mov %%KPTR2, [%%GDATA_KEY + (2*8)]
3968 mov %%KPTR3, [%%GDATA_KEY + (3*8)]
3971 ;; run cipher only over the first 8 blocks
3972 INITIAL_BLOCKS_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
3973 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
3974 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
3975 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3976 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3978 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, \
3979 %%T9, %%T10, %%T11, %%T12, \
3980 %%ZMM_SHUF_MASK, %%ENC_DEC
;; remaining length already reduced here (update not visible in this chunk)
3984 jz %%_encrypt_done_x4
;; capture LS bytes of each lane's counter for overflow tracking
3986 vmovq %%GPR0, XWORD(%%CTRL0)
3987 vmovq %%GPR1, XWORD(%%CTRL1)
3988 vmovq %%GPR2, XWORD(%%CTRL2)
3989 vmovq %%GPR3, XWORD(%%CTRL3)
3996 ;; shuffle the counters to BE
3997 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
3998 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
3999 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4000 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
4002 %%_encrypt_by_8_parallel_x4:
4003 ;; get max counter value
4005 cmova %%GPR1, %%GPR0
4007 cmova %%GPR1, %%GPR2
4009 cmova %%GPR1, %%GPR3
4010 ;; at this stage %%GPR1 includes max 8-bit LS counter from 4 lanes
4012 ;; if max counter is above 244 then overflow will occur
4014 ja %%_encrypt_by_8_overflow_x4
4016 ;; (256 - 8) because we process 8 blocks at a time
4017 ;; Max number of blocks that can be processed in a lane
4018 ;; without shuffling is (256 - 8)
4019 mov %%GPR0, (256 - 8)
4022 ;; GPR0 holds number of iterations based on remaining blocks before overflow
4024 ;; get number of iterations from the remaining byte length
4025 mov %%GPR1, %%LEN_REG
4028 ;; pick the smallest one (GPR0 will be the counter)
4030 cmovb %%GPR0, %%GPR1
4033 ;; copy previously encrypted blocks for GHASH
4034 vmovdqa64 %%PREVLO0, %%L0B03
4035 vmovdqa64 %%PREVHI0, %%L0B47
4036 vmovdqa64 %%PREVLO1, %%L1B03
4037 vmovdqa64 %%PREVHI1, %%L1B47
4038 vmovdqa64 %%PREVLO2, %%L2B03
4039 vmovdqa64 %%PREVHI2, %%L2B47
4040 vmovdqa64 %%PREVLO3, %%L3B03
4041 vmovdqa64 %%PREVHI3, %%L3B47
4043 ;; - no byte overflow and no shuffling required
;; BE counter increments: add 4/8 directly on big-endian counter blocks
4044 vmovdqa64 %%T1, [rel ddq_addbe_4444]
4045 vmovdqa64 %%T2, [rel ddq_addbe_8888]
4047 vpaddd %%L0B03, %%CTRL0, %%T1
4048 vpaddd %%L1B03, %%CTRL1, %%T1
4049 vpaddd %%L2B03, %%CTRL2, %%T1
4050 vpaddd %%L3B03, %%CTRL3, %%T1
4051 vpaddd %%L0B47, %%CTRL0, %%T2
4052 vpaddd %%L1B47, %%CTRL1, %%T2
4053 vpaddd %%L2B47, %%CTRL2, %%T2
4054 vpaddd %%L3B47, %%CTRL3, %%T2
4056 vmovdqa64 %%CTRL0, %%L0B47
4057 vmovdqa64 %%CTRL1, %%L1B47
4058 vmovdqa64 %%CTRL2, %%L2B47
4059 vmovdqa64 %%CTRL3, %%L3B47
4061 GHASH_8_ENCRYPT_8_PARALLEL_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
4062 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4063 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
4065 %%CTRL0, %%CTRL1, %%CTRL2, %%CTRL3, \
4066 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4067 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4069 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, \
4070 %%T8, %%T9, %%T10, \
4071 %%PREVLO0, %%PREVHI0, %%PREVLO1, %%PREVHI1, \
4072 %%PREVLO2, %%PREVHI2, %%PREVLO3, %%PREVHI3, \
4073 %%ZMM_SHUF_MASK, %%ENC_DEC
4074 add %%DATA_OFFSET, 128
4077 jnz %%_encrypt_by_8_x4
4079 %%_encrypt_by_8_overflow_x4:
4080 ;; shuffle the counters back to LE
4081 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
4082 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
4083 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4084 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
;; done if no text left
4086 or %%LEN_REG, %%LEN_REG
4087 jz %%_encrypt_done_x4
4089 ;; copy previously encrypted blocks for GHASH
4090 vmovdqa64 %%PREVLO0, %%L0B03
4091 vmovdqa64 %%PREVHI0, %%L0B47
4092 vmovdqa64 %%PREVLO1, %%L1B03
4093 vmovdqa64 %%PREVHI1, %%L1B47
4094 vmovdqa64 %%PREVLO2, %%L2B03
4095 vmovdqa64 %%PREVHI2, %%L2B47
4096 vmovdqa64 %%PREVLO3, %%L3B03
4097 vmovdqa64 %%PREVHI3, %%L3B47
4099 ;; prepare new counter blocks in LE
;; LE adds propagate the byte carry that BE adds could not
4100 vmovdqa64 %%T1, [rel ddq_add_4444]
4101 vmovdqa64 %%T2, [rel ddq_add_8888]
4102 vpaddd %%L0B03, %%CTRL0, %%T1
4103 vpaddd %%L1B03, %%CTRL1, %%T1
4104 vpaddd %%L2B03, %%CTRL2, %%T1
4105 vpaddd %%L3B03, %%CTRL3, %%T1
4106 vpaddd %%L0B47, %%CTRL0, %%T2
4107 vpaddd %%L1B47, %%CTRL1, %%T2
4108 vpaddd %%L2B47, %%CTRL2, %%T2
4109 vpaddd %%L3B47, %%CTRL3, %%T2
4111 ;; save the counter to GPR's for calculation of number of loops
4112 vmovq %%GPR0, XWORD(%%L0B47)
4113 vmovq %%GPR1, XWORD(%%L1B47)
4114 vmovq %%GPR2, XWORD(%%L2B47)
4115 vmovq %%GPR3, XWORD(%%L3B47)
4122 ;; convert counter blocks to BE
4123 vpshufb %%L0B03, %%ZMM_SHUF_MASK
4124 vpshufb %%L0B47, %%ZMM_SHUF_MASK
4125 vpshufb %%L1B03, %%ZMM_SHUF_MASK
4126 vpshufb %%L1B47, %%ZMM_SHUF_MASK
4127 vpshufb %%L2B03, %%ZMM_SHUF_MASK
4128 vpshufb %%L2B47, %%ZMM_SHUF_MASK
4129 vpshufb %%L3B03, %%ZMM_SHUF_MASK
4130 vpshufb %%L3B47, %%ZMM_SHUF_MASK
4132 ;; update 4 lane CTR in BE
4133 vmovdqa64 %%CTRL0, %%L0B47
4134 vmovdqa64 %%CTRL1, %%L1B47
4135 vmovdqa64 %%CTRL2, %%L2B47
4136 vmovdqa64 %%CTRL3, %%L3B47
4138 GHASH_8_ENCRYPT_8_PARALLEL_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
4139 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4140 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
4142 %%CTRL0, %%CTRL1, %%CTRL2, %%CTRL3, \
4143 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4144 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4146 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, \
4147 %%T8, %%T9, %%T10, \
4148 %%PREVLO0, %%PREVHI0, %%PREVLO1, %%PREVHI1, \
4149 %%PREVLO2, %%PREVHI2, %%PREVLO3, %%PREVHI3, \
4150 %%ZMM_SHUF_MASK, %%ENC_DEC
4151 add %%DATA_OFFSET, 128
4153 jnz %%_encrypt_by_8_parallel_x4
4155 ;; shuffle the counters back to LE
4156 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
4157 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
4158 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4159 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
;; GHASH the final 8 cipher text blocks of each lane
4162 GHASH_LAST_8x4 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4163 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4164 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4166 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, \
4167 %%T7, %%T8, %%T9, %%T10, %%T11, %%T12
;; reload CTX pointers (the GP registers were reused meanwhile)
4170 mov %%TPTR0, [%%GDATA_CTX + (0*8)]
4171 mov %%TPTR1, [%%GDATA_CTX + (1*8)]
4172 mov %%TPTR2, [%%GDATA_CTX + (2*8)]
4173 mov %%TPTR3, [%%GDATA_CTX + (3*8)]
4175 ;; save current counter blocks
4176 vextracti32x4 [%%TPTR0 + CurCount], %%CTRL0, 3
4177 vextracti32x4 [%%TPTR1 + CurCount], %%CTRL1, 3
4178 vextracti32x4 [%%TPTR2 + CurCount], %%CTRL2, 3
4179 vextracti32x4 [%%TPTR3 + CurCount], %%CTRL3, 3
4181 ;; save current hash values
4182 vmovdqu64 [%%TPTR0 + AadHash], XWORD(%%GHASH)
4183 vextracti64x2 [%%TPTR1 + AadHash], %%GHASH, 1
4184 vextracti64x2 [%%TPTR2 + AadHash], %%GHASH, 2
4185 vextracti64x2 [%%TPTR3 + AadHash], %%GHASH, 3
4188 ;; increment the input / output pointers
4189 ;; - output and input pointers are next to one another in the structure
4190 ;; so updating all 8 pointers with a single zmm
4191 vpbroadcastq %%T1, %%DATA_OFFSET ; DATA_OFFSET should be equal to length
4192 vpaddq %%T2, %%T1, [%%CYPH_PLAIN_OUT]
4193 vmovdqu64 [%%CYPH_PLAIN_OUT], %%T2
4194 vmovdqu64 YWORD(%%T3), [%%STATE + _gcm_lens]
4195 vpsubq YWORD(%%T3), YWORD(%%T3), YWORD(%%T1)
4196 vmovdqu64 [%%STATE + _gcm_lens], YWORD(%%T3)
4201 %endmacro ; GCM_ENC_DEC_4x128
4203 ;;; ===========================================================================
4204 ;;; ===========================================================================
4205 ;;; GCM_COMPLETE_x4 - completes one of MB jobs
4206 ;;; Clobbers rax, r9-r12, r14, r15 and zmm0-zmm31
4207 ;;; ===========================================================================
4208 %macro GCM_COMPLETE_x4 3
4209 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4210 %define %%IDX %2 ; [in] lane index to be completed
4211 %define %%ENC_DEC %3 ; [in] cipher direction selector (ENC or DEC)
4213 %ifidn __OUTPUT_FORMAT__, win64
4214 %define %%GDATA_KEY rdi
4215 %define %%GDATA_CTX rsi
4216 %define %%CYPH_PLAIN_OUT r11
4217 %define %%PLAIN_CYPH_IN r9
4219 %define %%GDATA_KEY arg3
4220 %define %%GDATA_CTX arg4
4221 %define %%CYPH_PLAIN_OUT r8
4222 %define %%PLAIN_CYPH_IN r9
;; NOTE: the three defines below all alias rbp; their live ranges do not
;; overlap (length is consumed before the tag pointer/length are loaded)
4226 %define %%PLAIN_CYPH_LEN rbp
4227 %define %%AUTH_TAG rbp
4228 %define %%AUTH_TAGLEN rbp
4232 %define %%DATA_OFFSET rbx
4234 mov %%PLAIN_CYPH_LEN, [%%STATE + _gcm_lens + %%IDX*8]
4235 mov %%GDATA_KEY, [%%STATE + _gcm_args_keys + %%IDX*8]
4236 mov %%GDATA_CTX, [%%STATE + _gcm_args_ctx + %%IDX*8]
4237 mov %%PLAIN_CYPH_IN, [%%STATE + _gcm_args_in + %%IDX*8]
4238 mov %%CYPH_PLAIN_OUT, [%%STATE + _gcm_args_out + %%IDX*8]
4240 vmovdqu64 xmm16, [%%GDATA_KEY + HashKey]
4241 vmovdqu64 xmm17, [%%GDATA_CTX + AadHash]
4243 ;;; ===========================================================================
4244 ;;; finalize last blocks (<128 bytes)
4247 ; calculate the number of 16byte blocks in the message
4248 ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
4249 ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
4250 ; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
4252 or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN ; sets ZF; nothing left to cipher if zero
4253 je %%_enc_dec_done_x4
4255 xor %%DATA_OFFSET, %%DATA_OFFSET
4257 ;; Update length of data processed
4258 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
4260 vmovdqa64 xmm13, xmm16 ; load HashKey
4261 vmovdqa64 xmm8, xmm17 ; load AadHash; xmm8 is hash_in for gcm_enc_dec_small
4262 vmovdqu xmm9, [%%GDATA_CTX + CurCount]
4264 ;; Save the amount of data left to process in r13
4265 mov r13, %%PLAIN_CYPH_LEN
4267 ;; Determine how many blocks to process in INITIAL
4272 ;; Process one additional block in INITIAL if there is a partial block
4276 shr r10, 4 ; 0 - if 4LSB of length are all zero, 1 - otherwise
4277 add r12, r10 ; process an additional INITIAL block if r10 is not zero
4279 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
4280 %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
4281 r13, r12, xmm9, xmm14, multi_call
4284 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; current_counter = xmm9
4285 vmovdqa64 xmm17, xmm14 ; AadHash = xmm14
4288 ;;; ===========================================================================
4291 ;; Start AES as early as possible
4292 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
4293 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
4295 ;; If the GCM function is called as a single function call rather
4296 ;; than invoking the individual parts (init, update, finalize) we
4297 ;; can remove a write to read dependency on AadHash.
4298 vmovdqa64 xmm14, xmm17 ; xmm14 = AadHash
4299 vmovdqa64 xmm13, xmm16 ; load HashKey
4301 ;; Encrypt the final partial block. If we did this as a single call then
4302 ;; the partial block was handled in the main GCM_ENC_DEC macro.
4303 cmp qword [%%GDATA_CTX + PBlockLen], 0
4304 je %%_partial_done_x4
4306 ;; xmm14: hash value [in/out]
4307 ;; xmm13: hash key [in]
4308 ;; xmm0, xmm10, xmm11, xmm5, xmm6 - temporary registers
4309 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
4310 vmovdqa64 xmm17, xmm14 ; AadHash = xmm14
4313 mov %%GPR, [%%GDATA_CTX + AadLen] ; aadLen (number of bytes)
4314 shl %%GPR, 3 ; convert into number of bits
4315 vmovd xmm15, DWORD(%%GPR) ; len(A) in xmm15
4317 mov %%GPR, [%%GDATA_CTX + InLen]
4318 shl %%GPR, 3 ; len(C) in bits (*8)
4320 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
4321 vpor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
4323 ;; prep auth_tag store mask
4324 mov %%AUTH_TAGLEN, [%%STATE + _gcm_args_taglen + %%IDX*8]
4325 lea %%GPR, [rel byte_len_to_mask_table]
4326 kmovw k1, [%%GPR + %%AUTH_TAGLEN*2] ; k1 = byte mask for requested tag length
4327 mov %%AUTH_TAG, [%%STATE + _gcm_args_tag + %%IDX*8]
4329 ;; XOR current hash value with the next block xmm15
4332 ;; xmm14: hash value [in/out]
4333 ;; xmm13: hash key [in]
4334 ;; xmm0, xmm10, xmm11, xmm5, xmm6 - temporary registers
4335 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
4336 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
4338 vpxor xmm9, xmm9, xmm14 ; T = E(K, Y0) xor GHASH
4341 vmovdqu8 [%%AUTH_TAG]{k1}, xmm9 ; store TAG
4342 vmovdqu64 [%%GDATA_CTX + AadHash], xmm17 ; store AadHash
4344 ;; put the lane back on free list
4345 mov rax, [%%STATE + _gcm_unused_lanes]
4348 mov [%%STATE + _gcm_unused_lanes], rax
4350 ;; mark job as complete
4351 mov rax, [%%STATE + _gcm_job_in_lane + 8*%%IDX]
4352 or dword [rax + _status], STS_COMPLETED
4353 ;; clear job pointer in this lane
4354 mov qword [%%STATE + _gcm_job_in_lane + 8*%%IDX], 0
4355 ;; return finished job (rax)
4357 %endmacro ; GCM_COMPLETE_x4
4360 ;;; ===========================================================================
4361 ;;; ===========================================================================
4362 ;;; GCM_FINALIZE_x4:
4363 ;;; - runs all lanes in parallel for %LEN
4364 ;;; - completes selected lane (any outstanding bytes < 128 bytes)
4365 ;;; - returns pointer of completed JOB
4366 ;;; Clobbers rax, r9-r12, r14, r15 and zmm0-zmm31
4367 ;;; ===========================================================================
4368 %macro GCM_FINALIZE_x4 4
4369 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4370 %define %%IDX %2 ; [in] lane index to be completed
4371 %define %%LEN %3 ; [in] common length to be processed across all lanes
4372 %define %%ENC_DEC %4 ; [in] cipher direction selector (ENC or DEC)
4374 %%_gcm_finalize_4x128:
4375 mov [rsp + 0*8], %%IDX ; save %%IDX as it will get clobbered
4378 GCM_ENC_DEC_4x128 %%STATE, arg2, %%ENC_DEC
4380 %%_gcm_complete_min_lane:
4381 mov arg2, [rsp + 0*8] ; restore %%IDX
4382 GCM_COMPLETE_x4 %%STATE, arg2, %%ENC_DEC
4383 %endmacro ; GCM_FINALIZE_x4
4384 ;;; ===========================================================================
4386 ;;; ===========================================================================
4387 ;;; ===========================================================================
4389 ;;; - finds min not null lane
4390 ;;; - replicates non_null data across null lanes
4391 ;;; - returns min length lane index and length
4392 ;;; ===========================================================================
4393 %macro GCM_FLUSH_MB 3
4394 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4395 %define %%IDX %2 ; [out] lane index to be completed
4396 %define %%LEN %3 ; [out] common length to be processed across all lanes
4398 ;; put max length into null lanes
4399 vmovdqu64 ymm0, [%%STATE + _gcm_job_in_lane]
4401 vpcmpq k2, ymm0, ymm1, 0 ; EQ
4403 kmovq rax, k2 ; k2 = mask for null lanes
4405 kmovq k1, rax ; k1 = mask for not null lanes (~k2)
4407 vmovdqu64 ymm2, [%%STATE + _gcm_lens]
4408 vbroadcastf64x2 ymm4, [rel ALL_F]
4409 vporq ymm2{k2}, ymm2, ymm4 ; null lanes get all-ones (max) length
4411 ;; find min lane & index
4412 vpsllq ymm3, ymm2, 2 ; length << 2, making room for 2-bit lane index
4413 vporq ymm3, ymm3, [rel index_to_lane4]
4414 vextracti32x4 xmm2, ymm3, 1
4415 vpminuq xmm2, xmm3, xmm2
4416 vpsrldq xmm3, xmm2, 8
4417 vpminuq xmm2, xmm3, xmm2
4423 ;; %%LEN - min length
4424 ;; %%IDX - lane index
4426 ;; load context structure content from the non-null lane
4427 ;; it is 88 bytes long (64 + 24)
4431 mov r10, [%%STATE + _gcm_args_ctx + 8*%%IDX]
4432 vmovdqu64 zmm7, [r10] ; zmm7 = ctx bytes 0..63 (must stay intact until copied out below)
4433 vmovdqu64 ymm11{k3}, [r10 + 64] ; ymm11 = ctx bytes 64..87 (k3 presumably selects 3 qwords - set up earlier)
;; BUGFIX: the pointer-array snapshots below previously used ymm7/ymm8/ymm9.
;; An EVEX write to ymm7 zeroes bits 511:256 of zmm7 and replaces its low half,
;; destroying the saved context before the per-lane copies further down.
;; Use ymm12-ymm14 instead so the snapshots do not alias zmm7/ymm11.
4435 vmovdqu64 ymm12, [%%STATE + _gcm_args_in]
4436 vmovdqu64 ymm13, [%%STATE + _gcm_args_out]
4437 vmovdqu64 ymm14, [%%STATE + _gcm_args_keys]
4438 mov r10, [%%STATE + _gcm_args_in + 8*%%IDX]
4439 mov r11, [%%STATE + _gcm_args_out + 8*%%IDX]
4440 mov r12, [%%STATE + _gcm_args_keys + 8*%%IDX]
4441 ;; r10 = (min lane) valid in ptr
4442 ;; r11 = (min lane) valid out ptr
4443 ;; r12 = (min lane) valid keys ptr
4445 ;; store valid in/out/key pointers to empty lanes
4446 vpbroadcastq ymm4, r10
4447 vpbroadcastq ymm5, r11
4448 vpbroadcastq ymm6, r12
4450 vmovdqa64 ymm4{k1}, ymm12 ; keep the original pointers in non-null lanes
4451 vmovdqa64 ymm5{k1}, ymm13
4452 vmovdqa64 ymm6{k1}, ymm14
4454 vmovdqu64 [%%STATE + _gcm_args_in], ymm4
4455 vmovdqu64 [%%STATE + _gcm_args_out], ymm5
4456 vmovdqu64 [%%STATE + _gcm_args_keys], ymm6
4458 ;; copy valid context into empty lanes
4459 kmovq rax, k2 ; null lane mask to rax
4461 jz %%_copy_ctx_lane1
4462 mov r10, [%%STATE + _gcm_args_ctx + 8*0]
4463 vmovdqu64 [r10], zmm7
4464 vmovdqu64 [r10 + 64]{k3}, ymm11
4467 jz %%_copy_ctx_lane2
4468 mov r10, [%%STATE + _gcm_args_ctx + 8*1]
4469 vmovdqu64 [r10], zmm7
4470 vmovdqu64 [r10 + 64]{k3}, ymm11
4473 jz %%_copy_ctx_lane3
4474 mov r10, [%%STATE + _gcm_args_ctx + 8*2]
4475 vmovdqu64 [r10], zmm7
4476 vmovdqu64 [r10 + 64]{k3}, ymm11
4480 mov r10, [%%STATE + _gcm_args_ctx + 8*3]
4481 vmovdqu64 [r10], zmm7
4482 vmovdqu64 [r10 + 64]{k3}, ymm11
4485 %endmacro ; GCM_FLUSH_MB
4486 ;;; ===========================================================================
4488 ;;; ===========================================================================
4489 ;;; ===========================================================================
4491 ;;; - finds free lane and populates it with data from JOB
4492 ;;; - if all lanes populated then finds min common length
4493 ;;; - returns min length lane index and size
4494 ;;; ===========================================================================
4495 %macro GCM_SUBMIT_MB 4
4496 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4497 %define %%JOB %2 ; [in] lane index to be completed / [out] index
4498 %define %%LEN %3 ; [out] common length to be processed across all lanes
4499 %define %%ENC_DEC %4 ; [in] encrypt / decrypt selector
4502 %define %%RET_IDX %%JOB ; return index reuses the JOB register
4503 %ifidn __OUTPUT_FORMAT__, win64
4509 mov rbx, [%%STATE + _gcm_unused_lanes]
4513 mov [%%STATE + _gcm_unused_lanes], rbx
4515 ;; copy job data into the lane
4516 mov [%%STATE + _gcm_job_in_lane + 8*%%IDX], %%JOB
4518 mov r9, [%%JOB + _aes_enc_key_expanded]
4519 mov [%%STATE + _gcm_args_keys + 8*%%IDX], r9
4521 mov rax, [%%JOB + _src]
4522 add rax, [%%JOB + _cipher_start_src_offset_in_bytes]
4523 mov [%%STATE + _gcm_args_in + 8*%%IDX], rax
4525 mov rax, [%%JOB + _dst]
4526 mov [%%STATE + _gcm_args_out + 8*%%IDX], rax
4528 mov rax, [%%JOB + _auth_tag_output]
4529 mov [%%STATE + _gcm_args_tag + 8*%%IDX], rax
4531 mov rax, [%%JOB + _auth_tag_output_len_in_bytes]
4532 mov [%%STATE + _gcm_args_taglen + 8*%%IDX], rax
;; insert this job's length into the lane length vector:
;; k2 keeps the other lanes, k1 selects the new lane
4534 vpbroadcastq ymm15, [%%JOB + _msg_len_to_cipher_in_bytes]
4536 lea rax, [rel index_to_lane4_mask]
4537 kmovw k2, [rax + (index_to_lane4_not_mask - index_to_lane4_mask) + %%IDX*2]
4538 kmovw k1, [rax + %%IDX*2]
4539 vmovdqu64 ymm14{k2}{z}, [%%STATE + _gcm_lens]
4540 vporq ymm14{k1}, ymm14, ymm15
4541 vmovdqu64 [%%STATE + _gcm_lens], ymm14
4542 vmovdqu64 ymm31, ymm14 ; keep a copy of the lane lengths for the min search below
4545 mov r13, [%%JOB + _iv]
4546 mov r14, [%%JOB + _gcm_aad]
4547 mov rax, [%%JOB + _gcm_aad_len]
4548 mov %%LCTX, [%%STATE + _gcm_args_ctx + 8*%%IDX]
4555 ;; r10 to 12 - temporary GPR's
4556 GCM_INIT r9, %%LCTX, r13, r14, rax, r10, r11, r12
4558 ;; check if all lanes populated
4561 %%_gcm_ooo_not_ready:
4562 xor rax, rax ; return NULL
4563 jmp %%_gcm_submit_return
4566 ;; find min lane & index
4567 vpsllq ymm2, ymm31, 2 ; length << 2, making room for 2-bit lane index
4568 vporq ymm2, ymm2, [rel index_to_lane4]
4569 vextracti32x4 xmm3, ymm2, 1
4570 vpminuq xmm2, xmm3, xmm2
4571 vpsrldq xmm3, xmm2, 8
4572 vpminuq xmm2, xmm3, xmm2
4574 mov %%RET_IDX, %%LEN
4578 ;; %%LEN - min length
4579 ;; %%RET_IDX - lane index
4581 ;; finalize puts returned job into RAX
4583 ;; arg2 - min_lane_idx
4586 GCM_FINALIZE_x4 arg1, arg2, arg3, %%ENC_DEC
4587 ;; rax = finished job pointer
4588 %%_gcm_submit_return:
4590 %endmacro ; GCM_SUBMIT_MB
4591 ;;; ===========================================================================
4593 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4594 ; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
4595 ; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
4596 ; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
4597 ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
4598 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4599 %macro GCM_COMPLETE 6
4600 %define %%GDATA_KEY %1 ; [in] key data pointer
4601 %define %%GDATA_CTX %2 ; [in] context data pointer
4602 %define %%AUTH_TAG %3 ; [out] authentication tag pointer
4603 %define %%AUTH_TAG_LEN %4 ; [in] authentication tag length in bytes
4604 %define %%ENC_DEC %5 ; [in] ENC/DEC selector
4605 %define %%INSTANCE_TYPE %6 ; [in] single_call / multi_call
4606 %define %%PLAIN_CYPH_LEN rax
4608 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
4609 ;; Start AES as early as possible
4610 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
4611 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
4613 %ifidn %%INSTANCE_TYPE, multi_call
4614 ;; If the GCM function is called as a single function call rather
4615 ;; than invoking the individual parts (init, update, finalize) we
4616 ;; can remove a write to read dependency on AadHash.
4617 vmovdqu xmm14, [%%GDATA_CTX + AadHash]
4619 ;; Encrypt the final partial block. If we did this as a single call then
4620 ;; the partial block was handled in the main GCM_ENC_DEC macro.
4621 mov r12, [%%GDATA_CTX + PBlockLen]
4626 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
4627 vmovdqu [%%GDATA_CTX + AadHash], xmm14
4633 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
4634 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
4636 shl r12, 3 ; convert into number of bits
4637 vmovd xmm15, r12d ; len(A) in xmm15
4639 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
4640 vmovq xmm1, %%PLAIN_CYPH_LEN
4641 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
4642 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
4645 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
4646 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
4648 vpxor xmm9, xmm9, xmm14 ; T = E(K, Y0) xor GHASH
4652 mov r10, %%AUTH_TAG ; r10 = authTag
4653 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
4664 simd_store_avx r10, xmm9, r11, r12, rax ; variable-length tag store
4665 jmp %%_return_T_done
4669 jmp %%_return_T_done
4673 vpsrldq xmm9, xmm9, 8 ; shift upper 8 tag bytes down for the partial store
4676 jmp %%_return_T_done
4681 %endmacro ; GCM_COMPLETE
4684 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4685 ;void aes_gcm_precomp_128_vaes_avx512 /
4686 ; aes_gcm_precomp_192_vaes_avx512 /
4687 ; aes_gcm_precomp_256_vaes_avx512
4688 ; (struct gcm_key_data *key_data)
4689 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4690 MKGLOBAL(FN_NAME(precomp,_),function,)
4701 sub rsp, VARIABLE_OFFSET
4702 and rsp, ~63 ; align rsp to 64 bytes
4704 %ifidn __OUTPUT_FORMAT__, win64
4705 ; only xmm6 needs to be maintained
4706 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
4710 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey = E(K, 0^128)
4712 vpshufb xmm6, [rel SHUF_MASK] ; byte-swap HashKey for GHASH use
4713 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
;; (xmm1/xmm2 carry copies of the hash key - presumably set up just above; verify in full source)
4715 vpsllq xmm6, xmm6, 1 ; shift each 64-bit half left by one
4716 vpsrlq xmm2, xmm2, 63 ; extract the carried-out top bits
4718 vpslldq xmm2, xmm2, 8 ; position carry for the upper qword
4719 vpsrldq xmm1, xmm1, 8 ; carry out of the whole 128-bit value
4720 vpor xmm6, xmm6, xmm2 ; complete the 128-bit left shift
4722 vpshufd xmm2, xmm1, 00100100b
4723 vpcmpeqd xmm2, [rel TWOONE] ; all-ones if there was a carry out
4724 vpand xmm2, xmm2, [rel POLY] ; select reduction polynomial on carry
4725 vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
4726 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4727 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
4730 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
4732 %ifidn __OUTPUT_FORMAT__, win64
4733 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
4744 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4745 ;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
4746 ; (const struct gcm_key_data *key_data,
4747 ; struct gcm_context_data *context_data,
4751 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4752 MKGLOBAL(FN_NAME(init,_),function,)
4756 %ifidn __OUTPUT_FORMAT__, win64
4760 ; xmm6:xmm15 need to be maintained for Windows
;; NOTE(review): legacy (non-VEX) movdqu used here for the save/restore - confirm intended
4762 movdqu [rsp + 0*16], xmm6
4765 GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12
4767 %ifidn __OUTPUT_FORMAT__, win64
4768 movdqu xmm6 , [rsp + 0*16]
4778 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4779 ;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
4780 ; aes_gcm_enc_256_update_vaes_avx512
4781 ; (const struct gcm_key_data *key_data,
4782 ; struct gcm_context_data *context_data,
4785 ; u64 plaintext_len);
4786 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4787 MKGLOBAL(FN_NAME(enc,_update_),function,)
4788 FN_NAME(enc,_update_):
; multi-call update step: args per banner above (key_data, context_data, out, in, len)
4792 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
4799 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4800 ;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
4801 ; aes_gcm_dec_256_update_vaes_avx512
4802 ; (const struct gcm_key_data *key_data,
4803 ; struct gcm_context_data *context_data,
4806 ; u64 plaintext_len);
4807 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4808 MKGLOBAL(FN_NAME(dec,_update_),function,)
4809 FN_NAME(dec,_update_):
; multi-call update step: args per banner above (key_data, context_data, out, in, len)
4813 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
4818 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4819 ;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
4820 ; aes_gcm_enc_256_finalize_vaes_avx512
4821 ; (const struct gcm_key_data *key_data,
4822 ; struct gcm_context_data *context_data,
4824 ; u64 auth_tag_len);
4825 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4826 MKGLOBAL(FN_NAME(enc,_finalize_),function,)
4827 FN_NAME(enc,_finalize_):
4831 %ifidn __OUTPUT_FORMAT__, win64
4832 ; xmm6:xmm15 need to be maintained for Windows
; only the callee-saved subset clobbered by GCM_COMPLETE is saved: xmm6, xmm9, xmm11, xmm14, xmm15
4834 vmovdqu [rsp + 0*16], xmm6
4835 vmovdqu [rsp + 1*16], xmm9
4836 vmovdqu [rsp + 2*16], xmm11
4837 vmovdqu [rsp + 3*16], xmm14
4838 vmovdqu [rsp + 4*16], xmm15
4840 GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
4842 %ifidn __OUTPUT_FORMAT__, win64
4843 vmovdqu xmm15, [rsp + 4*16]
4844 vmovdqu xmm14, [rsp + 3*16]
4845 vmovdqu xmm11, [rsp + 2*16]
4846 vmovdqu xmm9, [rsp + 1*16]
4847 vmovdqu xmm6, [rsp + 0*16]
4855 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4856 ;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512
4857 ; aes_gcm_dec_256_finalize_vaes_avx512
4858 ; (const struct gcm_key_data *key_data,
4859 ; struct gcm_context_data *context_data,
4861 ; u64 auth_tag_len);
4862 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4863 MKGLOBAL(FN_NAME(dec,_finalize_),function,)
4864 FN_NAME(dec,_finalize_):
4868 %ifidn __OUTPUT_FORMAT__, win64
4869 ; xmm6:xmm15 need to be maintained for Windows
; only the callee-saved subset clobbered by GCM_COMPLETE is saved: xmm6, xmm9, xmm11, xmm14, xmm15
4871 vmovdqu [rsp + 0*16], xmm6
4872 vmovdqu [rsp + 1*16], xmm9
4873 vmovdqu [rsp + 2*16], xmm11
4874 vmovdqu [rsp + 3*16], xmm14
4875 vmovdqu [rsp + 4*16], xmm15
4877 GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
4879 %ifidn __OUTPUT_FORMAT__, win64
4880 vmovdqu xmm15, [rsp + 4*16]
4881 vmovdqu xmm14, [rsp + 3*16]
4882 vmovdqu xmm11, [rsp + 2*16]
4883 vmovdqu xmm9, [rsp + 1*16]
4884 vmovdqu xmm6, [rsp + 0*16]
4892 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4893 ;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
4894 ; (const struct gcm_key_data *key_data,
4895 ; struct gcm_context_data *context_data,
4898 ; u64 plaintext_len,
4903 ; u64 auth_tag_len);
4904 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4905 MKGLOBAL(FN_NAME(enc,_),function,)
; single-call encrypt: init (IV/AAD in arg6-arg8), cipher body, then tag (arg9/arg10)
4910 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
4912 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
4914 GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
4920 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4921 ;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
4922 ; (const struct gcm_key_data *key_data,
4923 ; struct gcm_context_data *context_data,
4926 ; u64 plaintext_len,
4931 ; u64 auth_tag_len);
4932 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4933 MKGLOBAL(FN_NAME(dec,_),function,)
; single-call decrypt: init (IV/AAD in arg6-arg8), cipher body, then tag (arg9/arg10)
4938 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
4940 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
4942 GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
4948 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4949 ;JOB_AES_HMAC *aes_gcm_enc_128_submit_vaes_avx512 / aes_gcm_enc_192_submit_vaes_avx512 /
4950 ; aes_gcm_enc_256_submit_vaes_avx512
4951 ; (MB_MGR_GCM_OOO *state, JOB_AES_HMAC *job)
4952 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4953 MKGLOBAL(FN_NAME(enc,_submit_),function,internal)
4954 FN_NAME(enc,_submit_):
; multi-buffer submit (encrypt): returns finished JOB pointer in rax, or NULL
4957 ;; arg1 - [in] state
4958 ;; arg2 - [in] job / [out] index
4959 ;; arg3 - [out] length
4960 GCM_SUBMIT_MB arg1, arg2, arg3, ENC
4965 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4966 ;JOB_AES_HMAC *aes_gcm_enc_128_flush_vaes_avx512 / aes_gcm_enc_192_flush_vaes_avx512 /
4967 ; aes_gcm_enc_256_flush_vaes_avx512
4968 ; (MB_MGR_GCM_OOO *state)
4969 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4970 MKGLOBAL(FN_NAME(enc,_flush_),function,internal)
4971 FN_NAME(enc,_flush_):
; multi-buffer flush (encrypt): drains the shortest outstanding lane
4974 ;; arg1 - [in] state
4975 ;; arg2 - [out] index
4976 ;; arg3 - [out] length
4977 GCM_FLUSH_MB arg1, arg2, arg3
4979 ;; finalize puts returned job into RAX
4981 ;; arg2 - min_lane_idx
4983 GCM_FINALIZE_x4 arg1, arg2, arg3, ENC
4988 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4989 ;JOB_AES_HMAC *aes_gcm_dec_128_submit_vaes_avx512 / aes_gcm_dec_192_submit_vaes_avx512 /
4990 ; aes_gcm_dec_256_submit_vaes_avx512
4991 ; (MB_MGR_GCM_OOO *state, JOB_AES_HMAC *job)
4992 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4993 MKGLOBAL(FN_NAME(dec,_submit_),function,internal)
4994 FN_NAME(dec,_submit_):
; multi-buffer submit (decrypt): returns finished JOB pointer in rax, or NULL
4997 ;; arg1 - [in] state
4998 ;; arg2 - [in] job / [out] index
4999 ;; arg3 - [out] length
5000 GCM_SUBMIT_MB arg1, arg2, arg3, DEC
5005 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5006 ;JOB_AES_HMAC *aes_gcm_dec_128_flush_vaes_avx512 / aes_gcm_dec_192_flush_vaes_avx512 /
5007 ; aes_gcm_dec_256_flush_vaes_avx512
5008 ; (MB_MGR_GCM_OOO *state)
5009 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5010 MKGLOBAL(FN_NAME(dec,_flush_),function,internal)
5011 FN_NAME(dec,_flush_):
; multi-buffer flush (decrypt): drains the shortest outstanding lane
5014 ;; arg1 - [in] state
5015 ;; arg2 - [out] index
5016 ;; arg3 - [out] length
5017 GCM_FLUSH_MB arg1, arg2, arg3
5019 ;; finalize puts returned job into RAX
5021 ;; arg2 - min_lane_idx
5023 GCM_FINALIZE_x4 arg1, arg2, arg3, DEC
5030 section .note.GNU-stack noalloc noexec nowrite progbits