1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2018, Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 ;
31 ; Authors:
32 ; Erdinc Ozturk
33 ; Vinodh Gopal
34 ; James Guilford
35 ; Tomasz Kantecki
36 ;
37 ;
38 ; References:
39 ; This code was derived and highly optimized from the code described in the paper:
40 ; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
41 ; The details of the implementation are explained in:
42 ; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
43 ;
44 ;
45 ;
46 ;
47 ; Assumptions:
48 ;
49 ;
50 ;
51 ; iv:
52 ; 0 1 2 3
53 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
54 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
55 ; | Salt (From the SA) |
56 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
57 ; | Initialization Vector |
58 ; | (This is the sequence number from IPSec header) |
59 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
60 ; | 0x1 |
61 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
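;
; Note (illustrative): for this 96-bit IV layout the 16-byte counter block is
; J0 = Salt || IV || 0x00000001, i.e. the 32-bit block counter starts at 1
; (RFC 4106 / NIST SP 800-38D). A hypothetical 4-byte salt {01,02,03,04} and
; 8-byte IV {11,12,13,14,15,16,17,18} would give the counter block bytes:
;   01 02 03 04 11 12 13 14 15 16 17 18 00 00 00 01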
62 ;
63 ;
64 ;
65 ; AAD:
66 ; AAD will be padded with 0 to the next 16-byte multiple
67 ; for example, assume AAD is a u32 vector
68 ;
69 ; if AAD is 8 bytes:
70 ; AAD[3] = {A0, A1};
71 ; padded AAD in xmm register = {A1 A0 0 0}
72 ;
73 ; 0 1 2 3
74 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
75 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
76 ; | SPI (A1) |
77 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
78 ; | 32-bit Sequence Number (A0) |
79 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
80 ; | 0x0 |
81 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82 ;
83 ; AAD Format with 32-bit Sequence Number
84 ;
85 ; if AAD is 12 bytes:
86 ; AAD[3] = {A0, A1, A2};
87 ; padded AAD in xmm register = {A2 A1 A0 0}
88 ;
89 ; 0 1 2 3
90 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
91 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92 ; | SPI (A2) |
93 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
94 ; | 64-bit Extended Sequence Number {A1,A0} |
95 ; | |
96 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
97 ; | 0x0 |
98 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ;
100 ; AAD Format with 64-bit Extended Sequence Number
101 ;
102 ;
103 ; aadLen:
104 ; The spec defines aadLen as a multiple of 4 bytes.
105 ; The code additionally supports AAD of any length.
106 ;
107 ; TLen:
108 ; From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
109 ;
110 ; poly = x^128 + x^127 + x^126 + x^121 + 1
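; (equivalently, this is the bit-reflected form of the GCM field polynomial
;  g(x) = x^128 + x^7 + x^2 + x + 1, obtained by mapping every exponent e to 128-e)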
111 ; Throughout the code, one-tab and two-tab indentation is used: one tab for the GHASH part, two tabs for the AES part.
112 ;
113
114 %include "os.asm"
115 %include "reg_sizes.asm"
116 %include "gcm_defines.asm"
117 %include "mb_mgr_datastruct.asm"
118 %include "job_aes_hmac.asm"
119 %include "memcpy.asm"
120
121 %ifndef GCM128_MODE
122 %ifndef GCM192_MODE
123 %ifndef GCM256_MODE
124 %error "No GCM mode selected for gcm_vaes_avx512.asm!"
125 %endif
126 %endif
127 %endif
128
129 ;; Decide on AES-GCM key size to compile for
130 %ifdef GCM128_MODE
131 %define NROUNDS 9
132 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512
133 %endif
134
135 %ifdef GCM192_MODE
136 %define NROUNDS 11
137 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512
138 %endif
139
140 %ifdef GCM256_MODE
141 %define NROUNDS 13
142 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512
143 %endif
144
145 section .text
146 default rel
147
148 ; need to push 4 registers into stack to maintain
149 %define STACK_OFFSET 8*4
150
151 %ifidn __OUTPUT_FORMAT__, win64
152 %define XMM_STORAGE 16*10
153 %else
154 %define XMM_STORAGE 0
155 %endif
156
157 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
158 %define TMP3 16*1 ; Temporary storage for AES State 3
159 %define TMP4 16*2 ; Temporary storage for AES State 4
160 %define TMP5 16*3 ; Temporary storage for AES State 5
161 %define TMP6 16*4 ; Temporary storage for AES State 6
162 %define TMP7 16*5 ; Temporary storage for AES State 7
163 %define TMP8 16*6 ; Temporary storage for AES State 8
164 %define LOCAL_STORAGE 16*7
165 %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
166
167 %define LOCAL_STORAGE_AVX512 2*8 ; temporary storage
168 %define STACK_SIZE_GP_AVX512 10*8 ; up to 10 GP registers (5 GP + 3 reserve places for the algorithmic code)
169 %define STACK_OFFSET_AVX512 (LOCAL_STORAGE_AVX512 + XMM_STORAGE)
170 %define VARIABLE_OFFSET_AVX512 (LOCAL_STORAGE_AVX512 + XMM_STORAGE + STACK_SIZE_GP_AVX512)
171
172 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
173 ; Utility Macros
174 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
175
176 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
177 ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
178 ; Input: A and B (128-bits each, bit-reflected)
179 ; Output: C = A*B*x mod poly (i.e. >>1)
180 ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
181 ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
183 %macro GHASH_MUL 7
184 %define %%GH %1 ; 16 Bytes
185 %define %%HK %2 ; 16 Bytes
186 %define %%T1 %3
187 %define %%T2 %4
188 %define %%T3 %5
189 %define %%T4 %6
190 %define %%T5 %7
191 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
192
193 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
194 vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
195 vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
196 vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
197 vpxor %%GH, %%GH, %%T3
198
199
200 vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
201 vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
202
203 vpxor %%T1, %%T1, %%T3
204 vpxor %%GH, %%GH, %%T2
205
206 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
207 ;first phase of the reduction
208 vmovdqu %%T3, [rel POLY2]
209
210 vpclmulqdq %%T2, %%T3, %%GH, 0x01
211 vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
212
213 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
214 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
215 ;second phase of the reduction
216 vpclmulqdq %%T2, %%T3, %%GH, 0x00
217 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
218
219 vpclmulqdq %%GH, %%T3, %%GH, 0x10
220 vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
221
222 vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
223 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
224 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
225 %endmacro
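
;; Usage sketch for GHASH_MUL (illustrative, kept as a comment; the register
;; assignment below is arbitrary): with the running hash in xmm0 and
;; HashKey<<1 mod poly in xmm1, the call below leaves GH*HashKey mod poly in
;; xmm0 and clobbers xmm2-xmm6.
;;
;;      GHASH_MUL       xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6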
226
227
228 ; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
229 ; functions, but are kept to allow users to switch cpu architectures between calls
230 ; of pre, init, update, and finalize.
231 %macro PRECOMPUTE 8
232 %define %%GDATA %1
233 %define %%HK %2
234 %define %%T1 %3
235 %define %%T2 %4
236 %define %%T3 %5
237 %define %%T4 %6
238 %define %%T5 %7
239 %define %%T6 %8
240
241 ; Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
242 vmovdqa %%T5, %%HK
243
244 vpshufd %%T1, %%T5, 01001110b
245 vpxor %%T1, %%T5
246 vmovdqu [%%GDATA + HashKey_k], %%T1
247
248 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
249 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
250 vpshufd %%T1, %%T5, 01001110b
251 vpxor %%T1, %%T5
252 vmovdqu [%%GDATA + HashKey_2_k], %%T1
253
254 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
255 vmovdqu [%%GDATA + HashKey_3], %%T5
256 vpshufd %%T1, %%T5, 01001110b
257 vpxor %%T1, %%T5
258 vmovdqu [%%GDATA + HashKey_3_k], %%T1
259
260 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
261 vmovdqu [%%GDATA + HashKey_4], %%T5
262 vpshufd %%T1, %%T5, 01001110b
263 vpxor %%T1, %%T5
264 vmovdqu [%%GDATA + HashKey_4_k], %%T1
265
266 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
267 vmovdqu [%%GDATA + HashKey_5], %%T5
268 vpshufd %%T1, %%T5, 01001110b
269 vpxor %%T1, %%T5
270 vmovdqu [%%GDATA + HashKey_5_k], %%T1
271
272 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
273 vmovdqu [%%GDATA + HashKey_6], %%T5
274 vpshufd %%T1, %%T5, 01001110b
275 vpxor %%T1, %%T5
276 vmovdqu [%%GDATA + HashKey_6_k], %%T1
277
278 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
279 vmovdqu [%%GDATA + HashKey_7], %%T5
280 vpshufd %%T1, %%T5, 01001110b
281 vpxor %%T1, %%T5
282 vmovdqu [%%GDATA + HashKey_7_k], %%T1
283
284 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
285 vmovdqu [%%GDATA + HashKey_8], %%T5
286 vpshufd %%T1, %%T5, 01001110b
287 vpxor %%T1, %%T5
288 vmovdqu [%%GDATA + HashKey_8_k], %%T1
289 %endmacro
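
;; Usage sketch for PRECOMPUTE (illustrative, kept as a comment; the register
;; assignment is arbitrary): with a gcm_key_data pointer in rax and
;; HashKey<<1 mod poly already loaded in xmm6, the call below fills
;; HashKey_2..HashKey_8 and the HashKey_i_k entries, clobbering xmm0-xmm5.
;;
;;      PRECOMPUTE      rax, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5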
290
291
292 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293 ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
294 ; Returns 0 if data has length 0.
295 ; Input: The input data (INPUT), that data's length (LENGTH).
296 ; Output: The packed xmm register (OUTPUT).
297 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
298 %macro READ_SMALL_DATA_INPUT 4
299 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register
300 %define %%INPUT %2
301 %define %%LENGTH %3
302 %define %%TMP1 %4
303
304 lea %%TMP1, [rel byte_len_to_mask_table]
305 %ifidn __OUTPUT_FORMAT__, win64
306 add %%TMP1, %%LENGTH
307 add %%TMP1, %%LENGTH
308 kmovw k1, [%%TMP1]
309 %else
310 kmovw k1, [%%TMP1 + %%LENGTH*2]
311 %endif
312 vmovdqu8 XWORD(%%OUTPUT){k1}{z}, [%%INPUT]
313
314 %endmacro ; READ_SMALL_DATA_INPUT
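
;; Usage sketch for READ_SMALL_DATA_INPUT (illustrative, kept as a comment;
;; the register assignment is arbitrary): read r13 bytes (r13 < 16) from the
;; buffer at r10 into xmm2, zeroing the remaining bytes. rax is used as a
;; scratch register and mask register k1 is clobbered.
;;
;;      READ_SMALL_DATA_INPUT   xmm2, r10, r13, rax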
315
316
317 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
318 ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
319 ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
320 ; Output: The hash of the data (AAD_HASH).
321 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
322 %macro CALC_AAD_HASH 13
323 %define %%A_IN %1
324 %define %%A_LEN %2
325 %define %%AAD_HASH %3
326 %define %%GDATA_KEY %4
327 %define %%XTMP0 %5 ; xmm temp reg 0
328 %define %%XTMP1 %6 ; xmm temp reg 1
329 %define %%XTMP2 %7
330 %define %%XTMP3 %8
331 %define %%XTMP4 %9
332 %define %%XTMP5 %10 ; xmm temp reg 5
333 %define %%T1 %11 ; temp reg 1
334 %define %%T2 %12
335 %define %%T3 %13
336
337
338 mov %%T1, %%A_IN ; T1 = AAD
339 mov %%T2, %%A_LEN ; T2 = aadLen
340 vpxor %%AAD_HASH, %%AAD_HASH
341
342 %%_get_AAD_loop128:
343 cmp %%T2, 128
344 jl %%_exit_AAD_loop128
345
346 vmovdqu %%XTMP0, [%%T1 + 16*0]
347 vpshufb %%XTMP0, [rel SHUF_MASK]
348
349 vpxor %%XTMP0, %%AAD_HASH
350
351 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
352 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
353 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
354 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
355 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
356 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
357
358 %assign i 1
359 %assign j 7
360 %rep 7
361 vmovdqu %%XTMP0, [%%T1 + 16*i]
362 vpshufb %%XTMP0, [rel SHUF_MASK]
363
364 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
365 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
366 vpxor %%XTMP1, %%XTMP1, %%XTMP4
367
368 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
369 vpxor %%XTMP2, %%XTMP2, %%XTMP4
370
371 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
372 vpxor %%XTMP3, %%XTMP3, %%XTMP4
373 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
374 vpxor %%XTMP3, %%XTMP3, %%XTMP4
375 %assign i (i + 1)
376 %assign j (j - 1)
377 %endrep
378
379 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
380 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
381 vpxor %%XTMP2, %%XTMP2, %%XTMP4
382 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
383
384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 ;first phase of the reduction
386 vmovdqa %%XTMP5, [rel POLY2]
387 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
388 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
389 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
390
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
392 ;second phase of the reduction
393 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
394 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
395
396 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
397 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
398
399 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
400 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
401 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
402
403 sub %%T2, 128
404 je %%_CALC_AAD_done
405
406 add %%T1, 128
407 jmp %%_get_AAD_loop128
408
409 %%_exit_AAD_loop128:
410 cmp %%T2, 16
411 jl %%_get_small_AAD_block
412
413 ;; calculate hash_key position to start with
414 mov %%T3, %%T2
415 and %%T3, -16 ; 1 to 7 blocks possible here
416 neg %%T3
417 add %%T3, HashKey_1 + 16
418 lea %%T3, [%%GDATA_KEY + %%T3]
419
420 vmovdqu %%XTMP0, [%%T1]
421 vpshufb %%XTMP0, [rel SHUF_MASK]
422
423 vpxor %%XTMP0, %%AAD_HASH
424
425 vmovdqu %%XTMP5, [%%T3]
426 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
427 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
428 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
429 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
430 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
431
432 add %%T3, 16 ; move to next hashkey
433 add %%T1, 16 ; move to next data block
434 sub %%T2, 16
435 cmp %%T2, 16
436 jl %%_AAD_reduce
437
438 %%_AAD_blocks:
439 vmovdqu %%XTMP0, [%%T1]
440 vpshufb %%XTMP0, [rel SHUF_MASK]
441
442 vmovdqu %%XTMP5, [%%T3]
443 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
444 vpxor %%XTMP1, %%XTMP1, %%XTMP4
445
446 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
447 vpxor %%XTMP2, %%XTMP2, %%XTMP4
448
449 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
450 vpxor %%XTMP3, %%XTMP3, %%XTMP4
451 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
452 vpxor %%XTMP3, %%XTMP3, %%XTMP4
453
454 add %%T3, 16 ; move to next hashkey
455 add %%T1, 16
456 sub %%T2, 16
457 cmp %%T2, 16
458 jl %%_AAD_reduce
459 jmp %%_AAD_blocks
460
461 %%_AAD_reduce:
462 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
463 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
464 vpxor %%XTMP2, %%XTMP2, %%XTMP4
465 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
466
467 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
468 ;first phase of the reduction
469 vmovdqa %%XTMP5, [rel POLY2]
470 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
471 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
472 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
473
474 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
475 ;second phase of the reduction
476 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
477 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
478
479 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
480 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
481
482 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
483 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
484 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
485
486 or %%T2, %%T2
487 je %%_CALC_AAD_done
488
489 %%_get_small_AAD_block:
490 vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
491 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3
492 ;byte-reflect the AAD data
493 vpshufb %%XTMP1, [rel SHUF_MASK]
494 vpxor %%AAD_HASH, %%XTMP1
495 GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
496
497 %%_CALC_AAD_done:
498
499 %endmacro ; CALC_AAD_HASH
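
;; Usage sketch for CALC_AAD_HASH (illustrative, kept as a comment; the register
;; assignment is arbitrary): hash r11 bytes of AAD at r10 with the hash keys of
;; the gcm_key_data structure pointed to by rax, leaving the result in xmm14.
;; xmm0-xmm5, r12, r13, r15 and mask register k1 are clobbered.
;;
;;      CALC_AAD_HASH   r10, r11, xmm14, rax, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, r12, r13, r15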
500
501 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
502 ; PARTIAL_BLOCK: Handles the encryption/decryption and GHASH of partial blocks carried between update calls.
503 ; Requires the input data to be at least 1 byte long.
504 ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
505 ; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
506 ; and whether encoding or decoding (ENC_DEC)
507 ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
508 ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
509 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
510 %macro PARTIAL_BLOCK 8
511 %define %%GDATA_KEY %1
512 %define %%GDATA_CTX %2
513 %define %%CYPH_PLAIN_OUT %3
514 %define %%PLAIN_CYPH_IN %4
515 %define %%PLAIN_CYPH_LEN %5
516 %define %%DATA_OFFSET %6
517 %define %%AAD_HASH %7
518 %define %%ENC_DEC %8
519
520 mov r13, [%%GDATA_CTX + PBlockLen]
521 cmp r13, 0
522 je %%_partial_block_done ;Leave Macro if no partial blocks
523
524 cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
525 jl %%_fewer_than_16_bytes
526 VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
527 jmp %%_data_read
528
529 %%_fewer_than_16_bytes:
530 lea r10, [%%PLAIN_CYPH_IN]
531 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax
532
533 %%_data_read: ;Finished reading in data
534
535 vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
536 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
537
538 lea r12, [rel SHIFT_MASK]
539
540 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
541 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
542 vpshufb xmm9, xmm2 ;shift right r13 bytes
543
544 %ifidn %%ENC_DEC, DEC
545 vmovdqa xmm3, xmm1
546 %endif
547 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
548
549 mov r15, %%PLAIN_CYPH_LEN
550 add r15, r13
551 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
552 jge %%_no_extra_mask ;Determine if partial block is not being filled and shift mask accordingly
553 sub r12, r15
554 %%_no_extra_mask:
555
556 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
557 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
558
559 %ifidn %%ENC_DEC, DEC
560 vpand xmm3, xmm1
561 vpshufb xmm3, [rel SHUF_MASK]
562 vpshufb xmm3, xmm2
563 vpxor %%AAD_HASH, xmm3
564 %else
565 vpshufb xmm9, [rel SHUF_MASK]
566 vpshufb xmm9, xmm2
567 vpxor %%AAD_HASH, xmm9
568 %endif
569 cmp r15,0
570 jl %%_partial_incomplete
571
572 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
573 xor rax,rax
574 mov [%%GDATA_CTX + PBlockLen], rax
575 jmp %%_enc_dec_done
576 %%_partial_incomplete:
577 %ifidn __OUTPUT_FORMAT__, win64
578 mov rax, %%PLAIN_CYPH_LEN
579 add [%%GDATA_CTX + PBlockLen], rax
580 %else
581 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
582 %endif
583 %%_enc_dec_done:
584 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
585
586 %ifidn %%ENC_DEC, ENC
587 vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
588 vpshufb xmm9, xmm2
589 %endif
590
591 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
592 ; output encrypted Bytes
593 cmp r15,0
594 jl %%_partial_fill
595 mov r12, r13
596 mov r13, 16
597 sub r13, r12 ; Set r13 to be the number of bytes to write out
598 jmp %%_count_set
599 %%_partial_fill:
600 mov r13, %%PLAIN_CYPH_LEN
601 %%_count_set:
602 lea rax, [rel byte_len_to_mask_table]
603 kmovw k1, [rax + r13*2]
604 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9
605 add %%DATA_OFFSET, r13
606 %%_partial_block_done:
607 %endmacro ; PARTIAL_BLOCK
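
;; Usage sketch for PARTIAL_BLOCK (illustrative, kept as a comment; the argument
;; assignment is arbitrary and assumes the arg1-arg5 aliases from gcm_defines.asm
;; hold key data, context, output, input and length): the call below consumes any
;; partial block left by a previous update call, advances the offset in r11 and
;; folds the data into the AAD hash in xmm8. See the clobber list above.
;;
;;      PARTIAL_BLOCK   arg1, arg2, arg3, arg4, arg5, r11, xmm8, ENC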
608
609
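;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_SINGLE_MUL: multiplies one bit-reflected ciphertext block (CIPHER) by the
; hash key at [GDATA + HASHKEY] and accumulates the partial products into
; STATE_11 (a1*b1), STATE_00 (a0*b0) and STATE_MID (a1*b0 + a0*b1).
; When FIRST is 'first' the accumulators are initialized rather than added to.
; The polynomial reduction is left to the caller. T1 and T2 are clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;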
610 %macro GHASH_SINGLE_MUL 9
611 %define %%GDATA %1
612 %define %%HASHKEY %2
613 %define %%CIPHER %3
614 %define %%STATE_11 %4
615 %define %%STATE_00 %5
616 %define %%STATE_MID %6
617 %define %%T1 %7
618 %define %%T2 %8
619 %define %%FIRST %9
620
621 vmovdqu %%T1, [%%GDATA + %%HASHKEY]
622 %ifidn %%FIRST, first
623 vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
624 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
625 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
626 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
627 vpxor %%STATE_MID, %%STATE_MID, %%T2
628 %else
629 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
630 vpxor %%STATE_11, %%STATE_11, %%T2
631
632 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
633 vpxor %%STATE_00, %%STATE_00, %%T2
634
635 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
636 vpxor %%STATE_MID, %%STATE_MID, %%T2
637
638 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
639 vpxor %%STATE_MID, %%STATE_MID, %%T2
640 %endif
641
642 %endmacro
643
644 ; if a = number of total plaintext bytes
645 ; b = floor(a/16)
646 ; %%num_initial_blocks = b mod 8;
647 ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
648 ; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
649 ; Updated AAD_HASH is returned in %%T3
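; Worked example (illustrative): for a = 200 plaintext bytes, b = floor(200/16) = 12,
; so %%num_initial_blocks = 12 mod 8 = 4; this macro encrypts and hashes those 4 blocks
; first, and the rest of the message is handled by the 8-blocks-at-a-time main loop and
; the final partial-block code.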
650
651 %macro INITIAL_BLOCKS 23
652 %define %%GDATA_KEY %1
653 %define %%CYPH_PLAIN_OUT %2
654 %define %%PLAIN_CYPH_IN %3
655 %define %%LENGTH %4
656 %define %%DATA_OFFSET %5
657 %define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
658 %define %%T1 %7
659 %define %%T2 %8
660 %define %%T3 %9
661 %define %%T4 %10
662 %define %%T5 %11
663 %define %%CTR %12
664 %define %%XMM1 %13
665 %define %%XMM2 %14
666 %define %%XMM3 %15
667 %define %%XMM4 %16
668 %define %%XMM5 %17
669 %define %%XMM6 %18
670 %define %%XMM7 %19
671 %define %%XMM8 %20
672 %define %%T6 %21
673 %define %%T_key %22
674 %define %%ENC_DEC %23
675
676 %assign i (8-%%num_initial_blocks)
677 ;; Move AAD_HASH to temp reg
678 vmovdqu %%T2, %%XMM8
679 ;; Start AES for %%num_initial_blocks blocks
680 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
681
682 %assign i (9-%%num_initial_blocks)
683 %rep %%num_initial_blocks
684 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
685 vmovdqa reg(i), %%CTR
686 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
687 %assign i (i+1)
688 %endrep
689
690 %if(%%num_initial_blocks>0)
691 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
692 %assign i (9-%%num_initial_blocks)
693 %rep %%num_initial_blocks
694 vpxor reg(i),reg(i),%%T_key
695 %assign i (i+1)
696 %endrep
697
698 %assign j 1
699 %rep NROUNDS
700 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
701 %assign i (9-%%num_initial_blocks)
702 %rep %%num_initial_blocks
703 vaesenc reg(i),%%T_key
704 %assign i (i+1)
705 %endrep
706
707 %assign j (j+1)
708 %endrep
709
710
711 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
712 %assign i (9-%%num_initial_blocks)
713 %rep %%num_initial_blocks
714 vaesenclast reg(i),%%T_key
715 %assign i (i+1)
716 %endrep
717
718 %endif ; %if(%%num_initial_blocks>0)
719
720
721
722 %assign i (9-%%num_initial_blocks)
723 %rep %%num_initial_blocks
724 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
725 vpxor reg(i), reg(i), %%T1
726 ;; Write back ciphertext for %%num_initial_blocks blocks
727 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
728 add %%DATA_OFFSET, 16
729 %ifidn %%ENC_DEC, DEC
730 vmovdqa reg(i), %%T1
731 %endif
732 ;; Prepare ciphertext for GHASH computations
733 vpshufb reg(i), [rel SHUF_MASK]
734 %assign i (i+1)
735 %endrep
736
737
738 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
739
740 %assign i (9-%%num_initial_blocks)
741 %if(%%num_initial_blocks>0)
742 vmovdqa %%T3, reg(i)
743 %assign i (i+1)
744 %endif
745 %if %%num_initial_blocks>1
746 %rep %%num_initial_blocks-1
747 vmovdqu [rsp + TMP %+ i], reg(i)
748 %assign i (i+1)
749 %endrep
750 %endif
751 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
752 ;; Hashkey_i_k holds XORed values of the low and high parts of
753 ;; the Hashkey_i
754 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
755 vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
756 vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
757 vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
758 vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
759 vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
760 vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
761 vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
762 vmovdqa %%CTR, %%XMM8
763
764 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
765 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
766 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
767 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
768 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
769 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
770 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
771 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
772
773 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
774 vpxor %%XMM1, %%XMM1, %%T_key
775 vpxor %%XMM2, %%XMM2, %%T_key
776 vpxor %%XMM3, %%XMM3, %%T_key
777 vpxor %%XMM4, %%XMM4, %%T_key
778 vpxor %%XMM5, %%XMM5, %%T_key
779 vpxor %%XMM6, %%XMM6, %%T_key
780 vpxor %%XMM7, %%XMM7, %%T_key
781 vpxor %%XMM8, %%XMM8, %%T_key
782
783 %assign i (8-%%num_initial_blocks)
784 %assign j (9-%%num_initial_blocks)
785 %assign k (%%num_initial_blocks)
786
787 %define %%T4_2 %%T4
788 %if(%%num_initial_blocks>0)
789 ;; Hash in AES state
790 ;; T2 - incoming AAD hash
791 vpxor %%T2, %%T3
792
793 ;; GDATA, HASHKEY, CIPHER,
794 ;; STATE_11, STATE_00, STATE_MID, T1, T2
795 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
796 %%T1, %%T4, %%T6, %%T5, %%T3, first
797 %endif
798
799 vmovdqu %%T_key, [%%GDATA_KEY+16*1]
800 vaesenc %%XMM1, %%T_key
801 vaesenc %%XMM2, %%T_key
802 vaesenc %%XMM3, %%T_key
803 vaesenc %%XMM4, %%T_key
804 vaesenc %%XMM5, %%T_key
805 vaesenc %%XMM6, %%T_key
806 vaesenc %%XMM7, %%T_key
807 vaesenc %%XMM8, %%T_key
808
809 vmovdqu %%T_key, [%%GDATA_KEY+16*2]
810 vaesenc %%XMM1, %%T_key
811 vaesenc %%XMM2, %%T_key
812 vaesenc %%XMM3, %%T_key
813 vaesenc %%XMM4, %%T_key
814 vaesenc %%XMM5, %%T_key
815 vaesenc %%XMM6, %%T_key
816 vaesenc %%XMM7, %%T_key
817 vaesenc %%XMM8, %%T_key
818
819 %assign i (i+1)
820 %assign j (j+1)
821 %assign k (k-1)
822 %if(%%num_initial_blocks>1)
823 ;; GDATA, HASHKEY, CIPHER,
824 ;; STATE_11, STATE_00, STATE_MID, T1, T2
825 vmovdqu %%T2, [rsp + TMP %+ j]
826 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
827 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
828 %endif
829
830 vmovdqu %%T_key, [%%GDATA_KEY+16*3]
831 vaesenc %%XMM1, %%T_key
832 vaesenc %%XMM2, %%T_key
833 vaesenc %%XMM3, %%T_key
834 vaesenc %%XMM4, %%T_key
835 vaesenc %%XMM5, %%T_key
836 vaesenc %%XMM6, %%T_key
837 vaesenc %%XMM7, %%T_key
838 vaesenc %%XMM8, %%T_key
839
840 vmovdqu %%T_key, [%%GDATA_KEY+16*4]
841 vaesenc %%XMM1, %%T_key
842 vaesenc %%XMM2, %%T_key
843 vaesenc %%XMM3, %%T_key
844 vaesenc %%XMM4, %%T_key
845 vaesenc %%XMM5, %%T_key
846 vaesenc %%XMM6, %%T_key
847 vaesenc %%XMM7, %%T_key
848 vaesenc %%XMM8, %%T_key
849
850 %assign i (i+1)
851 %assign j (j+1)
852 %assign k (k-1)
853 %if(%%num_initial_blocks>2)
854 ;; GDATA, HASHKEY, CIPHER,
855 ;; STATE_11, STATE_00, STATE_MID, T1, T2
856 vmovdqu %%T2, [rsp + TMP %+ j]
857 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
858 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
859 %endif
860
861 %assign i (i+1)
862 %assign j (j+1)
863 %assign k (k-1)
864 %if(%%num_initial_blocks>3)
865 ;; GDATA, HASHKEY, CIPHER,
866 ;; STATE_11, STATE_00, STATE_MID, T1, T2
867 vmovdqu %%T2, [rsp + TMP %+ j]
868 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
869 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
870 %endif
871
872 vmovdqu %%T_key, [%%GDATA_KEY+16*5]
873 vaesenc %%XMM1, %%T_key
874 vaesenc %%XMM2, %%T_key
875 vaesenc %%XMM3, %%T_key
876 vaesenc %%XMM4, %%T_key
877 vaesenc %%XMM5, %%T_key
878 vaesenc %%XMM6, %%T_key
879 vaesenc %%XMM7, %%T_key
880 vaesenc %%XMM8, %%T_key
881
882 vmovdqu %%T_key, [%%GDATA_KEY+16*6]
883 vaesenc %%XMM1, %%T_key
884 vaesenc %%XMM2, %%T_key
885 vaesenc %%XMM3, %%T_key
886 vaesenc %%XMM4, %%T_key
887 vaesenc %%XMM5, %%T_key
888 vaesenc %%XMM6, %%T_key
889 vaesenc %%XMM7, %%T_key
890 vaesenc %%XMM8, %%T_key
891
892 %assign i (i+1)
893 %assign j (j+1)
894 %assign k (k-1)
895 %if(%%num_initial_blocks>4)
896 ;; GDATA, HASHKEY, CIPHER,
897 ;; STATE_11, STATE_00, STATE_MID, T1, T2
898 vmovdqu %%T2, [rsp + TMP %+ j]
899 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
900 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
901 %endif
902
903 vmovdqu %%T_key, [%%GDATA_KEY+16*7]
904 vaesenc %%XMM1, %%T_key
905 vaesenc %%XMM2, %%T_key
906 vaesenc %%XMM3, %%T_key
907 vaesenc %%XMM4, %%T_key
908 vaesenc %%XMM5, %%T_key
909 vaesenc %%XMM6, %%T_key
910 vaesenc %%XMM7, %%T_key
911 vaesenc %%XMM8, %%T_key
912
913 vmovdqu %%T_key, [%%GDATA_KEY+16*8]
914 vaesenc %%XMM1, %%T_key
915 vaesenc %%XMM2, %%T_key
916 vaesenc %%XMM3, %%T_key
917 vaesenc %%XMM4, %%T_key
918 vaesenc %%XMM5, %%T_key
919 vaesenc %%XMM6, %%T_key
920 vaesenc %%XMM7, %%T_key
921 vaesenc %%XMM8, %%T_key
922
923 %assign i (i+1)
924 %assign j (j+1)
925 %assign k (k-1)
926 %if(%%num_initial_blocks>5)
927 ;; GDATA, HASHKEY, CIPHER,
928 ;; STATE_11, STATE_00, STATE_MID, T1, T2
929 vmovdqu %%T2, [rsp + TMP %+ j]
930 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
931 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
932 %endif
933
934 vmovdqu %%T_key, [%%GDATA_KEY+16*9]
935 vaesenc %%XMM1, %%T_key
936 vaesenc %%XMM2, %%T_key
937 vaesenc %%XMM3, %%T_key
938 vaesenc %%XMM4, %%T_key
939 vaesenc %%XMM5, %%T_key
940 vaesenc %%XMM6, %%T_key
941 vaesenc %%XMM7, %%T_key
942 vaesenc %%XMM8, %%T_key
943
944 %ifndef GCM128_MODE
945 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
946 vaesenc %%XMM1, %%T_key
947 vaesenc %%XMM2, %%T_key
948 vaesenc %%XMM3, %%T_key
949 vaesenc %%XMM4, %%T_key
950 vaesenc %%XMM5, %%T_key
951 vaesenc %%XMM6, %%T_key
952 vaesenc %%XMM7, %%T_key
953 vaesenc %%XMM8, %%T_key
954 %endif
955
956 %assign i (i+1)
957 %assign j (j+1)
958 %assign k (k-1)
959 %if(%%num_initial_blocks>6)
960 ;; GDATA, HASHKEY, CIPHER,
961 ;; STATE_11, STATE_00, STATE_MID, T1, T2
962 vmovdqu %%T2, [rsp + TMP %+ j]
963 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
964 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
965 %endif
966
967 %ifdef GCM128_MODE
968 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
969 vaesenclast %%XMM1, %%T_key
970 vaesenclast %%XMM2, %%T_key
971 vaesenclast %%XMM3, %%T_key
972 vaesenclast %%XMM4, %%T_key
973 vaesenclast %%XMM5, %%T_key
974 vaesenclast %%XMM6, %%T_key
975 vaesenclast %%XMM7, %%T_key
976 vaesenclast %%XMM8, %%T_key
977 %endif
978
979 %ifdef GCM192_MODE
980 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
981 vaesenc %%XMM1, %%T_key
982 vaesenc %%XMM2, %%T_key
983 vaesenc %%XMM3, %%T_key
984 vaesenc %%XMM4, %%T_key
985 vaesenc %%XMM5, %%T_key
986 vaesenc %%XMM6, %%T_key
987 vaesenc %%XMM7, %%T_key
988 vaesenc %%XMM8, %%T_key
989
990 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
991 vaesenclast %%XMM1, %%T_key
992 vaesenclast %%XMM2, %%T_key
993 vaesenclast %%XMM3, %%T_key
994 vaesenclast %%XMM4, %%T_key
995 vaesenclast %%XMM5, %%T_key
996 vaesenclast %%XMM6, %%T_key
997 vaesenclast %%XMM7, %%T_key
998 vaesenclast %%XMM8, %%T_key
999 %endif
1000 %ifdef GCM256_MODE
1001 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
1002 vaesenc %%XMM1, %%T_key
1003 vaesenc %%XMM2, %%T_key
1004 vaesenc %%XMM3, %%T_key
1005 vaesenc %%XMM4, %%T_key
1006 vaesenc %%XMM5, %%T_key
1007 vaesenc %%XMM6, %%T_key
1008 vaesenc %%XMM7, %%T_key
1009 vaesenc %%XMM8, %%T_key
1010
1011 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
1012 vaesenc %%XMM1, %%T_key
1013 vaesenc %%XMM2, %%T_key
1014 vaesenc %%XMM3, %%T_key
1015 vaesenc %%XMM4, %%T_key
1016 vaesenc %%XMM5, %%T_key
1017 vaesenc %%XMM6, %%T_key
1018 vaesenc %%XMM7, %%T_key
1019 vaesenc %%XMM8, %%T_key
1020 %endif
1021
1022 %assign i (i+1)
1023 %assign j (j+1)
1024 %assign k (k-1)
1025 %if(%%num_initial_blocks>7)
1026 ;; GDATA, HASHKEY, CIPHER,
1027 ;; STATE_11, STATE_00, STATE_MID, T1, T2
1028 vmovdqu %%T2, [rsp + TMP %+ j]
1029 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1030 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
1031 %endif
1032
1033 %ifdef GCM256_MODE ; GCM256
1034 vmovdqu %%T_key, [%%GDATA_KEY+16*13]
1035 vaesenc %%XMM1, %%T_key
1036 vaesenc %%XMM2, %%T_key
1037 vaesenc %%XMM3, %%T_key
1038 vaesenc %%XMM4, %%T_key
1039 vaesenc %%XMM5, %%T_key
1040 vaesenc %%XMM6, %%T_key
1041 vaesenc %%XMM7, %%T_key
1042 vaesenc %%XMM8, %%T_key
1043
1044 vmovdqu %%T_key, [%%GDATA_KEY+16*14]
1045 vaesenclast %%XMM1, %%T_key
1046 vaesenclast %%XMM2, %%T_key
1047 vaesenclast %%XMM3, %%T_key
1048 vaesenclast %%XMM4, %%T_key
1049 vaesenclast %%XMM5, %%T_key
1050 vaesenclast %%XMM6, %%T_key
1051 vaesenclast %%XMM7, %%T_key
1052 vaesenclast %%XMM8, %%T_key
1053 %endif ; GCM256 mode
1054
1055 %if(%%num_initial_blocks>0)
1056 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
1057 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
1058 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1059 vpxor %%T4, %%T6, %%T4
1060
1061 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1062 ; First phase of the reduction
1063 vmovdqu %%T3, [rel POLY2]
1064
1065 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1066 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
1067
1068 ;; First phase of the reduction complete
1069 vpxor %%T4, %%T4, %%T2
1070
1071 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1072 ; Second phase of the reduction
1073 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1074 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1075 vpsrldq %%T2, %%T2, 4
1076
1077 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1078 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1079 vpslldq %%T4, %%T4, 4
1080 ;; Second phase of the reduction complete
1081 vpxor %%T4, %%T4, %%T2
1082 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1083 ; The result is in %%T3
1084 vpxor %%T3, %%T1, %%T4
1085 %else
1086 ;; The hash should end up in T3
1087 vmovdqa %%T3, %%T2
1088 %endif
1089
1090 ;; Final hash is now in T3
1091 %if %%num_initial_blocks > 0
1092 ;; NOTE: obsolete in case %%num_initial_blocks = 0
1093 sub %%LENGTH, 16*%%num_initial_blocks
1094 %endif
1095
1096 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
1097 vpxor %%XMM1, %%XMM1, %%T1
1098 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
1099 %ifidn %%ENC_DEC, DEC
1100 vmovdqa %%XMM1, %%T1
1101 %endif
1102
1103 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
1104 vpxor %%XMM2, %%XMM2, %%T1
1105 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
1106 %ifidn %%ENC_DEC, DEC
1107 vmovdqa %%XMM2, %%T1
1108 %endif
1109
1110 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
1111 vpxor %%XMM3, %%XMM3, %%T1
1112 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
1113 %ifidn %%ENC_DEC, DEC
1114 vmovdqa %%XMM3, %%T1
1115 %endif
1116
1117 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
1118 vpxor %%XMM4, %%XMM4, %%T1
1119 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
1120 %ifidn %%ENC_DEC, DEC
1121 vmovdqa %%XMM4, %%T1
1122 %endif
1123
1124 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
1125 vpxor %%XMM5, %%XMM5, %%T1
1126 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
1127 %ifidn %%ENC_DEC, DEC
1128 vmovdqa %%XMM5, %%T1
1129 %endif
1130
1131 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
1132 vpxor %%XMM6, %%XMM6, %%T1
1133 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
1134 %ifidn %%ENC_DEC, DEC
1135 vmovdqa %%XMM6, %%T1
1136 %endif
1137
1138 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
1139 vpxor %%XMM7, %%XMM7, %%T1
1140 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
1141 %ifidn %%ENC_DEC, DEC
1142 vmovdqa %%XMM7, %%T1
1143 %endif
1144
1145 %if %%num_initial_blocks > 0
1146 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
1147 ;; This macro is executed for length 128 and up,
1148 ;; zero length is checked in GCM_ENC_DEC.
1149 ;; If the last block is partial then the xor will be done later
1150 ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
1151 ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
1152 cmp %%LENGTH, 128
1153 jl %%_initial_skip_last_word_write
1154 %endif
1155 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
1156 vpxor %%XMM8, %%XMM8, %%T1
1157 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
1158 %ifidn %%ENC_DEC, DEC
1159 vmovdqa %%XMM8, %%T1
1160 %endif
1161
1162 ;; Update %%LENGTH with the number of blocks processed
1163 sub %%LENGTH, 16
1164 add %%DATA_OFFSET, 16
1165 %%_initial_skip_last_word_write:
1166 sub %%LENGTH, 128-16
1167 add %%DATA_OFFSET, 128-16
1168
1169 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
1170 ;; Combine GHASHed value with the corresponding ciphertext
1171 vpxor %%XMM1, %%XMM1, %%T3
1172 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
1173 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
1174 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
1175 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
1176 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
1177 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
1178 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
1179
1180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1181
1182 %%_initial_blocks_done:
1183
1184
1185 %endmacro
1186
1187 ;;; INITIAL_BLOCKS macro with support for a partial final block.
1188 ;;; num_initial_blocks is expected to include the partial final block
1189 ;;; in the count.
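;;; Worked example (illustrative): a 53-byte message in the small-size (<128B) path
;;; gives num_initial_blocks = ceil(53/16) = 4: blocks 1-3 are full 16-byte blocks
;;; and block 4 is the 5-byte partial final block included in the count.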
1190 %macro INITIAL_BLOCKS_PARTIAL 25
1191 %define %%GDATA_KEY %1
1192 %define %%GDATA_CTX %2
1193 %define %%CYPH_PLAIN_OUT %3
1194 %define %%PLAIN_CYPH_IN %4
1195 %define %%LENGTH %5
1196 %define %%DATA_OFFSET %6
1197 %define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
1198 %define %%T1 %8
1199 %define %%T2 %9
1200 %define %%T3 %10 ; [out] hash value
1201 %define %%T4 %11
1202 %define %%T5 %12
1203 %define %%CTR %13
1204 %define %%XMM1 %14
1205 %define %%XMM2 %15
1206 %define %%XMM3 %16
1207 %define %%XMM4 %17
1208 %define %%XMM5 %18
1209 %define %%XMM6 %19
1210 %define %%XMM7 %20
1211 %define %%XMM8 %21 ; [in] hash value
1212 %define %%T6 %22
1213 %define %%T_key %23
1214 %define %%ENC_DEC %24
1215 %define %%INSTANCE_TYPE %25
1216
1217 ;; Move AAD_HASH to temp reg
1218 vmovdqu %%T2, %%XMM8
1219
1220 %assign i (9-%%num_initial_blocks)
1221 %rep %%num_initial_blocks
1222 ;; Compute AES counters
1223 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
1224 vmovdqa reg(i), %%CTR
1225 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
1226 %assign i (i+1)
1227 %endrep
1228
1229 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
1230 %assign i (9-%%num_initial_blocks)
1231 %rep %%num_initial_blocks
1232 ; Start AES for %%num_initial_blocks blocks
1233 vpxor reg(i),reg(i),%%T_key
1234 %assign i (i+1)
1235 %endrep
1236
1237 %assign j 1
1238 %rep NROUNDS
1239 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1240 %assign i (9-%%num_initial_blocks)
1241 %rep %%num_initial_blocks
1242 vaesenc reg(i),%%T_key
1243 %assign i (i+1)
1244 %endrep
1245
1246 %assign j (j+1)
1247 %endrep
1248
1249
1250 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1251 %assign i (9-%%num_initial_blocks)
1252 %rep %%num_initial_blocks
1253 vaesenclast reg(i),%%T_key
1254 %assign i (i+1)
1255 %endrep
1256
1257 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1258 ;;; Hash all but the last block of data
1259 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1260
1261 %assign i (9-%%num_initial_blocks)
1262 %rep %%num_initial_blocks-1
1263 ;; Encrypt the message for all but the last block
1264 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1265 vpxor reg(i), reg(i), %%T1
1266 ;; write back ciphertext for %%num_initial_blocks blocks
1267 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1268 add %%DATA_OFFSET, 16
1269 %ifidn %%ENC_DEC, DEC
1270 vmovdqa reg(i), %%T1
1271 %endif
1272 ;; Prepare ciphertext for GHASH computations
1273 vpshufb reg(i), [rel SHUF_MASK]
1274 %assign i (i+1)
1275 %endrep
1276
1277 %if %%num_initial_blocks > 1
1278 ;; The final block of data may be <16B
1279 sub %%LENGTH, 16*(%%num_initial_blocks-1)
1280 %endif
1281
1282 %if %%num_initial_blocks < 8
1283 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
1284 ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
1285 cmp %%LENGTH, 16
1286 jl %%_small_initial_partial_block
1287
1288 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1289 ;;; Handle a full length final block - encrypt and hash all blocks
1290 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1291
1292 sub %%LENGTH, 16
1293 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1294
1295 ;; Encrypt the message
1296 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1297 vpxor reg(i), reg(i), %%T1
1298 ;; write back ciphertext for %%num_initial_blocks blocks
1299 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1300 add %%DATA_OFFSET, 16
1301 %ifidn %%ENC_DEC, DEC
1302 vmovdqa reg(i), %%T1
1303 %endif
1304 ;; Prepare ciphertext for GHASH computations
1305 vpshufb reg(i), [rel SHUF_MASK]
1306
1307 ;; Hash all of the data
1308 %assign i (8-%%num_initial_blocks)
1309 %assign j (9-%%num_initial_blocks)
1310 %assign k (%%num_initial_blocks)
1311 %assign last_block_to_hash 0
1312
1313 %if(%%num_initial_blocks>last_block_to_hash)
1314 ;; Hash in AES state
1315 vpxor %%T2, reg(j)
1316
1317 ;; T2 - incoming AAD hash
1318 ;; reg(i) holds ciphertext
1319 ;; T5 - hash key
1320 ;; T6 - updated xor
1321 ;; reg(1)/xmm1 should now be available for tmp use
1322 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1323 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1324 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1325 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1326 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1327 vpxor %%T6, %%T6, %%T5
1328 %endif
1329
1330 %assign i (i+1)
1331 %assign j (j+1)
1332 %assign k (k-1)
1333 %assign rep_count (%%num_initial_blocks-1)
1334 %rep rep_count
1335
1336 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1337 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1338 vpxor %%T1, %%T1, %%T3
1339
1340 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1341 vpxor %%T4, %%T4, %%T3
1342
1343 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1344 vpxor %%T6, %%T6, %%T3
1345
1346 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1347 vpxor %%T6, %%T6, %%T3
1348
1349 %assign i (i+1)
1350 %assign j (j+1)
1351 %assign k (k-1)
1352 %endrep
1353
1354 ;; Record that a reduction is needed
1355 mov r12, 1
1356
1357 jmp %%_small_initial_compute_hash
1358
1359
1360 %endif ; %if %%num_initial_blocks < 8
1361
1362 %%_small_initial_partial_block:
1363
1364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1365 ;;; Handle ghash for a <16B final block
1366 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1367
1368 ;; In this case, if it is a single call to encrypt, we can
1369 ;; hash all of the data, but if it is an init / update / finalize
1370 ;; series of calls, we need to leave the last block if it is
1371 ;; less than a full block of data.
1372
1373 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1374 vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
1375 ;; Handle a partial final block
1376 ;; GDATA, KEY, T1, T2
1377 ;; r13 - length
1378 ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
1379 ;; NOTE: could be replaced with %%LENGTH but at this point
1380 ;; %%LENGTH is always less than 16.
1381 ;; No PLAIN_CYPH_LEN argument available in this macro.
1382 ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
1383 vpshufb reg(i), [rel SHUF_MASK]
1384
1385 %ifidn %%INSTANCE_TYPE, multi_call
1386 %assign i (8-%%num_initial_blocks)
1387 %assign j (9-%%num_initial_blocks)
1388 %assign k (%%num_initial_blocks-1)
1389 %assign last_block_to_hash 1
1390 %else
1391 %assign i (8-%%num_initial_blocks)
1392 %assign j (9-%%num_initial_blocks)
1393 %assign k (%%num_initial_blocks)
1394 %assign last_block_to_hash 0
1395 %endif
1396
1397 %if(%%num_initial_blocks>last_block_to_hash)
1398 ;; Record that a reduction is needed
1399 mov r12, 1
1400 ;; Hash in AES state
1401 vpxor %%T2, reg(j)
1402
1403 ;; T2 - incoming AAD hash
1404 ;; reg(i) holds ciphertext
1405 ;; T5 - hash key
1406 ;; T6 - updated xor
1407 ;; reg(1)/xmm1 should now be available for tmp use
1408 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1409 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1410 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1411 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1412 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1413 vpxor %%T6, %%T6, %%T5
1414 %else
1415 ;; Record that a reduction is not needed -
1416 ;; In this case no hashes are computed because there
1417 ;; is only one initial block and it is < 16B in length.
1418 xor r12, r12
1419 %endif
1420
1421 %assign i (i+1)
1422 %assign j (j+1)
1423 %assign k (k-1)
1424 %ifidn %%INSTANCE_TYPE, multi_call
1425 %assign rep_count (%%num_initial_blocks-2)
1426 %%_multi_call_hash:
1427 %else
1428 %assign rep_count (%%num_initial_blocks-1)
1429 %endif
1430
1431 %if rep_count < 0
1432 ;; fix for negative rep_count
1433 %assign rep_count 0
1434 %endif
1435
1436 %rep rep_count
1437
1438 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1439 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1440 vpxor %%T1, %%T1, %%T3
1441
1442 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1443 vpxor %%T4, %%T4, %%T3
1444
1445 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1446 vpxor %%T6, %%T6, %%T3
1447
1448 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1449 vpxor %%T6, %%T6, %%T3
1450
1451 %assign i (i+1)
1452 %assign j (j+1)
1453 %assign k (k-1)
1454 %endrep
1455
1456 %%_small_initial_compute_hash:
1457
1458 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1459 ;;; Ghash reduction
1460 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1461
1462 %if(%%num_initial_blocks=1)
1463 %ifidn %%INSTANCE_TYPE, multi_call
1464 ;; We only need to check if a reduction is needed if
1465 ;; initial_blocks == 1 and init/update/final is being used.
1466 ;; In this case we may just have a partial block, and that
1467 ;; gets hashed in finalize.
1468 ;; cmp r12, 0
1469 or r12, r12
1470 je %%_no_reduction_needed
1471 %endif
1472 %endif
1473
1474 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
1475 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
1476 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1477 vpxor %%T4, %%T6, %%T4
1478
1479 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1480 ;; First phase of the reduction
1481 vmovdqu %%T3, [rel POLY2]
1482
1483 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1484 ;; shift-L xmm2 2 DWs
1485 vpslldq %%T2, %%T2, 8
1486 vpxor %%T4, %%T4, %%T2
1487
1488 ;; First phase of the reduction complete
1489 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1490 ;; Second phase of the reduction
1491
1492 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1493 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1494 vpsrldq %%T2, %%T2, 4
1495
1496 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1497 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1498 vpslldq %%T4, %%T4, 4
1499
1500 vpxor %%T4, %%T4, %%T2
1501 ;; Second phase of the reduction complete
1502 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1503 vpxor %%T3, %%T1, %%T4
1504
1505 %ifidn %%INSTANCE_TYPE, multi_call
1506 ;; If using init/update/finalize, we need to xor any partial block data
1507 ;; into the hash.
1508 %if %%num_initial_blocks > 1
1509 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
1510 %if %%num_initial_blocks != 8
1511 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH (stored in [PBlockLen]) is never zero
1512 cmp qword [%%GDATA_CTX + PBlockLen], 0
1513 je %%_no_partial_block_xor
1514 %endif ; %%num_initial_blocks != 8
1515 vpxor %%T3, %%T3, reg(8)
1516 %%_no_partial_block_xor:
1517 %endif ; %%num_initial_blocks > 1
1518 %endif ; %%INSTANCE_TYPE, multi_call
1519
1520 %if(%%num_initial_blocks=1)
1521 %ifidn %%INSTANCE_TYPE, multi_call
1522 ;; NOTE: %%_no_reduction_needed case only valid for
1523 ;; multi_call with initial_blocks = 1.
1524 ;; Look for comment above around '_no_reduction_needed'
1525 ;; The jmp below is obsolete as the code will fall through.
1526
1527 ;; The result is in %%T3
1528 jmp %%_after_reduction
1529
1530 %%_no_reduction_needed:
1531 ;; The hash should end up in T3. The only way we should get here is if
1532 ;; there is a partial block of data, so xor that into the hash.
1533 vpxor %%T3, %%T2, reg(8)
1534 %endif ; %%INSTANCE_TYPE = multi_call
1535 %endif ; %%num_initial_blocks=1
1536
1537 %%_after_reduction:
1538 ;; Final hash is now in T3
1539
1540 %endmacro ; INITIAL_BLOCKS_PARTIAL
1541
1542
1543
1544 ; encrypt 8 blocks at a time
1545 ; ghash the 8 previously encrypted ciphertext blocks
1546 ; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
1547 ; %%DATA_OFFSET is the data offset value
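; Flow note (derived from the code below): the eight ciphertext blocks produced by
; the previous iteration are parked in %%T2 and the stack slots TMP2..TMP8, and their
; GHASH partial products are interleaved ("stitched") with the AES rounds of the
; eight new counter blocks so that PCLMULQDQ and AESENC work can overlap.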
1548 %macro GHASH_8_ENCRYPT_8_PARALLEL 23
1549 %define %%GDATA %1
1550 %define %%CYPH_PLAIN_OUT %2
1551 %define %%PLAIN_CYPH_IN %3
1552 %define %%DATA_OFFSET %4
1553 %define %%T1 %5
1554 %define %%T2 %6
1555 %define %%T3 %7
1556 %define %%T4 %8
1557 %define %%T5 %9
1558 %define %%T6 %10
1559 %define %%CTR %11
1560 %define %%XMM1 %12
1561 %define %%XMM2 %13
1562 %define %%XMM3 %14
1563 %define %%XMM4 %15
1564 %define %%XMM5 %16
1565 %define %%XMM6 %17
1566 %define %%XMM7 %18
1567 %define %%XMM8 %19
1568 %define %%T7 %20
1569 %define %%loop_idx %21
1570 %define %%ENC_DEC %22
1571 %define %%FULL_PARTIAL %23
1572
1573 vmovdqa %%T2, %%XMM1
1574 vmovdqu [rsp + TMP2], %%XMM2
1575 vmovdqu [rsp + TMP3], %%XMM3
1576 vmovdqu [rsp + TMP4], %%XMM4
1577 vmovdqu [rsp + TMP5], %%XMM5
1578 vmovdqu [rsp + TMP6], %%XMM6
1579 vmovdqu [rsp + TMP7], %%XMM7
1580 vmovdqu [rsp + TMP8], %%XMM8
1581
1582 %ifidn %%loop_idx, in_order
1583 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
1584 vmovdqu %%T5, [rel TWO]
1585 vpaddd %%XMM2, %%CTR, %%T5
1586 vpaddd %%XMM3, %%XMM1, %%T5
1587 vpaddd %%XMM4, %%XMM2, %%T5
1588 vpaddd %%XMM5, %%XMM3, %%T5
1589 vpaddd %%XMM6, %%XMM4, %%T5
1590 vpaddd %%XMM7, %%XMM5, %%T5
1591 vpaddd %%XMM8, %%XMM6, %%T5
1592 vmovdqa %%CTR, %%XMM8
1593
1594 vmovdqu %%T5, [rel SHUF_MASK]
1595 vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
1596 vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
1597 vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
1598 vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
1599 vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
1600 vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
1601 vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
1602 vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
1603 %else
1604 vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
1605 vmovdqu %%T5, [rel TWOf]
1606 vpaddd %%XMM2, %%CTR, %%T5
1607 vpaddd %%XMM3, %%XMM1, %%T5
1608 vpaddd %%XMM4, %%XMM2, %%T5
1609 vpaddd %%XMM5, %%XMM3, %%T5
1610 vpaddd %%XMM6, %%XMM4, %%T5
1611 vpaddd %%XMM7, %%XMM5, %%T5
1612 vpaddd %%XMM8, %%XMM6, %%T5
1613 vmovdqa %%CTR, %%XMM8
1614 %endif
1615
1616
1617
1618 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1619
1620 vmovdqu %%T1, [%%GDATA + 16*0]
1621 vpxor %%XMM1, %%XMM1, %%T1
1622 vpxor %%XMM2, %%XMM2, %%T1
1623 vpxor %%XMM3, %%XMM3, %%T1
1624 vpxor %%XMM4, %%XMM4, %%T1
1625 vpxor %%XMM5, %%XMM5, %%T1
1626 vpxor %%XMM6, %%XMM6, %%T1
1627 vpxor %%XMM7, %%XMM7, %%T1
1628 vpxor %%XMM8, %%XMM8, %%T1
1629
1630 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1631
1632 vmovdqu %%T1, [%%GDATA + 16*1]
1633 vaesenc %%XMM1, %%T1
1634 vaesenc %%XMM2, %%T1
1635 vaesenc %%XMM3, %%T1
1636 vaesenc %%XMM4, %%T1
1637 vaesenc %%XMM5, %%T1
1638 vaesenc %%XMM6, %%T1
1639 vaesenc %%XMM7, %%T1
1640 vaesenc %%XMM8, %%T1
1641
1642
1643 vmovdqu %%T1, [%%GDATA + 16*2]
1644 vaesenc %%XMM1, %%T1
1645 vaesenc %%XMM2, %%T1
1646 vaesenc %%XMM3, %%T1
1647 vaesenc %%XMM4, %%T1
1648 vaesenc %%XMM5, %%T1
1649 vaesenc %%XMM6, %%T1
1650 vaesenc %%XMM7, %%T1
1651 vaesenc %%XMM8, %%T1
1652
1653 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1654
1655 vmovdqu %%T5, [%%GDATA + HashKey_8]
1656 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1657 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
1658 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1659 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1660 vpxor %%T6, %%T6, %%T5
1661
1662 vmovdqu %%T1, [%%GDATA + 16*3]
1663 vaesenc %%XMM1, %%T1
1664 vaesenc %%XMM2, %%T1
1665 vaesenc %%XMM3, %%T1
1666 vaesenc %%XMM4, %%T1
1667 vaesenc %%XMM5, %%T1
1668 vaesenc %%XMM6, %%T1
1669 vaesenc %%XMM7, %%T1
1670 vaesenc %%XMM8, %%T1
1671
1672 vmovdqu %%T1, [rsp + TMP2]
1673 vmovdqu %%T5, [%%GDATA + HashKey_7]
1674 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1675 vpxor %%T4, %%T4, %%T3
1676
1677 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1678 vpxor %%T7, %%T7, %%T3
1679
1680 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1681 vpxor %%T6, %%T6, %%T3
1682
1683 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1684 vpxor %%T6, %%T6, %%T3
1685
1686 vmovdqu %%T1, [%%GDATA + 16*4]
1687 vaesenc %%XMM1, %%T1
1688 vaesenc %%XMM2, %%T1
1689 vaesenc %%XMM3, %%T1
1690 vaesenc %%XMM4, %%T1
1691 vaesenc %%XMM5, %%T1
1692 vaesenc %%XMM6, %%T1
1693 vaesenc %%XMM7, %%T1
1694 vaesenc %%XMM8, %%T1
1695
1696 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1697 vmovdqu %%T1, [rsp + TMP3]
1698 vmovdqu %%T5, [%%GDATA + HashKey_6]
1699 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1700 vpxor %%T4, %%T4, %%T3
1701
1702 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1703 vpxor %%T7, %%T7, %%T3
1704
1705 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1706 vpxor %%T6, %%T6, %%T3
1707
1708 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1709 vpxor %%T6, %%T6, %%T3
1710
1711 vmovdqu %%T1, [%%GDATA + 16*5]
1712 vaesenc %%XMM1, %%T1
1713 vaesenc %%XMM2, %%T1
1714 vaesenc %%XMM3, %%T1
1715 vaesenc %%XMM4, %%T1
1716 vaesenc %%XMM5, %%T1
1717 vaesenc %%XMM6, %%T1
1718 vaesenc %%XMM7, %%T1
1719 vaesenc %%XMM8, %%T1
1720
1721
1722 vmovdqu %%T1, [rsp + TMP4]
1723 vmovdqu %%T5, [%%GDATA + HashKey_5]
1724 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1725 vpxor %%T4, %%T4, %%T3
1726
1727 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1728 vpxor %%T7, %%T7, %%T3
1729
1730 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1731 vpxor %%T6, %%T6, %%T3
1732
1733 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1734 vpxor %%T6, %%T6, %%T3
1735
1736 vmovdqu %%T1, [%%GDATA + 16*6]
1737 vaesenc %%XMM1, %%T1
1738 vaesenc %%XMM2, %%T1
1739 vaesenc %%XMM3, %%T1
1740 vaesenc %%XMM4, %%T1
1741 vaesenc %%XMM5, %%T1
1742 vaesenc %%XMM6, %%T1
1743 vaesenc %%XMM7, %%T1
1744 vaesenc %%XMM8, %%T1
1745
1746 vmovdqu %%T1, [rsp + TMP5]
1747 vmovdqu %%T5, [%%GDATA + HashKey_4]
1748 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1749 vpxor %%T4, %%T4, %%T3
1750
1751 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1752 vpxor %%T7, %%T7, %%T3
1753
1754 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1755 vpxor %%T6, %%T6, %%T3
1756
1757 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1758 vpxor %%T6, %%T6, %%T3
1759
1760 vmovdqu %%T1, [%%GDATA + 16*7]
1761 vaesenc %%XMM1, %%T1
1762 vaesenc %%XMM2, %%T1
1763 vaesenc %%XMM3, %%T1
1764 vaesenc %%XMM4, %%T1
1765 vaesenc %%XMM5, %%T1
1766 vaesenc %%XMM6, %%T1
1767 vaesenc %%XMM7, %%T1
1768 vaesenc %%XMM8, %%T1
1769
1770 vmovdqu %%T1, [rsp + TMP6]
1771 vmovdqu %%T5, [%%GDATA + HashKey_3]
1772 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1773 vpxor %%T4, %%T4, %%T3
1774
1775 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1776 vpxor %%T7, %%T7, %%T3
1777
1778 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1779 vpxor %%T6, %%T6, %%T3
1780
1781 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1782 vpxor %%T6, %%T6, %%T3
1783
1784 vmovdqu %%T1, [%%GDATA + 16*8]
1785 vaesenc %%XMM1, %%T1
1786 vaesenc %%XMM2, %%T1
1787 vaesenc %%XMM3, %%T1
1788 vaesenc %%XMM4, %%T1
1789 vaesenc %%XMM5, %%T1
1790 vaesenc %%XMM6, %%T1
1791 vaesenc %%XMM7, %%T1
1792 vaesenc %%XMM8, %%T1
1793
1794 vmovdqu %%T1, [rsp + TMP7]
1795 vmovdqu %%T5, [%%GDATA + HashKey_2]
1796 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1797 vpxor %%T4, %%T4, %%T3
1798
1799 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1800 vpxor %%T7, %%T7, %%T3
1801
1802 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1803 vpxor %%T6, %%T6, %%T3
1804
1805 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1806 vpxor %%T6, %%T6, %%T3
1807
1808 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1809
1810 vmovdqu %%T5, [%%GDATA + 16*9]
1811 vaesenc %%XMM1, %%T5
1812 vaesenc %%XMM2, %%T5
1813 vaesenc %%XMM3, %%T5
1814 vaesenc %%XMM4, %%T5
1815 vaesenc %%XMM5, %%T5
1816 vaesenc %%XMM6, %%T5
1817 vaesenc %%XMM7, %%T5
1818 vaesenc %%XMM8, %%T5
1819
1820 vmovdqu %%T1, [rsp + TMP8]
1821 vmovdqu %%T5, [%%GDATA + HashKey]
1822
1823
1824 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1825 vpxor %%T7, %%T7, %%T3
1826
1827 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1828 vpxor %%T6, %%T6, %%T3
1829
1830 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1831 vpxor %%T6, %%T6, %%T3
1832
1833 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1834 vpxor %%T1, %%T4, %%T3
1835
1836
1837 vmovdqu %%T5, [%%GDATA + 16*10]
1838 %ifndef GCM128_MODE ; GCM192 or GCM256
1839 vaesenc %%XMM1, %%T5
1840 vaesenc %%XMM2, %%T5
1841 vaesenc %%XMM3, %%T5
1842 vaesenc %%XMM4, %%T5
1843 vaesenc %%XMM5, %%T5
1844 vaesenc %%XMM6, %%T5
1845 vaesenc %%XMM7, %%T5
1846 vaesenc %%XMM8, %%T5
1847
1848 vmovdqu %%T5, [%%GDATA + 16*11]
1849 vaesenc %%XMM1, %%T5
1850 vaesenc %%XMM2, %%T5
1851 vaesenc %%XMM3, %%T5
1852 vaesenc %%XMM4, %%T5
1853 vaesenc %%XMM5, %%T5
1854 vaesenc %%XMM6, %%T5
1855 vaesenc %%XMM7, %%T5
1856 vaesenc %%XMM8, %%T5
1857
1858 vmovdqu %%T5, [%%GDATA + 16*12]
1859 %endif
1860 %ifdef GCM256_MODE
1861 vaesenc %%XMM1, %%T5
1862 vaesenc %%XMM2, %%T5
1863 vaesenc %%XMM3, %%T5
1864 vaesenc %%XMM4, %%T5
1865 vaesenc %%XMM5, %%T5
1866 vaesenc %%XMM6, %%T5
1867 vaesenc %%XMM7, %%T5
1868 vaesenc %%XMM8, %%T5
1869
1870 vmovdqu %%T5, [%%GDATA + 16*13]
1871 vaesenc %%XMM1, %%T5
1872 vaesenc %%XMM2, %%T5
1873 vaesenc %%XMM3, %%T5
1874 vaesenc %%XMM4, %%T5
1875 vaesenc %%XMM5, %%T5
1876 vaesenc %%XMM6, %%T5
1877 vaesenc %%XMM7, %%T5
1878 vaesenc %%XMM8, %%T5
1879
1880 vmovdqu %%T5, [%%GDATA + 16*14]
1881 %endif ; GCM256
1882
1883 %assign i 0
1884 %assign j 1
1885 %rep 8
1886
1887 ;; SNP TBD: This is pretty ugly - consider whether just XORing the
1888 ;; data in after vaesenclast is simpler and equally performant. It would
1889 ;; also have to ripple through the partial block and ghash_mul_8 paths.
1890 %ifidn %%FULL_PARTIAL, full
1891 %ifdef NT_LD
1892 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1893 vpxor %%T2, %%T2, %%T5
1894 %else
1895 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1896 %endif
1897
1898 %ifidn %%ENC_DEC, ENC
1899 vaesenclast reg(j), reg(j), %%T2
1900 %else
1901 vaesenclast %%T3, reg(j), %%T2
1902 vpxor reg(j), %%T2, %%T5
1903 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1904 %endif
1905
1906 %else
1907 ; Don't read the final data during partial block processing
1908 %ifdef NT_LD
1909 %if (i<7)
1910 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1911 vpxor %%T2, %%T2, %%T5
1912 %else
1913 ;; Stage the key stream directly in T2 rather than XOR it with plaintext
1914 vmovdqu %%T2, %%T5
1915 %endif
1916 %else
1917 %if (i<7)
1918 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1919 %else
1920 ;; Stage the key stream directly in T2 rather than XOR it with plaintext
1921 vmovdqu %%T2, %%T5
1922 %endif
1923 %endif
1924
1925 %ifidn %%ENC_DEC, ENC
1926 vaesenclast reg(j), reg(j), %%T2
1927 %else
1928 %if (i<7)
1929 vaesenclast %%T3, reg(j), %%T2
1930 vpxor reg(j), %%T2, %%T5
1931 ;; Do not read the data since it could fault
1932 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1933 %else
1934 vaesenclast reg(j), reg(j), %%T2
1935 %endif
1936 %endif
1937 %endif
1938
1939 %assign i (i+1)
1940 %assign j (j+1)
1941 %endrep
1942
1943
1944 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1945
1946
1947 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
1948 vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
1949 vpxor %%T7, %%T7, %%T3
1950 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
1951
1952
1953
1954 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1955 ;first phase of the reduction
1956 vmovdqu %%T3, [rel POLY2]
1957
1958 vpclmulqdq %%T2, %%T3, %%T7, 0x01
1959 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
1960
1961 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
1962 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1963
1964 %ifidn %%ENC_DEC, ENC
1965 ; Write to the Ciphertext buffer
1966 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
1967 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
1968 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
1969 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
1970 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
1971 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
1972 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
1973 %ifidn %%FULL_PARTIAL, full
1974 ;; Avoid writing past the buffer if handling a partial block
1975 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
1976 %endif
1977 %endif
1978
1979
1980 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1981 ;second phase of the reduction
1982 vpclmulqdq %%T2, %%T3, %%T7, 0x00
1983 vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1984
1985 vpclmulqdq %%T4, %%T3, %%T7, 0x10
1986 vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1987
1988 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
1989 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1990 vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
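        ;; The two reduction phases above fold the 256-bit carry-less product
        ;; <%%T1:%%T7> modulo the GHASH polynomial (POLY2 holds its bit-reflected
        ;; representation), leaving the 128-bit remainder in %%T1.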
1991
1992 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
1993 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
1994 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
1995 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
1996 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
1997 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
1998 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
1999 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
2000
2001
2002 vpxor %%XMM1, %%T1
2003
2004
2005 %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
2006
2007
2008 ; GHASH the last 8 ciphertext blocks.
2009 %macro GHASH_LAST_8 16
2010 %define %%GDATA %1
2011 %define %%T1 %2
2012 %define %%T2 %3
2013 %define %%T3 %4
2014 %define %%T4 %5
2015 %define %%T5 %6
2016 %define %%T6 %7
2017 %define %%T7 %8
2018 %define %%XMM1 %9
2019 %define %%XMM2 %10
2020 %define %%XMM3 %11
2021 %define %%XMM4 %12
2022 %define %%XMM5 %13
2023 %define %%XMM6 %14
2024 %define %%XMM7 %15
2025 %define %%XMM8 %16
2026
2027 ;; Karatsuba Method
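        ;; Each block X is multiplied by its hash key power H with three carry-less
        ;; multiplications instead of four:
        ;;   high   = X1*H1                (imm8 0x11)
        ;;   low    = X0*H0                (imm8 0x00)
        ;;   middle = (X1^X0)*(H1^H0)      (imm8 0x00 on the pre-XORed halves)
        ;; The per-block results are accumulated in %%T6 (high), %%T7 (low) and
        ;; %%XMM1 (middle); the cross terms are recovered at the end as
        ;; middle ^ high ^ low.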
2028
2029 vmovdqu %%T5, [%%GDATA + HashKey_8]
2030
2031 vpshufd %%T2, %%XMM1, 01001110b
2032 vpshufd %%T3, %%T5, 01001110b
2033 vpxor %%T2, %%T2, %%XMM1
2034 vpxor %%T3, %%T3, %%T5
2035
2036 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2037 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2038
2039 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2040
2041 ;;;;;;;;;;;;;;;;;;;;;;
2042
2043 vmovdqu %%T5, [%%GDATA + HashKey_7]
2044 vpshufd %%T2, %%XMM2, 01001110b
2045 vpshufd %%T3, %%T5, 01001110b
2046 vpxor %%T2, %%T2, %%XMM2
2047 vpxor %%T3, %%T3, %%T5
2048
2049 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2050 vpxor %%T6, %%T6, %%T4
2051
2052 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2053 vpxor %%T7, %%T7, %%T4
2054
2055 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2056
2057 vpxor %%XMM1, %%XMM1, %%T2
2058
2059 ;;;;;;;;;;;;;;;;;;;;;;
2060
2061 vmovdqu %%T5, [%%GDATA + HashKey_6]
2062 vpshufd %%T2, %%XMM3, 01001110b
2063 vpshufd %%T3, %%T5, 01001110b
2064 vpxor %%T2, %%T2, %%XMM3
2065 vpxor %%T3, %%T3, %%T5
2066
2067 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2068 vpxor %%T6, %%T6, %%T4
2069
2070 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2071 vpxor %%T7, %%T7, %%T4
2072
2073 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2074
2075 vpxor %%XMM1, %%XMM1, %%T2
2076
2077 ;;;;;;;;;;;;;;;;;;;;;;
2078
2079 vmovdqu %%T5, [%%GDATA + HashKey_5]
2080 vpshufd %%T2, %%XMM4, 01001110b
2081 vpshufd %%T3, %%T5, 01001110b
2082 vpxor %%T2, %%T2, %%XMM4
2083 vpxor %%T3, %%T3, %%T5
2084
2085 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2086 vpxor %%T6, %%T6, %%T4
2087
2088 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2089 vpxor %%T7, %%T7, %%T4
2090
2091 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2092
2093 vpxor %%XMM1, %%XMM1, %%T2
2094
2095 ;;;;;;;;;;;;;;;;;;;;;;
2096
2097 vmovdqu %%T5, [%%GDATA + HashKey_4]
2098 vpshufd %%T2, %%XMM5, 01001110b
2099 vpshufd %%T3, %%T5, 01001110b
2100 vpxor %%T2, %%T2, %%XMM5
2101 vpxor %%T3, %%T3, %%T5
2102
2103 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2104 vpxor %%T6, %%T6, %%T4
2105
2106 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2107 vpxor %%T7, %%T7, %%T4
2108
2109 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2110
2111 vpxor %%XMM1, %%XMM1, %%T2
2112
2113 ;;;;;;;;;;;;;;;;;;;;;;
2114
2115 vmovdqu %%T5, [%%GDATA + HashKey_3]
2116 vpshufd %%T2, %%XMM6, 01001110b
2117 vpshufd %%T3, %%T5, 01001110b
2118 vpxor %%T2, %%T2, %%XMM6
2119 vpxor %%T3, %%T3, %%T5
2120
2121 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2122 vpxor %%T6, %%T6, %%T4
2123
2124 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2125 vpxor %%T7, %%T7, %%T4
2126
2127 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2128
2129 vpxor %%XMM1, %%XMM1, %%T2
2130
2131 ;;;;;;;;;;;;;;;;;;;;;;
2132
2133 vmovdqu %%T5, [%%GDATA + HashKey_2]
2134 vpshufd %%T2, %%XMM7, 01001110b
2135 vpshufd %%T3, %%T5, 01001110b
2136 vpxor %%T2, %%T2, %%XMM7
2137 vpxor %%T3, %%T3, %%T5
2138
2139 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2140 vpxor %%T6, %%T6, %%T4
2141
2142 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2143 vpxor %%T7, %%T7, %%T4
2144
2145 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2146
2147 vpxor %%XMM1, %%XMM1, %%T2
2148
2149 ;;;;;;;;;;;;;;;;;;;;;;
2150
2151 vmovdqu %%T5, [%%GDATA + HashKey]
2152 vpshufd %%T2, %%XMM8, 01001110b
2153 vpshufd %%T3, %%T5, 01001110b
2154 vpxor %%T2, %%T2, %%XMM8
2155 vpxor %%T3, %%T3, %%T5
2156
2157 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
2158 vpxor %%T6, %%T6, %%T4
2159
2160 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
2161 vpxor %%T7, %%T7, %%T4
2162
2163 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2164
2165 vpxor %%XMM1, %%XMM1, %%T2
2166 vpxor %%XMM1, %%XMM1, %%T6
2167 vpxor %%T2, %%XMM1, %%T7
2168
2169
2170
2171
2172 vpslldq %%T4, %%T2, 8
2173 vpsrldq %%T2, %%T2, 8
2174
2175 vpxor %%T7, %%T7, %%T4
2176 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2177
2178 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2179 ;first phase of the reduction
2180 vmovdqu %%T3, [rel POLY2]
2181
2182 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2183 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
2184
2185 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2186 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2187
2188
2189 ;second phase of the reduction
2190 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2191 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2192
2193 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2194 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2195
2196 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2197 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2198 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2199 %endmacro
2200
2201
2202 ; GHASH the last 7 ciphertext blocks.
2203 %macro GHASH_LAST_7 15
2204 %define %%GDATA %1
2205 %define %%T1 %2
2206 %define %%T2 %3
2207 %define %%T3 %4
2208 %define %%T4 %5
2209 %define %%T5 %6
2210 %define %%T6 %7
2211 %define %%T7 %8
2212 %define %%XMM1 %9
2213 %define %%XMM2 %10
2214 %define %%XMM3 %11
2215 %define %%XMM4 %12
2216 %define %%XMM5 %13
2217 %define %%XMM6 %14
2218 %define %%XMM7 %15
2219
2220 ;; Karatsuba Method
2221
2222 vmovdqu %%T5, [%%GDATA + HashKey_7]
2223
2224 vpshufd %%T2, %%XMM1, 01001110b
2225 vpshufd %%T3, %%T5, 01001110b
2226 vpxor %%T2, %%T2, %%XMM1
2227 vpxor %%T3, %%T3, %%T5
2228
2229 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2230 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2231
2232 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2233
2234 ;;;;;;;;;;;;;;;;;;;;;;
2235
2236 vmovdqu %%T5, [%%GDATA + HashKey_6]
2237 vpshufd %%T2, %%XMM2, 01001110b
2238 vpshufd %%T3, %%T5, 01001110b
2239 vpxor %%T2, %%T2, %%XMM2
2240 vpxor %%T3, %%T3, %%T5
2241
2242 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2243 vpxor %%T6, %%T6, %%T4
2244
2245 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2246 vpxor %%T7, %%T7, %%T4
2247
2248 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2249
2250 vpxor %%XMM1, %%XMM1, %%T2
2251
2252 ;;;;;;;;;;;;;;;;;;;;;;
2253
2254 vmovdqu %%T5, [%%GDATA + HashKey_5]
2255 vpshufd %%T2, %%XMM3, 01001110b
2256 vpshufd %%T3, %%T5, 01001110b
2257 vpxor %%T2, %%T2, %%XMM3
2258 vpxor %%T3, %%T3, %%T5
2259
2260 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2261 vpxor %%T6, %%T6, %%T4
2262
2263 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2264 vpxor %%T7, %%T7, %%T4
2265
2266 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2267
2268 vpxor %%XMM1, %%XMM1, %%T2
2269
2270 ;;;;;;;;;;;;;;;;;;;;;;
2271
2272 vmovdqu %%T5, [%%GDATA + HashKey_4]
2273 vpshufd %%T2, %%XMM4, 01001110b
2274 vpshufd %%T3, %%T5, 01001110b
2275 vpxor %%T2, %%T2, %%XMM4
2276 vpxor %%T3, %%T3, %%T5
2277
2278 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2279 vpxor %%T6, %%T6, %%T4
2280
2281 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2282 vpxor %%T7, %%T7, %%T4
2283
2284 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2285
2286 vpxor %%XMM1, %%XMM1, %%T2
2287
2288 ;;;;;;;;;;;;;;;;;;;;;;
2289
2290 vmovdqu %%T5, [%%GDATA + HashKey_3]
2291 vpshufd %%T2, %%XMM5, 01001110b
2292 vpshufd %%T3, %%T5, 01001110b
2293 vpxor %%T2, %%T2, %%XMM5
2294 vpxor %%T3, %%T3, %%T5
2295
2296 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2297 vpxor %%T6, %%T6, %%T4
2298
2299 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2300 vpxor %%T7, %%T7, %%T4
2301
2302 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2303
2304 vpxor %%XMM1, %%XMM1, %%T2
2305
2306 ;;;;;;;;;;;;;;;;;;;;;;
2307
2308 vmovdqu %%T5, [%%GDATA + HashKey_2]
2309 vpshufd %%T2, %%XMM6, 01001110b
2310 vpshufd %%T3, %%T5, 01001110b
2311 vpxor %%T2, %%T2, %%XMM6
2312 vpxor %%T3, %%T3, %%T5
2313
2314 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2315 vpxor %%T6, %%T6, %%T4
2316
2317 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2318 vpxor %%T7, %%T7, %%T4
2319
2320 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2321
2322 vpxor %%XMM1, %%XMM1, %%T2
2323
2324 ;;;;;;;;;;;;;;;;;;;;;;
2325
2326 vmovdqu %%T5, [%%GDATA + HashKey_1]
2327 vpshufd %%T2, %%XMM7, 01001110b
2328 vpshufd %%T3, %%T5, 01001110b
2329 vpxor %%T2, %%T2, %%XMM7
2330 vpxor %%T3, %%T3, %%T5
2331
2332 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2333 vpxor %%T6, %%T6, %%T4
2334
2335 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2336 vpxor %%T7, %%T7, %%T4
2337
2338 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2339
2340 vpxor %%XMM1, %%XMM1, %%T2
2341
2342 ;;;;;;;;;;;;;;;;;;;;;;
2343
2344 vpxor %%XMM1, %%XMM1, %%T6
2345 vpxor %%T2, %%XMM1, %%T7
2346
2347
2348
2349
2350 vpslldq %%T4, %%T2, 8
2351 vpsrldq %%T2, %%T2, 8
2352
2353 vpxor %%T7, %%T7, %%T4
2354 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2355
2356 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2357 ;first phase of the reduction
2358 vmovdqu %%T3, [rel POLY2]
2359
2360 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2361 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
2362
2363 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2365
2366
2367 ;second phase of the reduction
2368 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2369 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2370
2371 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2372 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2373
2374 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2375 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2376 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2377 %endmacro
2378
2379
2380
2381 ;;; Handle encryption of the final partial block
2382 ;;; IN:
2383 ;;; r13 - Number of bytes to read
2384 ;;; MODIFIES:
2385 ;;; KEY - Key for encrypting the partial block
2386 ;;; HASH - Current hash value
2387 ;;; SMASHES:
2388 ;;; r10, r12, r15, rax
2389 ;;; T1, T2
2390 ;;; Note:
2391 ;;; PLAIN_CYPH_LEN, %6, is passed only to determine
2392 ;;; if buffer is big enough to do a 16 byte read & shift.
2393 ;;; 'LT16' is passed here only if buffer is known to be smaller
2394 ;;; than 16 bytes.
2395 ;;; Any other value passed here will result in 16 byte read
2396 ;;; code path.
2397 ;;; TBD: Remove HASH from the instantiation
2398 %macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
2399 %define %%KEY %1
2400 %define %%T1 %2
2401 %define %%T2 %3
2402 %define %%CYPH_PLAIN_OUT %4
2403 %define %%PLAIN_CYPH_IN %5
2404 %define %%PLAIN_CYPH_LEN %6
2405 %define %%ENC_DEC %7
2406 %define %%DATA_OFFSET %8
2407
2408 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
2409
2410 ;; T1 - packed output
2411 ;; r10 - input data address
2412 ;; r13 - input data length
2413 ;; rax - temp registers
2414 ;; out:
2415 ;; k1 - valid byte mask
2416 READ_SMALL_DATA_INPUT %%T1, r10, r13, rax
2417
2418 ;; At this point T1 contains the partial block data
2419 %ifidn %%ENC_DEC, DEC
2420 ;; Plaintext XOR E(K, Yn)
2421 ;; Set aside the ciphertext
2422 ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
2423 vmovdqu8 %%T2{k1}{z}, %%T1
2424 vpxor %%KEY, %%KEY, %%T1
2425 %else
2426 ;; Plaintext XOR E(K, Yn)
2427 ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
2428 vpxor %%KEY, %%KEY, %%T1
2429 %endif
2430 vmovdqu8 %%KEY{k1}{z}, %%KEY
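        ;; k1 (set by READ_SMALL_DATA_INPUT) has its low r13 bits set, so the
        ;; zero-masked move above clears the bytes beyond the message; the masked
        ;; store below then writes exactly r13 bytes of output.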
2431
2432 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2433 ;; Output r13 Bytes
2434 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY
2435
2436 %ifidn %%ENC_DEC, DEC
2437 ;; If decrypt, restore the ciphertext into %%KEY
2438 vmovdqa %%KEY, %%T2
2439 %endif
2440 %endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
2441
2442
2443
2444 ; Encryption of a single block
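; NROUNDS is 9/11/13 for AES-128/192/256, so rounds 1..NROUNDS use vaesenc and
; the final round key is applied with vaesenclast.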
2445 %macro ENCRYPT_SINGLE_BLOCK 2
2446 %define %%GDATA %1
2447 %define %%XMM0 %2
2448
2449 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
2450 %assign i 1
2451 %rep NROUNDS
2452 vaesenc %%XMM0, [%%GDATA+16*i]
2453 %assign i (i+1)
2454 %endrep
2455 vaesenclast %%XMM0, [%%GDATA+16*i]
2456 %endmacro
2457
2458
2459 ;; Start of Stack Setup
2460
2461 %macro FUNC_SAVE 0
2462 ;; Required for Update/GMC_ENC
2463 ;the number of pushes must equal STACK_OFFSET
2464 push r12
2465 push r13
2466 push r14
2467 push r15
2468 mov r14, rsp
2469
2470 sub rsp, VARIABLE_OFFSET
2471 and rsp, ~63
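        ;; align the new stack frame to 64 bytes; the pre-adjustment rsp is kept
        ;; in r14 and restored by FUNC_RESTORE.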
2472
2473 %ifidn __OUTPUT_FORMAT__, win64
2474 ; xmm6:xmm15 need to be maintained for Windows
2475 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2476 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
2477 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
2478 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
2479 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
2480 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
2481 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
2482 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
2483 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
2484 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
2485 %endif
2486 %endmacro
2487
2488
2489 %macro FUNC_RESTORE 0
2490
2491 %ifidn __OUTPUT_FORMAT__, win64
2492 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
2493 vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
2494 vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
2495 vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
2496 vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
2497 vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
2498 vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
2499 vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
2500 vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
2501 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2502 %endif
2503 ;; Required for Update/GMC_ENC
2504 mov rsp, r14
2505 pop r15
2506 pop r14
2507 pop r13
2508 pop r12
2509 %endmacro
2510
2511 %macro FUNC_SAVE_AVX512 0
2512 ;; Required for Update/GMC_ENC
2513 ;GP registers are saved into the STACK_OFFSET_AVX512 area below rather than pushed
2514 mov rax, rsp
2515
2516 sub rsp, VARIABLE_OFFSET_AVX512
2517 and rsp, ~63
2518
2519 mov [rsp + STACK_OFFSET_AVX512 + 0*8], r12
2520 mov [rsp + STACK_OFFSET_AVX512 + 1*8], r13
2521 mov [rsp + STACK_OFFSET_AVX512 + 2*8], r14
2522 mov [rsp + STACK_OFFSET_AVX512 + 3*8], r15
2523 mov [rsp + STACK_OFFSET_AVX512 + 4*8], rax ; stack
2524 mov r14, rax ; r14 is used to retrieve stack args
2525 mov [rsp + STACK_OFFSET_AVX512 + 5*8], rbp
2526 mov [rsp + STACK_OFFSET_AVX512 + 6*8], rbx
2527 %ifidn __OUTPUT_FORMAT__, win64
2528 mov [rsp + STACK_OFFSET_AVX512 + 7*8], rdi
2529 mov [rsp + STACK_OFFSET_AVX512 + 8*8], rsi
2530 %endif
2531
2532 %ifidn __OUTPUT_FORMAT__, win64
2533 ; xmm6:xmm15 need to be maintained for Windows
2534 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 0*16], xmm6
2535 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 1*16], xmm7
2536 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 2*16], xmm8
2537 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 3*16], xmm9
2538 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 4*16], xmm10
2539 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 5*16], xmm11
2540 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 6*16], xmm12
2541 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 7*16], xmm13
2542 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 8*16], xmm14
2543 vmovdqu [rsp + LOCAL_STORAGE_AVX512 + 9*16], xmm15
2544 %endif
2545 %endmacro
2546
2547
2548 %macro FUNC_RESTORE_AVX512 0
2549
2550 %ifidn __OUTPUT_FORMAT__, win64
2551 vmovdqu xmm15, [rsp + LOCAL_STORAGE_AVX512 + 9*16]
2552 vmovdqu xmm14, [rsp + LOCAL_STORAGE_AVX512 + 8*16]
2553 vmovdqu xmm13, [rsp + LOCAL_STORAGE_AVX512 + 7*16]
2554 vmovdqu xmm12, [rsp + LOCAL_STORAGE_AVX512 + 6*16]
2555 vmovdqu xmm11, [rsp + LOCAL_STORAGE_AVX512 + 5*16]
2556 vmovdqu xmm10, [rsp + LOCAL_STORAGE_AVX512 + 4*16]
2557 vmovdqu xmm9, [rsp + LOCAL_STORAGE_AVX512 + 3*16]
2558 vmovdqu xmm8, [rsp + LOCAL_STORAGE_AVX512 + 2*16]
2559 vmovdqu xmm7, [rsp + LOCAL_STORAGE_AVX512 + 1*16]
2560 vmovdqu xmm6, [rsp + LOCAL_STORAGE_AVX512 + 0*16]
2561 %endif
2562
2563 ;; Required for Update/GMC_ENC
2564 mov rbp, [rsp + STACK_OFFSET_AVX512 + 5*8]
2565 mov rbx, [rsp + STACK_OFFSET_AVX512 + 6*8]
2566 %ifidn __OUTPUT_FORMAT__, win64
2567 mov rdi, [rsp + STACK_OFFSET_AVX512 + 7*8]
2568 mov rsi, [rsp + STACK_OFFSET_AVX512 + 8*8]
2569 %endif
2570 mov r12, [rsp + STACK_OFFSET_AVX512 + 0*8]
2571 mov r13, [rsp + STACK_OFFSET_AVX512 + 1*8]
2572 mov r14, [rsp + STACK_OFFSET_AVX512 + 2*8]
2573 mov r15, [rsp + STACK_OFFSET_AVX512 + 3*8]
2574 mov rsp, [rsp + STACK_OFFSET_AVX512 + 4*8] ; stack
2575 %endmacro
2576
2577
2578 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2579 ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
2580 ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
2581 ; Additional Authentication data (A_IN), Additional Data length (A_LEN).
2582 ; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the remaining GDATA_CTX fields initialized.
2583 ; Clobbers rax, r10-r13, and xmm0-xmm6
2584 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2585 %macro GCM_INIT 8
2586 %define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
2587 %define %%GDATA_CTX %2 ; [in] GCM context pointer
2588 %define %%IV %3 ; [in] IV pointer
2589 %define %%A_IN %4 ; [in] AAD pointer
2590 %define %%A_LEN %5 ; [in] AAD length in bytes
2591 %define %%GPR1 %6 ; temp GPR
2592 %define %%GPR2 %7 ; temp GPR
2593 %define %%GPR3 %8 ; temp GPR
2594
2595 %define %%AAD_HASH xmm14
2596
2597 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
2598
2599 mov %%GPR1, %%A_LEN
2600 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
2601 mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length
2602
2603 xor %%GPR1, %%GPR1
2604 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0
2605 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0
2606
2607 ;; read 12 IV bytes and pad with 0x00000001
2608 mov %%GPR2, %%IV
2609 vmovd xmm3, [%%GPR2 + 8]
2610 vpslldq xmm3, 8
2611 vmovq xmm2, [%%GPR2]
2612 vmovdqa xmm4, [rel ONEf]
2613 vpternlogq xmm2, xmm3, xmm4, 0xfe ; xmm2 = xmm2 or xmm3 or xmm4
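        ;; xmm2 now holds the pre-counter block J0 = IV (12 bytes) || 0x00000001
        ;; in memory byte order.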
2614
2615 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
2616
2617 ;; store IV as counter in LE format
2618 vpshufb xmm2, [rel SHUF_MASK]
2619 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
2620 %endmacro
2621
2622 %macro GCM_ENC_DEC_SMALL 12
2623 %define %%GDATA_KEY %1
2624 %define %%GDATA_CTX %2
2625 %define %%CYPH_PLAIN_OUT %3
2626 %define %%PLAIN_CYPH_IN %4
2627 %define %%PLAIN_CYPH_LEN %5
2628 %define %%ENC_DEC %6
2629 %define %%DATA_OFFSET %7
2630 %define %%LENGTH %8 ; assumed r13
2631 %define %%NUM_BLOCKS %9
2632 %define %%CTR %10 ; assumed xmm9
2633 %define %%HASH_OUT %11 ; assumed xmm14
2634 %define %%INSTANCE_TYPE %12
2635
2636 ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
2637 ;; cmp %%NUM_BLOCKS, 0
2638 ;; je %%_small_initial_blocks_encrypted
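        ;; INITIAL_BLOCKS_PARTIAL needs the block count as an assemble-time
        ;; constant, so the run-time %%NUM_BLOCKS value is dispatched through the
        ;; compare/jump chain below.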
2639 cmp %%NUM_BLOCKS, 8
2640 je %%_small_initial_num_blocks_is_8
2641 cmp %%NUM_BLOCKS, 7
2642 je %%_small_initial_num_blocks_is_7
2643 cmp %%NUM_BLOCKS, 6
2644 je %%_small_initial_num_blocks_is_6
2645 cmp %%NUM_BLOCKS, 5
2646 je %%_small_initial_num_blocks_is_5
2647 cmp %%NUM_BLOCKS, 4
2648 je %%_small_initial_num_blocks_is_4
2649 cmp %%NUM_BLOCKS, 3
2650 je %%_small_initial_num_blocks_is_3
2651 cmp %%NUM_BLOCKS, 2
2652 je %%_small_initial_num_blocks_is_2
2653
2654 jmp %%_small_initial_num_blocks_is_1
2655
2656
2657 %%_small_initial_num_blocks_is_8:
2658 ;; r13 - %%LENGTH
2659 ;; xmm12 - T1
2660 ;; xmm13 - T2
2661 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2662 ;; xmm15 - T4
2663 ;; xmm11 - T5
2664 ;; xmm9 - CTR
2665 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2666 ;; xmm2 - XMM2
2667 ;; xmm3 - XMM3
2668 ;; xmm4 - XMM4
2669 ;; xmm5 - XMM5
2670 ;; xmm6 - XMM6
2671 ;; xmm7 - XMM7
2672 ;; xmm8 - XMM8 - AAD HASH IN
2673 ;; xmm10 - T6
2674 ;; xmm0 - T_key
2675 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2676 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \
2677 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2678 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2679 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2680 jmp %%_small_initial_blocks_encrypted
2681
2682 %%_small_initial_num_blocks_is_7:
2683 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2684 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \
2685 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2686 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2687 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2688 jmp %%_small_initial_blocks_encrypted
2689
2690 %%_small_initial_num_blocks_is_6:
2691 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2692 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \
2693 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2694 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2695 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2696 jmp %%_small_initial_blocks_encrypted
2697
2698 %%_small_initial_num_blocks_is_5:
2699 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2700 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \
2701 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2702 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2703 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2704 jmp %%_small_initial_blocks_encrypted
2705
2706 %%_small_initial_num_blocks_is_4:
2707 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2708 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \
2709 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2710 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2711 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2712 jmp %%_small_initial_blocks_encrypted
2713
2714 %%_small_initial_num_blocks_is_3:
2715 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2716 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \
2717 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2718 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2719 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2720 jmp %%_small_initial_blocks_encrypted
2721
2722 %%_small_initial_num_blocks_is_2:
2723 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2724 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \
2725 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2726 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2727 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2728 jmp %%_small_initial_blocks_encrypted
2729
2730 %%_small_initial_num_blocks_is_1:
2731 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2732 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \
2733 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2734 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2735 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2736 %%_small_initial_blocks_encrypted:
2737
2738 %endmacro ; GCM_ENC_DEC_SMALL
2739
2740 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2741 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
2742 ; has been initialized by GCM_INIT
2743 ; Requires that the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
2744 ; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
2745 ; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
2746 ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
2747 ; Clobbers rax, r10-r15, and xmm0-xmm15
2748 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2749 %macro GCM_ENC_DEC 7
2750 %define %%GDATA_KEY %1
2751 %define %%GDATA_CTX %2
2752 %define %%CYPH_PLAIN_OUT %3
2753 %define %%PLAIN_CYPH_IN %4
2754 %define %%PLAIN_CYPH_LEN %5
2755 %define %%ENC_DEC %6
2756 %define %%INSTANCE_TYPE %7
2757 %define %%DATA_OFFSET r11
2758
2759 ; Macro flow:
2760 ; calculate the number of 16byte blocks in the message
2761 ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
2762 ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
2763 ; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
2764
2765 %ifidn __OUTPUT_FORMAT__, win64
2766 cmp %%PLAIN_CYPH_LEN, 0
2767 %else
2768 or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
2769 %endif
2770 je %%_enc_dec_done
2771
2772 xor %%DATA_OFFSET, %%DATA_OFFSET
2773 ;; Update length of data processed
2774 %ifidn __OUTPUT_FORMAT__, win64
2775 mov rax, %%PLAIN_CYPH_LEN
2776 add [%%GDATA_CTX + InLen], rax
2777 %else
2778 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
2779 %endif
2780 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2781 vmovdqu xmm8, [%%GDATA_CTX + AadHash]
2782
2783 %ifidn %%INSTANCE_TYPE, multi_call
2784 ;; NOTE: partial block processing only makes sense for multi_call here.
2785 ;; Used for the update flow - if there was a previous partial
2786 ;; block fill the remaining bytes here.
2787 PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
2788 %endif
2789
2790 ;; lift CTR set from initial_blocks to here
2791 %ifidn %%INSTANCE_TYPE, single_call
2792 vmovdqu xmm9, xmm2
2793 %else
2794 vmovdqu xmm9, [%%GDATA_CTX + CurCount]
2795 %endif
2796
2797 ;; Save the amount of data left to process in r13
2798 mov r13, %%PLAIN_CYPH_LEN
2799 %ifidn %%INSTANCE_TYPE, multi_call
2800 ;; NOTE: %%DATA_OFFSET is zero in single_call case.
2801 ;; Consequently PLAIN_CYPH_LEN will never be zero after
2802 ;; %%DATA_OFFSET subtraction below.
2803 sub r13, %%DATA_OFFSET
2804
2805 ;; There may be no more data if it was consumed in the partial block.
2806 cmp r13, 0
2807 je %%_enc_dec_done
2808 %endif ; %%INSTANCE_TYPE, multi_call
2809 mov r10, r13
2810
2811 ;; Determine how many blocks to process in INITIAL
2812 mov r12, r13
2813 shr r12, 4
2814 and r12, 7
2815
2816 ;; Process one additional block in INITIAL if there is a partial block
2817 and r10, 0xf
2818 blsmsk r10, r10 ; Set CF if zero
2819 cmc ; Flip CF
2820 adc r12, 0x0 ; Process an additional INITIAL block if CF set
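        ;; In effect (illustrative C): r12 = ((len >> 4) & 7) + ((len & 0xf) != 0);
        ;; blsmsk sets CF only when (len & 0xf) is zero, cmc inverts it and adc
        ;; adds the resulting carry.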
2821
2822 ;; Anything less than 128B is handled by the small message code, which
2823 ;; can process up to 8 16-byte blocks (including a trailing partial block).
2824 cmp r13, 128
2825 jge %%_large_message_path
2826
2827 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
2828 jmp %%_ghash_done
2829
2830 %%_large_message_path:
2831 and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this
2832 ; can be handled by the x8 partial loop.
2833
2834 cmp r12, 0
2835 je %%_initial_num_blocks_is_0
2836 cmp r12, 7
2837 je %%_initial_num_blocks_is_7
2838 cmp r12, 6
2839 je %%_initial_num_blocks_is_6
2840 cmp r12, 5
2841 je %%_initial_num_blocks_is_5
2842 cmp r12, 4
2843 je %%_initial_num_blocks_is_4
2844 cmp r12, 3
2845 je %%_initial_num_blocks_is_3
2846 cmp r12, 2
2847 je %%_initial_num_blocks_is_2
2848
2849 jmp %%_initial_num_blocks_is_1
2850
2851 %%_initial_num_blocks_is_7:
2852 ;; r13 - %%LENGTH
2853 ;; xmm12 - T1
2854 ;; xmm13 - T2
2855 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2856 ;; xmm15 - T4
2857 ;; xmm11 - T5
2858 ;; xmm9 - CTR
2859 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2860 ;; xmm2 - XMM2
2861 ;; xmm3 - XMM3
2862 ;; xmm4 - XMM4
2863 ;; xmm5 - XMM5
2864 ;; xmm6 - XMM6
2865 ;; xmm7 - XMM7
2866 ;; xmm8 - XMM8 - AAD HASH IN
2867 ;; xmm10 - T6
2868 ;; xmm0 - T_key
2869 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2870 jmp %%_initial_blocks_encrypted
2871
2872 %%_initial_num_blocks_is_6:
2873 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2874 jmp %%_initial_blocks_encrypted
2875
2876 %%_initial_num_blocks_is_5:
2877 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2878 jmp %%_initial_blocks_encrypted
2879
2880 %%_initial_num_blocks_is_4:
2881 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2882 jmp %%_initial_blocks_encrypted
2883
2884 %%_initial_num_blocks_is_3:
2885 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2886 jmp %%_initial_blocks_encrypted
2887
2888 %%_initial_num_blocks_is_2:
2889 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2890 jmp %%_initial_blocks_encrypted
2891
2892 %%_initial_num_blocks_is_1:
2893 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2894 jmp %%_initial_blocks_encrypted
2895
2896 %%_initial_num_blocks_is_0:
2897 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2898
2899
2900 %%_initial_blocks_encrypted:
2901 ;; If the entire message was processed in the initial blocks, only the hash remains to be done
2902 cmp r13, 0
2903 je %%_encrypt_done
2904
2905 ;; Encrypt the final <16 byte (partial) block, then hash
2906 cmp r13, 16
2907 jl %%_encrypt_final_partial
2908
2909 ;; Process 7 full blocks plus a partial block
2910 cmp r13, 128
2911 jl %%_encrypt_by_8_partial
2912
2913
2914 %%_encrypt_by_8_parallel:
2915 ;; in_order vs. out_order is an optimization to increment the counter without shuffling
2916 ;; it back into little endian. r15d keeps track of when we need to increment in order so
2917 ;; that the carry is handled correctly.
2918 vmovd r15d, xmm9
2919 and r15d, 255
2920 vpshufb xmm9, [rel SHUF_MASK]
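        ;; r15d holds the low byte of the counter. While it stays at or below 255-8
        ;; the next eight increments cannot carry out of that byte, so the cheaper
        ;; out_order path is taken; otherwise the in_order path below shuffles the
        ;; counter back so the carry propagates correctly.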
2921
2922
2923 %%_encrypt_by_8_new:
2924 cmp r15d, 255-8
2925 jg %%_encrypt_by_8
2926
2927
2928
2929 ;; xmm0 - T1
2930 ;; xmm10 - T2
2931 ;; xmm11 - T3
2932 ;; xmm12 - T4
2933 ;; xmm13 - T5
2934 ;; xmm14 - T6
2935 ;; xmm9 - CTR
2936 ;; xmm1 - XMM1
2937 ;; xmm2 - XMM2
2938 ;; xmm3 - XMM3
2939 ;; xmm4 - XMM4
2940 ;; xmm5 - XMM5
2941 ;; xmm6 - XMM6
2942 ;; xmm7 - XMM7
2943 ;; xmm8 - XMM8
2944 ;; xmm15 - T7
2945 add r15b, 8
2946 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
2947 add %%DATA_OFFSET, 128
2948 sub r13, 128
2949 cmp r13, 128
2950 jge %%_encrypt_by_8_new
2951
2952 vpshufb xmm9, [rel SHUF_MASK]
2953 jmp %%_encrypt_by_8_parallel_done
2954
2955 %%_encrypt_by_8:
2956 vpshufb xmm9, [rel SHUF_MASK]
2957 add r15b, 8
2958 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
2959 vpshufb xmm9, [rel SHUF_MASK]
2960 add %%DATA_OFFSET, 128
2961 sub r13, 128
2962 cmp r13, 128
2963 jge %%_encrypt_by_8_new
2964 vpshufb xmm9, [rel SHUF_MASK]
2965
2966
2967 %%_encrypt_by_8_parallel_done:
2968 ;; Test to see if we need a by 8 with partial block. At this point
2969 ;; bytes remaining should be either zero or between 113-127.
2970 cmp r13, 0
2971 je %%_encrypt_done
2972
2973 %%_encrypt_by_8_partial:
2974 ;; Shuffle needed to align key for partial block xor. out_order
2975 ;; is a little faster because it avoids extra shuffles.
2976 ;; TBD: Might need to account for when we don't have room to increment the counter.
2977
2978
2979 ;; Process parallel buffers with a final partial block.
2980 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
2981
2982
2983 add %%DATA_OFFSET, 128-16
2984 sub r13, 128-16
2985
2986 %%_encrypt_final_partial:
2987
2988 vpshufb xmm8, [rel SHUF_MASK]
2989 mov [%%GDATA_CTX + PBlockLen], r13
2990 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
2991
2992 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
2993 ;; GDATA, KEY, T1, T2
2994 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
2995
2996 vpshufb xmm8, [rel SHUF_MASK]
2997
2998
2999 %%_encrypt_done:
3000
3001 ;; Mapping to macro parameters
3002 ;; IN:
3003 ;; xmm9 contains the counter
3004 ;; xmm1-xmm8 contain the xor'd ciphertext
3005 ;; OUT:
3006 ;; xmm14 contains the final hash
3007 ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3008 %ifidn %%INSTANCE_TYPE, multi_call
3009 mov r13, [%%GDATA_CTX + PBlockLen]
3010 cmp r13, 0
3011 jz %%_hash_last_8
3012 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
3013 ;; XOR the partial word into the hash
3014 vpxor xmm14, xmm14, xmm8
3015 jmp %%_ghash_done
3016 %endif
3017 %%_hash_last_8:
3018 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3019
3020 %%_ghash_done:
3021 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
3022 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
3023
3024 %%_enc_dec_done:
3025
3026
3027 %endmacro ; GCM_ENC_DEC
3028
3029 ;;; ===========================================================================
3030 ;;; AESROUND4x128 macro
3031 ;;; - 4 lanes, 8 blocks per lane
3032 ;;; - it handles special cases: the last and zero rounds
3033 ;;; Uses NROUNDS macro defined at the top of the file to check the last round
3034 %macro AESROUND4x128 25
3035 %define %%L0B03 %1 ; [in/out] lane 0, blocks 0 to 3
3036 %define %%L0B47 %2 ; [in/out] lane 0, blocks 4 to 7
3037 %define %%L1B03 %3 ; [in/out] lane 1, blocks 0 to 3
3038 %define %%L1B47 %4 ; ...
3039 %define %%L2B03 %5
3040 %define %%L2B47 %6
3041 %define %%L3B03 %7 ; ...
3042 %define %%L3B47 %8 ; [in/out] lane 3, blocks 4 to 7
3043 %define %%TMP0 %9
3044 %define %%TMP1 %10
3045 %define %%TMP2 %11
3046 %define %%TMP3 %12
3047 %define %%KP0 %13 ; [in] expanded key pointer lane 0
3048 %define %%KP1 %14 ; [in] expanded key pointer lane 1
3049 %define %%KP2 %15 ; [in] expanded key pointer lane 2
3050 %define %%KP3 %16 ; [in] expanded key pointer lane 3
3051 %define %%ROUND %17 ; [in] round number
3052 %define %%D0L %18 ; [in] plain/cipher text blocks 0-3 lane 0 - NEEDED FOR THE LAST ROUND ONLY (CAN BE EMPTY OTHERWISE)
3053 %define %%D0H %19 ; [in] plain/cipher text blocks 4-7 lane 0
3054 %define %%D1L %20 ; [in] plain/cipher text blocks 0-3 lane 1
3055 %define %%D1H %21 ; ...
3056 %define %%D2L %22
3057 %define %%D2H %23
3058 %define %%D3L %24 ; ...
3059 %define %%D3H %25 ; [in] plain/cipher text blocks 4-7 lane 3
3060
3061 vbroadcastf64x2 %%TMP0, [%%KP0 + 16*(%%ROUND)]
3062 vbroadcastf64x2 %%TMP1, [%%KP1 + 16*(%%ROUND)]
3063 vbroadcastf64x2 %%TMP2, [%%KP2 + 16*(%%ROUND)]
3064 vbroadcastf64x2 %%TMP3, [%%KP3 + 16*(%%ROUND)]
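        ;; each lane's round key is broadcast to all four 128-bit positions of a
        ;; zmm register, so one vaesenc/vpxorq below processes 4 blocks of that lane.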
3065 %if %%ROUND < 1
3066 ;; round 0
3067 vpxorq %%L0B03, %%L0B03, %%TMP0
3068 vpxorq %%L0B47, %%L0B47, %%TMP0
3069 vpxorq %%L1B03, %%L1B03, %%TMP1
3070 vpxorq %%L1B47, %%L1B47, %%TMP1
3071 vpxorq %%L2B03, %%L2B03, %%TMP2
3072 vpxorq %%L2B47, %%L2B47, %%TMP2
3073 vpxorq %%L3B03, %%L3B03, %%TMP3
3074 vpxorq %%L3B47, %%L3B47, %%TMP3
3075 %else
3076 %if %%ROUND <= NROUNDS
3077 ;; rounds 1 to 9/11/13
3078 vaesenc %%L0B03, %%L0B03, %%TMP0
3079 vaesenc %%L0B47, %%L0B47, %%TMP0
3080 vaesenc %%L1B03, %%L1B03, %%TMP1
3081 vaesenc %%L1B47, %%L1B47, %%TMP1
3082 vaesenc %%L2B03, %%L2B03, %%TMP2
3083 vaesenc %%L2B47, %%L2B47, %%TMP2
3084 vaesenc %%L3B03, %%L3B03, %%TMP3
3085 vaesenc %%L3B47, %%L3B47, %%TMP3
3086 %else
3087 ;; the last round - mix enclast with text xor's
3088 vaesenclast %%L0B03, %%L0B03, %%TMP0
3089 vpxorq %%L0B03, %%L0B03, %%D0L
3090 vaesenclast %%L0B47, %%L0B47, %%TMP0
3091 vpxorq %%L0B47, %%L0B47, %%D0H
3092 vaesenclast %%L1B03, %%L1B03, %%TMP1
3093 vpxorq %%L1B03, %%L1B03, %%D1L
3094 vaesenclast %%L1B47, %%L1B47, %%TMP1
3095 vpxorq %%L1B47, %%L1B47, %%D1H
3096 vaesenclast %%L2B03, %%L2B03, %%TMP2
3097 vpxorq %%L2B03, %%L2B03, %%D2L
3098 vaesenclast %%L2B47, %%L2B47, %%TMP2
3099 vpxorq %%L2B47, %%L2B47, %%D2H
3100 vaesenclast %%L3B03, %%L3B03, %%TMP3
3101 vpxorq %%L3B03, %%L3B03, %%D3L
3102 vaesenclast %%L3B47, %%L3B47, %%TMP3
3103 vpxorq %%L3B47, %%L3B47, %%D3H
3104 %endif
3105 %endif
3106 %endmacro ; AESROUND4x128
3107
3108 ;;; ===========================================================================
3109 ;;; ===========================================================================
3110 ;;; Horizontal XOR - 4 x 128bits xored together
3111 %macro VHPXORI4x128 2
3112 %define %%REG %1 ; [in/out] zmm512 4x128bits to xor; i128 on output
3113 %define %%TMP %2 ; temporary register
3114 vextracti64x4 YWORD(%%TMP), %%REG, 1
3115 vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
3116 vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
3117 vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
3118 %endmacro ; VHPXORI4x128
3119
3120 ;;; ===========================================================================
3121 ;;; ===========================================================================
3122 ;;; schoolbook multiply - 1st step
3123 %macro CLMUL_INIT 6
3124 %define %%KP %1 ; [in] key pointer
3125 %define %%HI %2 ; [in] previous blocks 4 to 7
3126 %define %%TMP %3
3127 %define %%TH %4 ; [out] tmp high
3128 %define %%TM %5 ; [out] tmp medium
3129 %define %%TL %6 ; [out] tmp low
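        ;; Every vpclmulqdq below operates on full zmm registers, i.e. four
        ;; independent 128-bit carry-less multiplications - one per ciphertext
        ;; block - against the four hash key powers stored starting at HashKey_4.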
3130 vmovdqu64 %%TMP, [%%KP + HashKey_4]
3131 vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%TH = a1*b1
3132 vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%TL = a0*b0
3133 vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%TM = a1*b0
3134 vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%TMP = a0*b1
3135 vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL]
3136 %endmacro ; CLMUL_INIT
3137
3138 ;;; ===========================================================================
3139 ;;; ===========================================================================
3140 ;;; schoolbook multiply - 2nd step
3141 %macro CLMUL_STEP 9
3142 %define %%KP %1 ; [in] key pointer
3143 %define %%HI %2 ; [out] high 128b of hash to reduce
3144 %define %%LO %3 ; [in/out] previous blocks 0 to 3; low 128b of hash to reduce
3145 %define %%TMP0 %4
3146 %define %%TMP1 %5
3147 %define %%TMP2 %6
3148 %define %%TH %7 ; [in] tmp high
3149 %define %%TM %8 ; [in] tmp medium
3150 %define %%TL %9 ; [in] tmp low
3151
3152 vmovdqu64 %%TMP0, [%%KP + HashKey_8]
3153 vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
3154 vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
3155 vpxorq %%TH, %%TH, %%TMP2
3156 vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
3157 vpxorq %%TL, %%TL, %%TMP2
3158 vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
3159 vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
3160
3161 ;; finish multiplications
3162 vpsrldq %%TMP2, %%TM, 8
3163 vpxorq %%HI, %%TH, %%TMP2
3164 vpslldq %%TMP2, %%TM, 8
3165 vpxorq %%LO, %%TL, %%TMP2
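        ;; %%TM holds the middle (cross) terms: its upper 8 bytes belong to the high
        ;; half of the 256-bit product and its lower 8 bytes to the low half, hence
        ;; the 8-byte shifts before XORing into %%HI and %%LO.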
3166
3167 ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... + ((X1+Y0)*H8)]
3168 ;; note: (X1+Y0) handled elsewhere
3169 VHPXORI4x128 %%HI, %%TMP2
3170 VHPXORI4x128 %%LO, %%TMP1
3171 ;; HIx holds top 128 bits
3172 ;; LOx holds low 128 bits
3173 ;; - further reductions to follow
3174 %endmacro ; CLMUL_STEP
3175
3176 ;;; ===========================================================================
3177 ;;; ===========================================================================
3178 ;;; Encrypt the initial 8 blocks from 4 lanes and apply ghash on the ciphertext
3179 %macro INITIAL_BLOCKS_x4 33
3180 %define %%IN %1 ; pointer to array of pointers to input text
3181 %define %%OUT %2 ; pointer to array of pointers to output text
3182 %define %%KEYP0 %3 ; pointer to expanded keys, lane 0
3183 %define %%KEYP1 %4 ; pointer to expanded keys, lane 1
3184 %define %%KEYP2 %5 ; pointer to expanded keys, lane 2
3185 %define %%KEYP3 %6 ; pointer to expanded keys, lane 3
3186 %define %%TPTR0 %7 ; temporary GP register
3187 %define %%TPTR1 %8 ; temporary GP register
3188 %define %%TPTR2 %9 ; temporary GP register
3189 %define %%TPTR3 %10 ; temporary GP register
3190 %define %%L0B03 %11 ; [out] cipher text blocks 0 to 3, lane 0
3191 %define %%L0B47 %12 ; [out] cipher text blocks 4 to 7, lane 0
3192 %define %%L1B03 %13 ; [out] cipher text blocks 0 to 3, lane 1
3193 %define %%L1B47 %14 ; ...
3194 %define %%L2B03 %15
3195 %define %%L2B47 %16
3196 %define %%L3B03 %17 ; ...
3197 %define %%L3B47 %18 ; [out] cipher text blocks 4 to 7, lane 3
3198 %define %%GHASH %19 ; [in] AAD lane 0, 1, 2 and 3
3199 %define %%T0 %20 ; temporary AVX512 register
3200 %define %%T1 %21 ; temporary AVX512 register
3201 %define %%T2 %22 ; temporary AVX512 register
3202 %define %%T3 %23 ; temporary AVX512 register
3203 %define %%T4 %24 ; temporary AVX512 register
3204 %define %%T5 %25 ; temporary AVX512 register
3205 %define %%T6 %26 ; temporary AVX512 register
3206 %define %%T7 %27 ; temporary AVX512 register
3207 %define %%T8 %28 ; temporary AVX512 register
3208 %define %%T9 %29 ; temporary AVX512 register
3209 %define %%T10 %30 ; temporary AVX512 register
3210 %define %%T11 %31 ; temporary AVX512 register
3211 %define %%ZMM_SHFMASK %32 ; [in] shuffle mask changing byte order in 4 128bit words
3212 %define %%ENC_DEC %33 ; [in] ENC (encrypt) or DEC (decrypt) selector
3213
3214 %define %%INP0 %%TPTR0
3215 %define %%INP1 %%TPTR1
3216 %define %%INP2 %%TPTR2
3217 %define %%INP3 %%TPTR3
3218
3219 %define %%OUTP0 %%TPTR0
3220 %define %%OUTP1 %%TPTR1
3221 %define %%OUTP2 %%TPTR2
3222 %define %%OUTP3 %%TPTR3
3223
3224 ;; load data in
3225 mov %%INP0, [%%IN + 8*0]
3226 mov %%INP1, [%%IN + 8*1]
3227 mov %%INP2, [%%IN + 8*2]
3228 mov %%INP3, [%%IN + 8*3]
3229
3230 VX512LDR %%T4, [%%INP0 + (16*0)]
3231 VX512LDR %%T5, [%%INP0 + (16*4)]
3232 VX512LDR %%T6, [%%INP1 + (16*0)]
3233 VX512LDR %%T7, [%%INP1 + (16*4)]
3234 VX512LDR %%T8, [%%INP2 + (16*0)]
3235 VX512LDR %%T9, [%%INP2 + (16*4)]
3236 VX512LDR %%T10,[%%INP3 + (16*0)]
3237 VX512LDR %%T11,[%%INP3 + (16*4)]
3238
3239 ;; shuffle the IV/counter blocks (16Byte swap)
3240 vpshufb %%L0B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3241 vpshufb %%L0B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3242 vpshufb %%L1B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3243 vpshufb %%L1B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3244 vpshufb %%L2B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3245 vpshufb %%L2B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3246 vpshufb %%L3B03, %%ZMM_SHFMASK ; perform a 16Byte swap
3247 vpshufb %%L3B47, %%ZMM_SHFMASK ; perform a 16Byte swap
3248
3249 ;; move to AES encryption rounds
3250 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3251 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3252 %%T0, %%T1, %%T2, %%T3, \
3253 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 0, \
3254 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3255
3256 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3257 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3258 %%T0, %%T1, %%T2, %%T3, \
3259 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 1, \
3260 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3261
3262 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3263 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3264 %%T0, %%T1, %%T2, %%T3, \
3265 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 2, \
3266 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3267
3268 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3269 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3270 %%T0, %%T1, %%T2, %%T3, \
3271 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 3, \
3272 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3273
3274 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3275 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3276 %%T0, %%T1, %%T2, %%T3, \
3277 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 4, \
3278 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3279
3280 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3281 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3282 %%T0, %%T1, %%T2, %%T3, \
3283 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 5, \
3284 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3285
3286 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3287 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3288 %%T0, %%T1, %%T2, %%T3, \
3289 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 6, \
3290 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3291
3292 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3293 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3294 %%T0, %%T1, %%T2, %%T3, \
3295 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 7, \
3296 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3297
3298 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3299 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3300 %%T0, %%T1, %%T2, %%T3, \
3301 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 8, \
3302 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3303
3304 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3305 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3306 %%T0, %%T1, %%T2, %%T3, \
3307 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 9, \
3308 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3309
3310 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3311 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3312 %%T0, %%T1, %%T2, %%T3, \
3313 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 10, \
3314 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3315
3316 %ifndef GCM128_MODE
3317 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3318 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3319 %%T0, %%T1, %%T2, %%T3, \
3320 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 11, \
3321 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3322
3323 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3324 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3325 %%T0, %%T1, %%T2, %%T3, \
3326 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 12, \
3327 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3328
3329 %ifdef GCM256_MODE
3330 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3331 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3332 %%T0, %%T1, %%T2, %%T3, \
3333 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 13, \
3334 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3335
3336 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3337 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3338 %%T0, %%T1, %%T2, %%T3, \
3339 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 14, \
3340 %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
3341 %endif
3342 %endif
3343
3344 ;; store
3345 mov %%OUTP0, [%%OUT + 8*0]
3346 mov %%OUTP1, [%%OUT + 8*1]
3347 mov %%OUTP2, [%%OUT + 8*2]
3348 mov %%OUTP3, [%%OUT + 8*3]
3349
3350 VX512STR [%%OUTP0 + (16*0)], %%L0B03
3351 VX512STR [%%OUTP0 + (16*4)], %%L0B47
3352 VX512STR [%%OUTP1 + (16*0)], %%L1B03
3353 VX512STR [%%OUTP1 + (16*4)], %%L1B47
3354 VX512STR [%%OUTP2 + (16*0)], %%L2B03
3355 VX512STR [%%OUTP2 + (16*4)], %%L2B47
3356 VX512STR [%%OUTP3 + (16*0)], %%L3B03
3357 VX512STR [%%OUTP3 + (16*4)], %%L3B47
3358
3359 %ifidn %%ENC_DEC, DEC
3360 ;; decryption - cipher text needs to go to GHASH phase
3361 vpshufb %%L0B03, %%T4, %%ZMM_SHFMASK
3362 vpshufb %%L0B47, %%T5, %%ZMM_SHFMASK
3363 vpshufb %%L1B03, %%T6, %%ZMM_SHFMASK
3364 vpshufb %%L1B47, %%T7, %%ZMM_SHFMASK
3365 vpshufb %%L2B03, %%T8, %%ZMM_SHFMASK
3366 vpshufb %%L2B47, %%T9, %%ZMM_SHFMASK
3367 vpshufb %%L3B03, %%T10, %%ZMM_SHFMASK
3368 vpshufb %%L3B47, %%T11, %%ZMM_SHFMASK
3369 %else
3370 ;; encryption
3371 vpshufb %%L0B03, %%L0B03, %%ZMM_SHFMASK
3372 vpshufb %%L0B47, %%L0B47, %%ZMM_SHFMASK
3373 vpshufb %%L1B03, %%L1B03, %%ZMM_SHFMASK
3374 vpshufb %%L1B47, %%L1B47, %%ZMM_SHFMASK
3375 vpshufb %%L2B03, %%L2B03, %%ZMM_SHFMASK
3376 vpshufb %%L2B47, %%L2B47, %%ZMM_SHFMASK
3377 vpshufb %%L3B03, %%L3B03, %%ZMM_SHFMASK
3378 vpshufb %%L3B47, %%L3B47, %%ZMM_SHFMASK
3379 %endif
3380
3381 ;; xor encrypted block 0 with GHASH for the next GHASH round
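        ;; This provides the (X1+Y0) term noted in CLMUL_STEP: each lane's current
        ;; GHASH value is folded into that lane's first ciphertext block before the
        ;; next 8-block GHASH pass.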
3382 vmovdqa64 XWORD(%%T1), XWORD(%%GHASH)
3383 vextracti32x4 XWORD(%%T2), %%GHASH, 1
3384 vextracti32x4 XWORD(%%T3), %%GHASH, 2
3385 vextracti32x4 XWORD(%%T4), %%GHASH, 3
3386
3387 vpxorq %%L0B03, %%T1
3388 vpxorq %%L1B03, %%T2
3389 vpxorq %%L2B03, %%T3
3390 vpxorq %%L3B03, %%T4
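        ;; Illustrative note (a sketch, not part of the original flow description):
        ;; folding the running hash into block 0 of every lane lets the next
        ;; GHASH pass treat all 8 blocks uniformly, i.e. it effectively computes
        ;;   new_hash = (old_hash xor C0)*H^8 xor C1*H^7 xor ... xor C7*H^1
        ;; using the precomputed powers of the hash key.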
3391 %endmacro ;INITIAL_BLOCKS_x4
3392
3393 ;;; ===========================================================================
3394 ;;; ===========================================================================
3395 ;;; Encrypt 8 blocks at a time on 4 lanes
3396 ;;; GHASH the 8 previously encrypted ciphertext blocks (4 lanes)
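;;; Rough interleave sketch (illustrative only; not the exact instruction order):
;;;   for each AES round r = 0 .. NROUNDS:
;;;       AESROUND4x128 ............ one AES round over 4 lanes x 8 counter blocks
;;;       CLMUL_INIT / CLMUL_STEP .. one slice of GHASH over one lane's previous
;;;                                  8 ciphertext blocks, interleaved to hide the
;;;                                  vpclmulqdq and vaesenc latencies
;;;   then accumulate the 4 per-lane products and run the two-phase reduction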
3397 %macro GHASH_8_ENCRYPT_8_PARALLEL_x4 44
3398 %define %%IN %1 ; pointer to array of pointers to plain/cipher text
3399 %define %%OUT %2 ; pointer to array of pointers to cipher/plain text
3400 %define %%KEYP0 %3 ; pointer to expanded keys, lane 0
3401 %define %%KEYP1 %4 ; pointer to expanded keys, lane 1
3402 %define %%KEYP2 %5 ; pointer to expanded keys, lane 2
3403 %define %%KEYP3 %6 ; pointer to expanded keys, lane 3
3404 %define %%TPTR0 %7 ; temporary GP register (used as pointer)
3405 %define %%TPTR1 %8 ; temporary GP register (used as pointer)
3406 %define %%TPTR2 %9 ; temporary GP register (used as pointer)
3407 %define %%TPTR3 %10 ; temporary GP register (used as pointer)
3408 %define %%DATA_OFFSET %11 ; current data offset (used with text loads and stores)
3409 %define %%CTRL0 %12 ; counter blocks 4 to 7 for lane 0
3410 %define %%CTRL1 %13 ; counter blocks 4 to 7 for lane 1
3411 %define %%CTRL2 %14 ; counter blocks 4 to 7 for lane 2
3412 %define %%CTRL3 %15 ; counter blocks 4 to 7 for lane 3
3413 %define %%L0B03 %16 ; lane 0 blocks 0 to 3
3414 %define %%L0B47 %17 ; lane 0 blocks 4 to 7
3415 %define %%L1B03 %18 ; lane 1 blocks 0 to 3
3416 %define %%L1B47 %19 ; lane 1 blocks 4 to 7
3417 %define %%L2B03 %20 ; lane 2 blocks 0 to 3
3418 %define %%L2B47 %21 ; lane 2 blocks 4 to 7
3419 %define %%L3B03 %22 ; lane 3 blocks 0 to 3
3420 %define %%L3B47 %23 ; lane 3 blocks 4 to 7
3421 %define %%GHASH %24 ; [in/out] GHASH for 4 lanes
3422 %define %%T0 %25
3423 %define %%T1 %26
3424 %define %%T2 %27
3425 %define %%T3 %28
3426 %define %%T4 %29
3427 %define %%T5 %30
3428 %define %%T6 %31
3429 %define %%T7 %32
3430 %define %%T8 %33
3431 %define %%T9 %34
3432 %define %%PREVLO0 %35 ; [in] 4 lanes x 8 blocks of cipher text for GHASH
3433 %define %%PREVHI0 %36
3434 %define %%PREVLO1 %37
3435 %define %%PREVHI1 %38
3436 %define %%PREVLO2 %39
3437 %define %%PREVHI2 %40
3438 %define %%PREVLO3 %41
3439 %define %%PREVHI3 %42
3440 %define %%ZMM_SHFMASK %43 ; [in] byte swap shuffle mask for 128 bits
3441 %define %%ENC_DEC %44 ; [in] ENC (encryption) or DEC (decryption)
3442
3443 ;;; ============================================================================
3444 ;;; a few virtual register mappings
3445 %define %%INP0 %%TPTR0
3446 %define %%INP1 %%TPTR1
3447 %define %%INP2 %%TPTR2
3448 %define %%INP3 %%TPTR3
3449
3450 %define %%OUTP0 %%TPTR0
3451 %define %%OUTP1 %%TPTR1
3452 %define %%OUTP2 %%TPTR2
3453 %define %%OUTP3 %%TPTR3
3454
3455 %define %%TH %%T5
3456 %define %%TM %%T6
3457 %define %%TL %%T7
3458
3459 %define %%TEXTL0B03 %%T8
3460 %define %%TEXTL0B47 %%T9
3461 %define %%TEXTL1B03 %%PREVLO1 ; GHASH needs to be complete before using these
3462 %define %%TEXTL1B47 %%PREVHI1
3463 %define %%TEXTL2B03 %%PREVLO2
3464 %define %%TEXTL2B47 %%PREVHI2
3465 %define %%TEXTL3B03 %%PREVLO3
3466 %define %%TEXTL3B47 %%PREVHI3
3467 ;;; ============================================================================
3468
3469 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3470 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3471 %%T0, %%T1, %%T2, %%T3, \
3472 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 0, \
3473 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3474 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3475
3476 mov %%INP0, [%%IN + 8*0]
3477 mov %%INP1, [%%IN + 8*1]
3478 mov %%INP2, [%%IN + 8*2]
3479 mov %%INP3, [%%IN + 8*3]
3480
3481 ;; =====================================================================
3482 CLMUL_INIT %%KEYP0, %%PREVHI0, %%T4, %%TH, %%TM, %%TL
3483 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3484 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3485 %%T0, %%T1, %%T2, %%T3, \
3486 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 1, \
3487 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3488 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3489
3490 CLMUL_STEP %%KEYP0, %%PREVHI0, %%PREVLO0, %%T4, %%T8, %%T9, %%TH, %%TM, %%TL
3491
3492 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3493 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3494 %%T0, %%T1, %%T2, %%T3, \
3495 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 2, \
3496 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3497 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3498
3499 ;; =====================================================================
3500
3501 CLMUL_INIT %%KEYP1, %%PREVHI1, %%T4, %%TH, %%TM, %%TL
3502
3503 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3504 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3505 %%T0, %%T1, %%T2, %%T3, \
3506 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 3, \
3507 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3508 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3509
3510 CLMUL_STEP %%KEYP1, %%PREVHI1, %%PREVLO1, %%T4, %%T8, %%T9, %%TH, %%TM, %%TL
3511
3512 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3513 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3514 %%T0, %%T1, %%T2, %%T3, \
3515 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 4, \
3516 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3517 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3518
3519 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3520 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO1), 1
3521 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI1), 1
3522
3523 ;; =====================================================================
3524
3525 CLMUL_INIT %%KEYP2, %%PREVHI2, %%T4, %%T5, %%T6, %%T7
3526
3527 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3528 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3529 %%T0, %%T1, %%T2, %%T3, \
3530 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 5, \
3531 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3532 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3533
3534 CLMUL_STEP %%KEYP2, %%PREVHI2, %%PREVLO2, %%T4, %%T8, %%T9, %%T5, %%T6, %%T7
3535
3536 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3537 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3538 %%T0, %%T1, %%T2, %%T3, \
3539 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 6, \
3540 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3541 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3542
3543 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3544 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO2), 2
3545 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI2), 2
3546
3547 ;; =====================================================================
3548
3549 CLMUL_INIT %%KEYP3, %%PREVHI3, %%T4, %%T5, %%T6, %%T7
3550
3551 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3552 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3553 %%T0, %%T1, %%T2, %%T3, \
3554 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 7, \
3555 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3556 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3557
3558 CLMUL_STEP %%KEYP3, %%PREVHI3, %%PREVLO3, %%T4, %%T8, %%T9, %%T5, %%T6, %%T7
3559
3560 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3561 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3562 %%T0, %%T1, %%T2, %%T3, \
3563 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 8, \
3564 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3565 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3566
3567 ;; accumulate GHASH results from 4 lanes into [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3568 vinserti64x2 %%PREVLO0, XWORD(%%PREVLO3), 3
3569 vinserti64x2 %%PREVHI0, XWORD(%%PREVHI3), 3
3570
3571 ;; =====================================================================
3572 ;; load plain/cipher text
3573 ;; - this cannot be done before GHASH is complete (reuses same registers)
3574
3575 VX512LDR %%TEXTL0B03, [%%INP0 + %%DATA_OFFSET + 64*0]
3576 VX512LDR %%TEXTL0B47, [%%INP0 + %%DATA_OFFSET + 64*1]
3577 VX512LDR %%TEXTL1B03, [%%INP1 + %%DATA_OFFSET + 64*0]
3578 VX512LDR %%TEXTL1B47, [%%INP1 + %%DATA_OFFSET + 64*1]
3579 VX512LDR %%TEXTL2B03, [%%INP2 + %%DATA_OFFSET + 64*0]
3580 VX512LDR %%TEXTL2B47, [%%INP2 + %%DATA_OFFSET + 64*1]
3581 VX512LDR %%TEXTL3B03, [%%INP3 + %%DATA_OFFSET + 64*0]
3582 VX512LDR %%TEXTL3B47, [%%INP3 + %%DATA_OFFSET + 64*1]
3583
3584 mov %%OUTP0, [%%OUT + 8*0]
3585 mov %%OUTP1, [%%OUT + 8*1]
3586 mov %%OUTP2, [%%OUT + 8*2]
3587 mov %%OUTP3, [%%OUT + 8*3]
3588
3589 ;; =====================================================================
3590 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3591 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3592 %%T0, %%T1, %%T2, %%T3, \
3593 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 9, \
3594 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3595 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3596
3597 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3598 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3599 %%T0, %%T1, %%T2, %%T3, \
3600 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 10, \
3601 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3602 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3603
3604 %ifndef GCM128_MODE
3605 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3606 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3607 %%T0, %%T1, %%T2, %%T3, \
3608 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 11, \
3609 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3610 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3611 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3612 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3613 %%T0, %%T1, %%T2, %%T3, \
3614 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 12, \
3615 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3616 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3617 %ifdef GCM256_MODE
3618 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3619 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3620 %%T0, %%T1, %%T2, %%T3, \
3621 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 13, \
3622 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3623 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3624 AESROUND4x128 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3625 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3626 %%T0, %%T1, %%T2, %%T3, \
3627 %%KEYP0, %%KEYP1, %%KEYP2, %%KEYP3, 14, \
3628 %%TEXTL0B03, %%TEXTL0B47, %%TEXTL1B03, %%TEXTL1B47, \
3629 %%TEXTL2B03, %%TEXTL2B47, %%TEXTL3B03, %%TEXTL3B47
3630 %endif ; GCM256
3631 %endif ; !GCM128
3632
3633 ;; =====================================================================
3634 ;; =====================================================================
3635 ;; =====================================================================
3636
3637 ;; =====================================================================
3638         ;; first phase of the reduction (Barrett)
3639         ;; - because of bit ordering, the LSB 128-bit word is reduced rather than the MSB
3640 ;; - accumulated GHASH in [%%PREVHI0 (msb) : %%PREVLO0 (lsb)]
3641
3642 vmovdqu64 %%T3, [rel POLY2]
3643
3644 vpclmulqdq %%T4, %%T3, %%PREVLO0, 0x01
3645 vpslldq %%T4, %%T4, 8 ; shift-L 2 DWs
3646 vpxorq %%PREVLO0, %%PREVLO0, %%T4 ; first phase of the reduction complete
3647
3648 ;; =====================================================================
3649 ;; store cipher/plain text
3650
3651 VX512STR [%%OUTP0 + %%DATA_OFFSET + 64*0], %%L0B03
3652 VX512STR [%%OUTP0 + %%DATA_OFFSET + 64*1], %%L0B47
3653 VX512STR [%%OUTP1 + %%DATA_OFFSET + 64*0], %%L1B03
3654 VX512STR [%%OUTP1 + %%DATA_OFFSET + 64*1], %%L1B47
3655 VX512STR [%%OUTP2 + %%DATA_OFFSET + 64*0], %%L2B03
3656 VX512STR [%%OUTP2 + %%DATA_OFFSET + 64*1], %%L2B47
3657 VX512STR [%%OUTP3 + %%DATA_OFFSET + 64*0], %%L3B03
3658 VX512STR [%%OUTP3 + %%DATA_OFFSET + 64*1], %%L3B47
3659
3660 ;; =====================================================================
3661 ;; second phase of the reduction
3662 vpclmulqdq %%T4, %%T3, %%PREVLO0, 0x00
3663 vpsrldq %%T4, %%T4, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
3664
3665 vpclmulqdq %%GHASH, %%T3, %%PREVLO0, 0x10
3666 vpslldq %%GHASH, %%GHASH, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
3667
3668 ;; second phase of the reduction complete
3669 vpternlogq %%GHASH, %%T4, %%PREVHI0, 0x96 ; GHASH = GHASH xor T4 xor PREVHI0
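        ;; Illustrative note: the two phases above fold the low 128 bits of the
        ;; 256-bit carry-less product back into the high 128 bits, i.e. they
        ;; compute [PREVHI0 : PREVLO0] mod g(x) for the GHASH polynomial
        ;; g(x) = x^128 + x^7 + x^2 + x + 1 (bit-reflected representation,
        ;; which is why the LSB word is the one being reduced).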
3670
3671 ;; =====================================================================
3672
3673 ;; prepare cipher blocks for the next GHASH round
3674 %ifidn %%ENC_DEC, ENC
3675 vpshufb %%L0B03, %%L0B03, %%ZMM_SHFMASK
3676 vpshufb %%L0B47, %%L0B47, %%ZMM_SHFMASK
3677 vpshufb %%L1B03, %%L1B03, %%ZMM_SHFMASK
3678 vpshufb %%L1B47, %%L1B47, %%ZMM_SHFMASK
3679 vpshufb %%L2B03, %%L2B03, %%ZMM_SHFMASK
3680 vpshufb %%L2B47, %%L2B47, %%ZMM_SHFMASK
3681 vpshufb %%L3B03, %%L3B03, %%ZMM_SHFMASK
3682 vpshufb %%L3B47, %%L3B47, %%ZMM_SHFMASK
3683 %else
3684 ;; GHASH is computed over cipher text (use text)
3685 vpshufb %%L0B03, %%TEXTL0B03, %%ZMM_SHFMASK
3686 vpshufb %%L0B47, %%TEXTL0B47, %%ZMM_SHFMASK
3687 vpshufb %%L1B03, %%TEXTL1B03, %%ZMM_SHFMASK
3688 vpshufb %%L1B47, %%TEXTL1B47, %%ZMM_SHFMASK
3689 vpshufb %%L2B03, %%TEXTL2B03, %%ZMM_SHFMASK
3690 vpshufb %%L2B47, %%TEXTL2B47, %%ZMM_SHFMASK
3691 vpshufb %%L3B03, %%TEXTL3B03, %%ZMM_SHFMASK
3692 vpshufb %%L3B47, %%TEXTL3B47, %%ZMM_SHFMASK
3693 %endif
3694
3695 ;; xor encrypted block 0 with GHASH for the next round
3696 vmovdqa64 XWORD(%%T1), XWORD(%%GHASH)
3697 vextracti32x4 XWORD(%%T2), %%GHASH, 1
3698 vextracti32x4 XWORD(%%T3), %%GHASH, 2
3699 vextracti32x4 XWORD(%%T4), %%GHASH, 3
3700
3701 vpxorq %%L0B03, %%T1
3702 vpxorq %%L1B03, %%T2
3703 vpxorq %%L2B03, %%T3
3704 vpxorq %%L3B03, %%T4
3705 %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL_x4
3706
3707 ;;; ===========================================================================
3708 ;;; ===========================================================================
3709 ;;; GHASH the last 8 ciphertext blocks on 4 lanes
3710 %macro GHASH_LAST_8x4 25
3711 %define %%KEYP0 %1 ; [in] pointer to expanded keys, lane 0
3712 %define %%KEYP1 %2 ; [in] pointer to expanded keys, lane 1
3713 %define %%KEYP2 %3 ; [in] pointer to expanded keys, lane 2
3714 %define %%KEYP3 %4 ; [in] pointer to expanded keys, lane 3
3715 %define %%L0B03         %5      ; [in] clobbered, cipher text, lane 0, blocks 0 to 3 (Y0 already XOR'ed on X1)
3716 %define %%L0B47         %6      ; [in] clobbered, cipher text, lane 0, blocks 4 to 7
3717 %define %%L1B03 %7 ; ...
3718 %define %%L1B47 %8
3719 %define %%L2B03 %9
3720 %define %%L2B47 %10
3721 %define %%L3B03 %11 ; ...
3722 %define %%L3B47         %12     ; [in] clobbered, cipher text, lane 3, blocks 4 to 7
3723 %define %%GHASH %13 ; [out] ghash output
3724 %define %%T1 %14
3725 %define %%T2 %15
3726 %define %%T3 %16
3727 %define %%T4 %17
3728 %define %%T5 %18
3729 %define %%T6 %19
3730 %define %%T7 %20
3731 %define %%T8 %21
3732 %define %%T9 %22
3733 %define %%T10 %23
3734 %define %%T11 %24
3735 %define %%T12 %25
3736
3737 %define %%TH %%T5
3738 %define %%TM %%T6
3739 %define %%TL %%T7
3740
3741 %define %%L %%T1
3742 %define %%H %%T2
3743
3744 ;; =====================================================================
3745 ;; lane 0, 8 blocks
3746
3747 CLMUL_INIT %%KEYP0, %%L0B47, %%T4, %%TH, %%TM, %%TL
3748 CLMUL_STEP %%KEYP0, %%L0B47, %%L0B03, \
3749 %%T4, %%T8, %%T9, \
3750 %%TH, %%TM, %%TL
3751
3752 vmovdqa64 XWORD(%%L), XWORD(%%L0B03)
3753 vmovdqa64 XWORD(%%H), XWORD(%%L0B47)
3754
3755 ;; =====================================================================
3756 ;; lane 1, 8 blocks
3757
3758 CLMUL_INIT %%KEYP1, %%L1B47, %%T4, %%TH, %%TM, %%TL
3759 CLMUL_STEP %%KEYP1, %%L1B47, %%L1B03, \
3760 %%T4, %%T8, %%T9, \
3761 %%TH, %%TM, %%TL
3762
3763 vinserti64x2 %%L, XWORD(%%L1B03), 1
3764 vinserti64x2 %%H, XWORD(%%L1B47), 1
3765
3766 ;; =====================================================================
3767 ;; lane 2, 8 blocks
3768
3769 CLMUL_INIT %%KEYP2, %%L2B47, %%T4, %%TH, %%TM, %%TL
3770 CLMUL_STEP %%KEYP2, %%L2B47, %%L2B03, \
3771 %%T4, %%T8, %%T9, \
3772 %%TH, %%TM, %%TL
3773
3774 vinserti64x2 %%L, XWORD(%%L2B03), 2
3775 vinserti64x2 %%H, XWORD(%%L2B47), 2
3776
3777 ;; =====================================================================
3778 ;; lane 3, 8 blocks
3779
3780 CLMUL_INIT %%KEYP3, %%L3B47, %%T4, %%TH, %%TM, %%TL
3781 CLMUL_STEP %%KEYP3, %%L3B47, %%L3B03, \
3782 %%T4, %%T8, %%T9, \
3783 %%TH, %%TM, %%TL
3784
3785 vinserti64x2 %%L, XWORD(%%L3B03), 3
3786 vinserti64x2 %%H, XWORD(%%L3B47), 3
3787
3788 ;; =====================================================================
3789 ;; =====================================================================
3790 ;; first phase of the reduction <H(hi):L(low)>
3791         ;; - reducing L, rather than H, due to bit ordering
3792
3793 vmovdqu64 %%T3, [rel POLY2]
3794
3795 vpclmulqdq %%T4, %%T3, %%L, 0x01
3796 vpslldq %%T4, %%T4, 8 ; shift-L xmm2 2 DWs
3797
3798 vpxorq %%L, %%L, %%T4 ; first phase of the reduction complete
3799
3800 ;; =====================================================================
3801 ;; second phase of the reduction
3802 vpclmulqdq %%T4, %%T3, %%L, 0x00
3803 vpsrldq %%T4, %%T4, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
3804
3805 vpclmulqdq %%GHASH, %%T3, %%L, 0x10
3806 vpslldq %%GHASH, %%GHASH, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
3807
3808 ;; second phase of the reduction complete
3809 vpternlogq %%GHASH, %%T4, %%H, 0x96 ; GHASH = GHASH xor T4 xor H
3810 ;; =====================================================================
3811 %endmacro
3812
3813 ;;; ===========================================================================
3814 ;;; ===========================================================================
3815 ;;; GCM_ENC_DEC_4x128 Encodes/Decodes given data
3816 ;;; - 4 lanes, 8 blocks at a time (hence 4x128 bytes or 4x8 blocks)
3817 ;;; - assumes that the passed gcm_context_data struct has been initialized by GCM_INIT
3818 ;;; - requires the input data to be a multiple of 128 bytes
3819 ;;; Input: gcm_key_data struct *GDATA_KEY[4]
3820 ;;; gcm_context_data *GDATA_CTX[4]
3821 ;;; input text PLAIN_CYPH_IN[4]
3822 ;;; input text length (PLAIN_CYPH_LEN) and
3823 ;;; whether encoding or decoding (ENC_DEC).
3824 ;;; Output: A cipher of the given plain text CYPH_PLAIN_OUT[4]
3825 ;;; updated GDATA_CTX[4]
3826 ;;; Linux clobbers: rax, rbx, rcx, rdx, rbp, r8-r15, zmm0-zmm31
3827 ;;; Windows clobbers: rax, rbx, rdi, rsi, rbp, r8-r15, zmm0-zmm31
3828 ;;; ===========================================================================
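;;; Processing sketch (illustrative, assuming PLAIN_CYPH_LEN = N*128 bytes):
;;;   cipher the first 8 blocks per lane            (INITIAL_BLOCKS_x4)
;;;   repeat for the remaining N-1 chunks:
;;;       GHASH the previous 8 ciphertext blocks per lane while ciphering
;;;       the next 8 blocks                         (GHASH_8_ENCRYPT_8_PARALLEL_x4)
;;;   GHASH the final 8 ciphertext blocks per lane  (GHASH_LAST_8x4)
;;;   write updated counters, hashes, pointers and lengths back to the state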
3829 %macro GCM_ENC_DEC_4x128 3
3830 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded keys
3831 %define %%PLAIN_CYPH_LEN %2 ; [in] length of the text to process (multiple of 128 bytes)
3832 %define %%ENC_DEC %3 ; [in] ENC (encrypt) or DEC (decrypt) selector
3833
3834 %define %%GDATA_KEY %%STATE + _gcm_args_keys
3835 %define %%GDATA_CTX %%STATE + _gcm_args_ctx
3836 %define %%CYPH_PLAIN_OUT %%STATE + _gcm_args_out
3837 %define %%PLAIN_CYPH_IN %%STATE + _gcm_args_in
3838
3839 %define %%LEN_REG %%PLAIN_CYPH_LEN
3840 %define %%DATA_OFFSET           r14 ;; @note: on windows this reg is used to retrieve stack args
3841
3842 ;;; ===========================================================================
3843 ;;; register mappings within the macro
3844
3845 %define %%TPTR0 r9
3846 %define %%TPTR1 r10
3847 %define %%TPTR2 r11
3848 %define %%TPTR3 r12
3849
3850 %define %%GPR0 rax
3851 %define %%GPR1 rbx
3852 %define %%GPR2 rbp
3853 %define %%GPR3 r15
3854
3855 %ifidn __OUTPUT_FORMAT__, win64
3856 %define %%KPTR0 r8
3857 %define %%KPTR1 r13
3858 %define %%KPTR2 rdi
3859 %define %%KPTR3 rsi
3860 %else
3861 %define %%KPTR0 rdx
3862 %define %%KPTR1 rcx
3863 %define %%KPTR2 r8
3864 %define %%KPTR3 r13
3865 %endif
3866
3867 %define %%L0B03 zmm0
3868 %define %%L0B47 zmm1
3869 %define %%L1B03 zmm2
3870 %define %%L1B47 zmm3
3871 %define %%L2B03 zmm4
3872 %define %%L2B47 zmm5
3873 %define %%L3B03 zmm6
3874 %define %%L3B47 zmm7
3875
3876 %define %%T1 zmm8
3877 %define %%T2 zmm9
3878 %define %%T3 zmm10
3879 %define %%T4 zmm11
3880 %define %%T5 zmm12
3881 %define %%T6 zmm13
3882 %define %%T7 zmm14
3883 %define %%T8 zmm15
3884 %define %%T9 zmm16
3885 %define %%T10 zmm17
3886 %define %%T11 zmm18
3887 %define %%T12 zmm19
3888 %define %%T13 zmm20
3889 %define %%T14 zmm21
3890 %define %%T15 zmm22
3891 %define %%T16 zmm23
3892 %define %%T17 zmm24
3893 %define %%T18 zmm25
3894
3895 %define %%GHASH zmm26
3896
3897 %define %%CTRL0 zmm27
3898 %define %%CTRL1 zmm28
3899 %define %%CTRL2 zmm29
3900 %define %%CTRL3 zmm30
3901
3902 %define %%ZMM_SHUF_MASK zmm31
3903
3904 ;;; ===========================================================================
3905 ;;; virtual register mappings
3906
3907 %define %%PREVLO0 %%T11 ; 4 lanes x 8 blocks of cipher text for GHASH
3908 %define %%PREVHI0 %%T12
3909 %define %%PREVLO1 %%T13
3910 %define %%PREVHI1 %%T14
3911 %define %%PREVLO2 %%T15
3912 %define %%PREVHI2 %%T16
3913 %define %%PREVLO3 %%T17
3914 %define %%PREVHI3 %%T18
3915
3916 ;;; ===========================================================================
3917
3918 or %%LEN_REG, %%LEN_REG
3919 jz %%_enc_dec_done_x4
3920
3921 mov %%DATA_OFFSET, 128
3922
3923 ;; load GCM CTX pointers for 4 lanes
3924 mov %%TPTR0, [%%GDATA_CTX + (0*8)]
3925 mov %%TPTR1, [%%GDATA_CTX + (1*8)]
3926 mov %%TPTR2, [%%GDATA_CTX + (2*8)]
3927 mov %%TPTR3, [%%GDATA_CTX + (3*8)]
3928
3929 ;; load common constants used in the code
3930 vmovdqa64 %%ZMM_SHUF_MASK, [rel SHUF_MASK]
3931
3932 ;; Update length of data processed
3933 add [%%TPTR0 + InLen], %%LEN_REG
3934 add [%%TPTR1 + InLen], %%LEN_REG
3935 add [%%TPTR2 + InLen], %%LEN_REG
3936 add [%%TPTR3 + InLen], %%LEN_REG
3937
3938 ;; extract current hash values from 4 lanes
3939 vmovdqu64 XWORD(%%GHASH), [%%TPTR0 + AadHash]
3940 vinserti64x2 %%GHASH, [%%TPTR1 + AadHash], 1
3941 vinserti64x2 %%GHASH, [%%TPTR2 + AadHash], 2
3942 vinserti64x2 %%GHASH, [%%TPTR3 + AadHash], 3
3943
3944 ;; lift CTR set from initial_blocks to here
3945 vmovdqa64 %%T1, [rel ddq_add_1234]
3946 vmovdqa64 %%T2, [rel ddq_add_5678]
3947 vbroadcastf64x2 %%CTRL0, [%%TPTR0 + CurCount]
3948 vbroadcastf64x2 %%CTRL1, [%%TPTR1 + CurCount]
3949 vbroadcastf64x2 %%CTRL2, [%%TPTR2 + CurCount]
3950 vbroadcastf64x2 %%CTRL3, [%%TPTR3 + CurCount]
3951 vpaddd %%L0B03, %%CTRL0, %%T1
3952 vpaddd %%L1B03, %%CTRL1, %%T1
3953 vpaddd %%L2B03, %%CTRL2, %%T1
3954 vpaddd %%L3B03, %%CTRL3, %%T1
3955 vpaddd %%L0B47, %%CTRL0, %%T2
3956 vpaddd %%L1B47, %%CTRL1, %%T2
3957 vpaddd %%L2B47, %%CTRL2, %%T2
3958 vpaddd %%L3B47, %%CTRL3, %%T2
3959 vmovdqa64 %%CTRL0, %%L0B47
3960 vmovdqa64 %%CTRL1, %%L1B47
3961 vmovdqa64 %%CTRL2, %%L2B47
3962 vmovdqa64 %%CTRL3, %%L3B47
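        ;; Illustrative example: with CurCount = Y (little endian) in a lane,
        ;; ddq_add_1234/ddq_add_5678 give B03 = {Y+1,Y+2,Y+3,Y+4} and
        ;; B47 = {Y+5,Y+6,Y+7,Y+8}; CTRLx keeps {Y+5..Y+8} so the next
        ;; iteration only needs to add 4 and 8 again.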
3963
3964 ;; load GCM key pointers for 4 lanes
3965 mov %%KPTR0, [%%GDATA_KEY + (0*8)]
3966 mov %%KPTR1, [%%GDATA_KEY + (1*8)]
3967 mov %%KPTR2, [%%GDATA_KEY + (2*8)]
3968 mov %%KPTR3, [%%GDATA_KEY + (3*8)]
3969
3970 %%_cipher_only_x4:
3971 ;; run cipher only over the first 8 blocks
3972 INITIAL_BLOCKS_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
3973 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
3974 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
3975 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
3976 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
3977 %%GHASH, \
3978 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, \
3979 %%T9, %%T10, %%T11, %%T12, \
3980 %%ZMM_SHUF_MASK, %%ENC_DEC
3981
3982 ;; Update length
3983 sub %%LEN_REG, 128
3984 jz %%_encrypt_done_x4
3985
3986 vmovq %%GPR0, XWORD(%%CTRL0)
3987 vmovq %%GPR1, XWORD(%%CTRL1)
3988 vmovq %%GPR2, XWORD(%%CTRL2)
3989 vmovq %%GPR3, XWORD(%%CTRL3)
3990
3991 and %%GPR0, 255
3992 and %%GPR1, 255
3993 and %%GPR2, 255
3994 and %%GPR3, 255
3995
3996 ;; shuffle the counters to BE
3997 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
3998 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
3999 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4000 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
4001
4002 %%_encrypt_by_8_parallel_x4:
4003 ;; get max counter value
4004 cmp %%GPR0, %%GPR1
4005 cmova %%GPR1, %%GPR0
4006 cmp %%GPR2, %%GPR1
4007 cmova %%GPR1, %%GPR2
4008 cmp %%GPR3, %%GPR1
4009 cmova %%GPR1, %%GPR3
4010         ;; at this stage %%GPR1 holds the max 8-bit LS counter from the 4 lanes
4011
4012 ;; if max counter is above 244 then overflow will occur
4013 cmp %%GPR1, 244
4014 ja %%_encrypt_by_8_overflow_x4
4015
4016 ;; (256 - 8) because we process 8 blocks at a time
4017 ;; Max number of blocks that can be processed in a lane
4018 ;; without shuffling is (256 - 8)
4019 mov %%GPR0, (256 - 8)
4020 sub %%GPR0, %%GPR1
4021 shr %%GPR0, 3
4022         ;; GPR0 holds the number of iterations based on remaining blocks before overflow
4023
4024 ;; get number of iterations from the remaining byte length
4025 mov %%GPR1, %%LEN_REG
4026 shr %%GPR1, 7
4027
4028 ;; pick the smallest one (GPR0 will be the counter)
4029 cmp %%GPR1, %%GPR0
4030 cmovb %%GPR0, %%GPR1
4031
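        ;; Illustrative example: if the max LS counter byte across lanes is 100
        ;; then (248 - 100) >> 3 = 18 iterations could run before the byte wraps;
        ;; with e.g. 1280 bytes left (1280 >> 7 = 10) the smaller value, 10,
        ;; becomes the loop counter in GPR0.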
4032 %%_encrypt_by_8_x4:
4033 ;; copy previously encrypted blocks for GHASH
4034 vmovdqa64 %%PREVLO0, %%L0B03
4035 vmovdqa64 %%PREVHI0, %%L0B47
4036 vmovdqa64 %%PREVLO1, %%L1B03
4037 vmovdqa64 %%PREVHI1, %%L1B47
4038 vmovdqa64 %%PREVLO2, %%L2B03
4039 vmovdqa64 %%PREVHI2, %%L2B47
4040 vmovdqa64 %%PREVLO3, %%L3B03
4041 vmovdqa64 %%PREVHI3, %%L3B47
4042
4043 ;; - no byte overflow and no shuffling required
4044 vmovdqa64 %%T1, [rel ddq_addbe_4444]
4045 vmovdqa64 %%T2, [rel ddq_addbe_8888]
4046
4047 vpaddd %%L0B03, %%CTRL0, %%T1
4048 vpaddd %%L1B03, %%CTRL1, %%T1
4049 vpaddd %%L2B03, %%CTRL2, %%T1
4050 vpaddd %%L3B03, %%CTRL3, %%T1
4051 vpaddd %%L0B47, %%CTRL0, %%T2
4052 vpaddd %%L1B47, %%CTRL1, %%T2
4053 vpaddd %%L2B47, %%CTRL2, %%T2
4054 vpaddd %%L3B47, %%CTRL3, %%T2
4055
4056 vmovdqa64 %%CTRL0, %%L0B47
4057 vmovdqa64 %%CTRL1, %%L1B47
4058 vmovdqa64 %%CTRL2, %%L2B47
4059 vmovdqa64 %%CTRL3, %%L3B47
4060
4061 GHASH_8_ENCRYPT_8_PARALLEL_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
4062 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4063 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
4064 %%DATA_OFFSET, \
4065 %%CTRL0, %%CTRL1, %%CTRL2, %%CTRL3, \
4066 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4067 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4068 %%GHASH, \
4069 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, \
4070 %%T8, %%T9, %%T10, \
4071 %%PREVLO0, %%PREVHI0, %%PREVLO1, %%PREVHI1, \
4072 %%PREVLO2, %%PREVHI2, %%PREVLO3, %%PREVHI3, \
4073 %%ZMM_SHUF_MASK, %%ENC_DEC
4074 add %%DATA_OFFSET, 128
4075 sub %%LEN_REG, 128
4076 sub %%GPR0, 1
4077 jnz %%_encrypt_by_8_x4
4078
4079 %%_encrypt_by_8_overflow_x4:
4080 ;; shuffle the counters back to LE
4081 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
4082 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
4083 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4084 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
4085
4086 or %%LEN_REG, %%LEN_REG
4087 jz %%_encrypt_done_x4
4088
4089 ;; copy previously encrypted blocks for GHASH
4090 vmovdqa64 %%PREVLO0, %%L0B03
4091 vmovdqa64 %%PREVHI0, %%L0B47
4092 vmovdqa64 %%PREVLO1, %%L1B03
4093 vmovdqa64 %%PREVHI1, %%L1B47
4094 vmovdqa64 %%PREVLO2, %%L2B03
4095 vmovdqa64 %%PREVHI2, %%L2B47
4096 vmovdqa64 %%PREVLO3, %%L3B03
4097 vmovdqa64 %%PREVHI3, %%L3B47
4098
4099 ;; prepare new counter blocks in LE
4100 vmovdqa64 %%T1, [rel ddq_add_4444]
4101 vmovdqa64 %%T2, [rel ddq_add_8888]
4102 vpaddd %%L0B03, %%CTRL0, %%T1
4103 vpaddd %%L1B03, %%CTRL1, %%T1
4104 vpaddd %%L2B03, %%CTRL2, %%T1
4105 vpaddd %%L3B03, %%CTRL3, %%T1
4106 vpaddd %%L0B47, %%CTRL0, %%T2
4107 vpaddd %%L1B47, %%CTRL1, %%T2
4108 vpaddd %%L2B47, %%CTRL2, %%T2
4109 vpaddd %%L3B47, %%CTRL3, %%T2
4110
4111 ;; save the counter to GPR's for calculation of number of loops
4112 vmovq %%GPR0, XWORD(%%L0B47)
4113 vmovq %%GPR1, XWORD(%%L1B47)
4114 vmovq %%GPR2, XWORD(%%L2B47)
4115 vmovq %%GPR3, XWORD(%%L3B47)
4116
4117 and %%GPR0, 255
4118 and %%GPR1, 255
4119 and %%GPR2, 255
4120 and %%GPR3, 255
4121
4122 ;; convert counter blocks to BE
4123 vpshufb %%L0B03, %%ZMM_SHUF_MASK
4124 vpshufb %%L0B47, %%ZMM_SHUF_MASK
4125 vpshufb %%L1B03, %%ZMM_SHUF_MASK
4126 vpshufb %%L1B47, %%ZMM_SHUF_MASK
4127 vpshufb %%L2B03, %%ZMM_SHUF_MASK
4128 vpshufb %%L2B47, %%ZMM_SHUF_MASK
4129 vpshufb %%L3B03, %%ZMM_SHUF_MASK
4130 vpshufb %%L3B47, %%ZMM_SHUF_MASK
4131
4132 ;; update 4 lane CTR in BE
4133 vmovdqa64 %%CTRL0, %%L0B47
4134 vmovdqa64 %%CTRL1, %%L1B47
4135 vmovdqa64 %%CTRL2, %%L2B47
4136 vmovdqa64 %%CTRL3, %%L3B47
4137
4138 GHASH_8_ENCRYPT_8_PARALLEL_x4 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, \
4139 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4140 %%TPTR0, %%TPTR1, %%TPTR2, %%TPTR3, \
4141 %%DATA_OFFSET, \
4142 %%CTRL0, %%CTRL1, %%CTRL2, %%CTRL3, \
4143 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4144 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4145 %%GHASH, \
4146 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, \
4147 %%T8, %%T9, %%T10, \
4148 %%PREVLO0, %%PREVHI0, %%PREVLO1, %%PREVHI1, \
4149 %%PREVLO2, %%PREVHI2, %%PREVLO3, %%PREVHI3, \
4150 %%ZMM_SHUF_MASK, %%ENC_DEC
4151 add %%DATA_OFFSET, 128
4152 sub %%LEN_REG, 128
4153 jnz %%_encrypt_by_8_parallel_x4
4154
4155 ;; shuffle the counters back to LE
4156 vpshufb %%CTRL0, %%ZMM_SHUF_MASK
4157 vpshufb %%CTRL1, %%ZMM_SHUF_MASK
4158 vpshufb %%CTRL2, %%ZMM_SHUF_MASK
4159 vpshufb %%CTRL3, %%ZMM_SHUF_MASK
4160
4161 %%_encrypt_done_x4:
4162 GHASH_LAST_8x4 %%KPTR0, %%KPTR1, %%KPTR2, %%KPTR3, \
4163 %%L0B03, %%L0B47, %%L1B03, %%L1B47, \
4164 %%L2B03, %%L2B47, %%L3B03, %%L3B47, \
4165 %%GHASH, \
4166 %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, \
4167 %%T7, %%T8, %%T9, %%T10, %%T11, %%T12
4168
4169 %%_ghash_done_x4:
4170 mov %%TPTR0, [%%GDATA_CTX + (0*8)]
4171 mov %%TPTR1, [%%GDATA_CTX + (1*8)]
4172 mov %%TPTR2, [%%GDATA_CTX + (2*8)]
4173 mov %%TPTR3, [%%GDATA_CTX + (3*8)]
4174
4175 ;; save current counter blocks
4176 vextracti32x4 [%%TPTR0 + CurCount], %%CTRL0, 3
4177 vextracti32x4 [%%TPTR1 + CurCount], %%CTRL1, 3
4178 vextracti32x4 [%%TPTR2 + CurCount], %%CTRL2, 3
4179 vextracti32x4 [%%TPTR3 + CurCount], %%CTRL3, 3
4180
4181 ;; save current hash values
4182 vmovdqu64 [%%TPTR0 + AadHash], XWORD(%%GHASH)
4183 vextracti64x2 [%%TPTR1 + AadHash], %%GHASH, 1
4184 vextracti64x2 [%%TPTR2 + AadHash], %%GHASH, 2
4185 vextracti64x2 [%%TPTR3 + AadHash], %%GHASH, 3
4186
4187 ;; decrement lens
4188 ;; increment the input / output pointers
4189 ;; - output and input pointers are next to one another in the structure
4190         ;;   so all 8 pointers can be updated with a single zmm
4191 vpbroadcastq %%T1, %%DATA_OFFSET ; DATA_OFFSET should be equal to length
4192 vpaddq %%T2, %%T1, [%%CYPH_PLAIN_OUT]
4193 vmovdqu64 [%%CYPH_PLAIN_OUT], %%T2
4194 vmovdqu64 YWORD(%%T3), [%%STATE + _gcm_lens]
4195 vpsubq YWORD(%%T3), YWORD(%%T3), YWORD(%%T1)
4196 vmovdqu64 [%%STATE + _gcm_lens], YWORD(%%T3)
4197
4198 %%_enc_dec_done_x4:
4199
4200
4201 %endmacro ; GCM_ENC_DEC_4x128
4202
4203 ;;; ===========================================================================
4204 ;;; ===========================================================================
4205 ;;; GCM_COMPLETE_x4 - completes one of the MB jobs
4206 ;;; Clobbers rax, r9-r12, r14, r15 and zmm0-zmm31
4207 ;;; ===========================================================================
4208 %macro GCM_COMPLETE_x4 3
4209 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4210 %define %%IDX %2 ; [in] lane index to be completed
4211 %define %%ENC_DEC %3
4212
4213 %ifidn __OUTPUT_FORMAT__, win64
4214 %define %%GDATA_KEY rdi
4215 %define %%GDATA_CTX rsi
4216 %define %%CYPH_PLAIN_OUT r11
4217 %define %%PLAIN_CYPH_IN r9
4218 %else
4219 %define %%GDATA_KEY arg3
4220 %define %%GDATA_CTX arg4
4221 %define %%CYPH_PLAIN_OUT r8
4222 %define %%PLAIN_CYPH_IN r9
4223 %endif
4224
4225
4226 %define %%PLAIN_CYPH_LEN rbp
4227 %define %%AUTH_TAG rbp
4228 %define %%AUTH_TAGLEN rbp
4229
4230 %define %%GPR rax
4231
4232 %define %%DATA_OFFSET rbx
4233
4234 mov %%PLAIN_CYPH_LEN, [%%STATE + _gcm_lens + %%IDX*8]
4235 mov %%GDATA_KEY, [%%STATE + _gcm_args_keys + %%IDX*8]
4236 mov %%GDATA_CTX, [%%STATE + _gcm_args_ctx + %%IDX*8]
4237 mov %%PLAIN_CYPH_IN, [%%STATE + _gcm_args_in + %%IDX*8]
4238 mov %%CYPH_PLAIN_OUT, [%%STATE + _gcm_args_out + %%IDX*8]
4239
4240 vmovdqu64 xmm16, [%%GDATA_KEY + HashKey]
4241 vmovdqu64 xmm17, [%%GDATA_CTX + AadHash]
4242
4243 ;;; ===========================================================================
4244 ;;; finalize last blocks (<128 bytes)
4245
4246 ; Macro flow:
4247         ; calculate the number of 16-byte blocks in the message
4248         ; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
4249         ; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
4250         ; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
4251
4252 or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
4253 je %%_enc_dec_done_x4
4254
4255 xor %%DATA_OFFSET, %%DATA_OFFSET
4256
4257 ;; Update length of data processed
4258 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
4259
4260 vmovdqa64 xmm13, xmm16 ; load HashKey
4261 vmovdqa64 xmm8, xmm17 ; load AadHash; xmm8 is hash_in for gcm_enc_dec_small
4262 vmovdqu xmm9, [%%GDATA_CTX + CurCount]
4263
4264         ;; Save the amount of data left to process in r13
4265 mov r13, %%PLAIN_CYPH_LEN
4266
4267 ;; Determine how many blocks to process in INITIAL
4268 mov r12, r13
4269 shr r12, 4
4270 and r12, 7
4271
4272 ;; Process one additional block in INITIAL if there is a partial block
4273 mov r10, r13
4274 and r10, 0xf
4275 add r10, 0xf
4276 shr r10, 4 ; 0 - if 4LSB of length are all zero, 1 - otherwise
4277 add r12, r10 ; process an additional INITIAL block if r10 is not zero
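        ;; Illustrative example: for 100 bytes left, r12 = (100 >> 4) & 7 = 6
        ;; full blocks plus r10 = ((100 & 0xf) + 0xf) >> 4 = 1 partial block,
        ;; so 7 blocks are handled by the INITIAL phase of GCM_ENC_DEC_SMALL.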
4278
4279 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
4280 %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
4281 r13, r12, xmm9, xmm14, multi_call
4282
4283 %%_ghash_done_x4:
4284 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; current_counter = xmm9
4285 vmovdqa64 xmm17, xmm14 ; AadHash = xmm14
4286
4287 %%_enc_dec_done_x4:
4288 ;;; ===========================================================================
4289 ;;; COMPLETE
4290
4291 ;; Start AES as early as possible
4292 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
4293 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
4294
4295 ;; If the GCM function is called as a single function call rather
4296 ;; than invoking the individual parts (init, update, finalize) we
4297 ;; can remove a write to read dependency on AadHash.
4298 vmovdqa64 xmm14, xmm17 ; xmm14 = AadHash
4299 vmovdqa64 xmm13, xmm16 ; load HashKey
4300
4301 ;; Encrypt the final partial block. If we did this as a single call then
4302 ;; the partial block was handled in the main GCM_ENC_DEC macro.
4303 cmp qword [%%GDATA_CTX + PBlockLen], 0
4304 je %%_partial_done_x4
4305
4306 ;; xmm14: hash value [in/out]
4307 ;; xmm13: hash key [in]
4308 ;; xmm0, xmm10, xmm11, xmm5, xmm6 - temporary registers
4309 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
4310 vmovdqa64 xmm17, xmm14 ; AadHash = xmm14
4311
4312 %%_partial_done_x4:
4313 mov %%GPR, [%%GDATA_CTX + AadLen] ; aadLen (number of bytes)
4314 shl %%GPR, 3 ; convert into number of bits
4315 vmovd xmm15, DWORD(%%GPR) ; len(A) in xmm15
4316
4317 mov %%GPR, [%%GDATA_CTX + InLen]
4318         shl %%GPR, 3 ; len(C) in bits (*8)
4319 vmovq xmm1, %%GPR
4320 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
4321 vpor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
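        ;; Illustrative example: aadLen = 20 bytes and InLen = 256 bytes give
        ;; len(A) = 160 bits in the upper quadword and len(C) = 2048 bits in
        ;; the lower quadword of xmm15, as required by the GCM length block.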
4322
4323 ;; prep auth_tag store mask
4324 mov %%AUTH_TAGLEN, [%%STATE + _gcm_args_taglen + %%IDX*8]
4325 lea %%GPR, [rel byte_len_to_mask_table]
4326 kmovw k1, [%%GPR + %%AUTH_TAGLEN*2]
4327 mov %%AUTH_TAG, [%%STATE + _gcm_args_tag + %%IDX*8]
4328
4329 ;; XOR current hash value with the next block xmm15
4330 vpxor xmm14, xmm15
4331
4332 ;; xmm14: hash value [in/out]
4333 ;; xmm13: hash key [in]
4334 ;; xmm0, xmm10, xmm11, xmm5, xmm6 - temporary registers
4335 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
4336 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
4337
4338 vpxor xmm9, xmm9, xmm14
4339
4340 %%_return_T:
4341 vmovdqu8 [%%AUTH_TAG]{k1}, xmm9 ; store TAG
4342 vmovdqu64 [%%GDATA_CTX + AadHash], xmm17 ; store AadHash
4343
4344 ;; put the lane back on free list
4345 mov rax, [%%STATE + _gcm_unused_lanes]
4346 shl rax, 4
4347 or rax, %%IDX
4348 mov [%%STATE + _gcm_unused_lanes], rax
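        ;; Illustrative example: _gcm_unused_lanes is a nibble-packed stack;
        ;; completing lane 2 with a current list of e.g. 0xf31 pushes it back
        ;; as (0xf31 << 4) | 2 = 0xf312.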
4349
4350 ;; mark job as complete
4351 mov rax, [%%STATE + _gcm_job_in_lane + 8*%%IDX]
4352 or dword [rax + _status], STS_COMPLETED
4353 ;; clear job pointer in this lane
4354 mov qword [%%STATE + _gcm_job_in_lane + 8*%%IDX], 0
4355 ;; return finished job (rax)
4356 %%_return_T_done:
4357 %endmacro ; GCM_COMPLETE_x4
4358
4359
4360 ;;; ===========================================================================
4361 ;;; ===========================================================================
4362 ;;; GCM_FINALIZE_x4:
4363 ;;; - runs all lanes in parallel for %LEN
4364 ;;; - completes selected lane (any outstanding bytes < 128 bytes)
4365 ;;; - returns pointer of completed JOB
4366 ;;; Clobbers rax, r9-r12, r14, r15 and zmm0-zmm31
4367 ;;; ===========================================================================
4368 %macro GCM_FINALIZE_x4 4
4369 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4370 %define %%IDX %2 ; [in] lane index to be completed
4371 %define %%LEN    %3 ; [in] common length to be processed across all lanes
4372 %define %%ENC_DEC %4
4373
4374 %%_gcm_finalize_4x128:
4375 mov [rsp + 0*8], %%IDX ; save %IDX as it will get clobbered
4376 and %%LEN, -128
4377 mov arg2, %%LEN
4378 GCM_ENC_DEC_4x128 %%STATE, arg2, %%ENC_DEC
4379
4380 %%_gcm_complete_min_lane:
4381 mov arg2, [rsp + 0*8] ; restore %%IDX
4382 GCM_COMPLETE_x4 %%STATE, arg2, %%ENC_DEC
4383 %endmacro ; GCM_FINALIZE_x4
4384 ;;; ===========================================================================
4385
4386 ;;; ===========================================================================
4387 ;;; ===========================================================================
4388 ;;; GCM_FLUSH_MB:
4389 ;;; - finds the non-null lane with the minimum length
4390 ;;; - replicates that lane's data across the null lanes
4391 ;;; - returns min length lane index and length
4392 ;;; ===========================================================================
4393 %macro GCM_FLUSH_MB 3
4394 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4395 %define %%IDX %2 ; [out] lane index to be completed
4396 %define %%LEN    %3 ; [out] common length to be processed across all lanes
4397
4398 ;; put max length into null lanes
4399 vmovdqu64 ymm0, [%%STATE + _gcm_job_in_lane]
4400 vpxorq ymm1, ymm1
4401 vpcmpq k2, ymm0, ymm1, 0 ; EQ
4402
4403 kmovq rax, k2 ; k2 = mask for null lanes
4404 xor rax, 0xf
4405 kmovq k1, rax ; k1 = mask for not null lanes (~k2)
4406
4407 vmovdqu64 ymm2, [%%STATE + _gcm_lens]
4408 vbroadcastf64x2 ymm4, [rel ALL_F]
4409 vporq ymm2{k2}, ymm2, ymm4
4410
4411 ;; find min lane & index
4412 vpsllq ymm3, ymm2, 2 ;
4413 vporq ymm3, ymm3, [rel index_to_lane4]
4414 vextracti32x4 xmm2, ymm3, 1
4415 vpminuq xmm2, xmm3, xmm2
4416 vpsrldq xmm3, xmm2, 8
4417 vpminuq xmm2, xmm3, xmm2
4418 vmovq %%LEN, xmm2
4419 mov %%IDX, %%LEN
4420 and %%IDX, 3
4421 shr %%LEN, 2
4422 ;; At this stage:
4423 ;; %%LEN - min length
4424 ;; %%IDX - lane index
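        ;; Illustrative example: lane lengths {384, 256, 512, 300} are packed
        ;; as (len << 2) | lane; the horizontal vpminuq picks (256 << 2) | 1,
        ;; so %%IDX = 1 and %%LEN = 256 after the unpack above.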
4425
4426 ;; load context structure content from the non-null lane
4427 ;; it is 88 bytes long (64 + 24)
4428 ;; zmm7:ymm11
4429 mov rax, 0x7
4430 kmovq k3, rax
4431 mov r10, [%%STATE + _gcm_args_ctx + 8*%%IDX]
4432 vmovdqu64 zmm7, [r10]
4433 vmovdqu64 ymm11{k3}, [r10 + 64]
4434
4435 vmovdqu64 ymm7, [%%STATE + _gcm_args_in]
4436 vmovdqu64 ymm8, [%%STATE + _gcm_args_out]
4437 vmovdqu64 ymm9, [%%STATE + _gcm_args_keys]
4438 mov r10, [%%STATE + _gcm_args_in + 8*%%IDX]
4439 mov r11, [%%STATE + _gcm_args_out + 8*%%IDX]
4440 mov r12, [%%STATE + _gcm_args_keys + 8*%%IDX]
4441 ;; r10 = (min lane) valid in ptr
4442 ;; r11 = (min lane) valid out ptr
4443 ;; r12 = (min lane) valid keys ptr
4444
4445 ;; store valid in/out/key pointers to empty lanes
4446 vpbroadcastq ymm4, r10
4447 vpbroadcastq ymm5, r11
4448 vpbroadcastq ymm6, r12
4449
4450 vmovdqa64 ymm4{k1}, ymm7
4451 vmovdqa64 ymm5{k1}, ymm8
4452 vmovdqa64 ymm6{k1}, ymm9
4453
4454 vmovdqu64 [%%STATE + _gcm_args_in], ymm4
4455 vmovdqu64 [%%STATE + _gcm_args_out], ymm5
4456 vmovdqu64 [%%STATE + _gcm_args_keys], ymm6
4457
4458 ;; copy valid context into empty lanes
4459 kmovq rax, k2 ; null lane mask to rax
4460 test rax, 1
4461 jz %%_copy_ctx_lane1
4462 mov r10, [%%STATE + _gcm_args_ctx + 8*0]
4463 vmovdqu64 [r10], zmm7
4464 vmovdqu64 [r10 + 64]{k3}, ymm11
4465 %%_copy_ctx_lane1:
4466 test rax, 2
4467 jz %%_copy_ctx_lane2
4468 mov r10, [%%STATE + _gcm_args_ctx + 8*1]
4469 vmovdqu64 [r10], zmm7
4470 vmovdqu64 [r10 + 64]{k3}, ymm11
4471 %%_copy_ctx_lane2:
4472 test rax, 4
4473 jz %%_copy_ctx_lane3
4474 mov r10, [%%STATE + _gcm_args_ctx + 8*2]
4475 vmovdqu64 [r10], zmm7
4476 vmovdqu64 [r10 + 64]{k3}, ymm11
4477 %%_copy_ctx_lane3:
4478 test rax, 8
4479 jz %%_copy_ctx_end
4480 mov r10, [%%STATE + _gcm_args_ctx + 8*3]
4481 vmovdqu64 [r10], zmm7
4482 vmovdqu64 [r10 + 64]{k3}, ymm11
4483 %%_copy_ctx_end:
4484
4485 %endmacro ; GCM_FLUSH_MB
4486 ;;; ===========================================================================
4487
4488 ;;; ===========================================================================
4489 ;;; ===========================================================================
4490 ;;; GCM_SUBMIT_MB:
4491 ;;; - finds free lane and populates it with data from JOB
4492 ;;; - if all lanes populated then finds min common length
4493 ;;; - returns min length lane index and size
4494 ;;; ===========================================================================
4495 %macro GCM_SUBMIT_MB 4
4496 %define %%STATE %1 ; [in] pointer to an array with 4 pointers to expanded key
4497 %define %%JOB    %2 ; [in] job pointer / [out] min length lane index
4498 %define %%LEN    %3 ; [out] common length to be processed across all lanes
4499 %define %%ENC_DEC %4 ; [in] encrypt / decrypt selector
4500
4501 %define %%IDX rbp
4502 %define %%RET_IDX %%JOB
4503 %ifidn __OUTPUT_FORMAT__, win64
4504 %define %%LCTX rdi
4505 %else
4506 %define %%LCTX r8
4507 %endif
4508 ;; get free lane
4509 mov rbx, [%%STATE + _gcm_unused_lanes]
4510 mov %%IDX, rbx
4511 shr rbx, 4
4512 and %%IDX, 0xf
4513 mov [%%STATE + _gcm_unused_lanes], rbx
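        ;; Illustrative example: with a free-lane list of e.g. 0xf3210, the pop
        ;; above selects lane 0 (%%IDX = 0) and leaves 0xf321 as the new list.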
4514
4515 ;; copy job data into the lane
4516 mov [%%STATE + _gcm_job_in_lane + 8*%%IDX], %%JOB
4517
4518 mov r9, [%%JOB + _aes_enc_key_expanded]
4519 mov [%%STATE + _gcm_args_keys + 8*%%IDX], r9
4520
4521 mov rax, [%%JOB + _src]
4522 add rax, [%%JOB + _cipher_start_src_offset_in_bytes]
4523 mov [%%STATE + _gcm_args_in + 8*%%IDX], rax
4524
4525 mov rax, [%%JOB + _dst]
4526 mov [%%STATE + _gcm_args_out + 8*%%IDX], rax
4527
4528 mov rax, [%%JOB + _auth_tag_output]
4529 mov [%%STATE + _gcm_args_tag + 8*%%IDX], rax
4530
4531 mov rax, [%%JOB + _auth_tag_output_len_in_bytes]
4532 mov [%%STATE + _gcm_args_taglen + 8*%%IDX], rax
4533
4534 vpbroadcastq ymm15, [%%JOB + _msg_len_to_cipher_in_bytes]
4535
4536 lea rax, [rel index_to_lane4_mask]
4537 kmovw k2, [rax + (index_to_lane4_not_mask - index_to_lane4_mask) + %%IDX*2]
4538 kmovw k1, [rax + %%IDX*2]
4539 vmovdqu64 ymm14{k2}{z}, [%%STATE + _gcm_lens]
4540 vporq ymm14{k1}, ymm14, ymm15
4541 vmovdqu64 [%%STATE + _gcm_lens], ymm14
4542 vmovdqu64 ymm31, ymm14
4543
4544 ;; call gcm_init
4545 mov r13, [%%JOB + _iv]
4546 mov r14, [%%JOB + _gcm_aad]
4547 mov rax, [%%JOB + _gcm_aad_len]
4548 mov %%LCTX, [%%STATE + _gcm_args_ctx + 8*%%IDX]
4549
4550 ;; GDATA_KEY %1
4551 ;; GDATA_CTX %2
4552 ;; IV %3
4553 ;; A_IN %4
4554 ;; A_LEN %5
4555 ;; r10 to 12 - temporary GPR's
4556 GCM_INIT r9, %%LCTX, r13, r14, rax, r10, r11, r12
4557
4558 ;; check if all lanes populated
4559 cmp rbx, 0xf
4560 je %%_gcm_ooo_ready
4561 %%_gcm_ooo_not_ready:
4562 xor rax, rax ; return NULL
4563 jmp %%_gcm_submit_return
4564
4565 %%_gcm_ooo_ready:
4566 ;; find min lane & index
4567 vpsllq ymm2, ymm31, 2 ;
4568 vporq ymm2, ymm2, [rel index_to_lane4]
4569 vextracti32x4 xmm3, ymm2, 1
4570 vpminuq xmm2, xmm3, xmm2
4571 vpsrldq xmm3, xmm2, 8
4572 vpminuq xmm2, xmm3, xmm2
4573 vmovq %%LEN, xmm2
4574 mov %%RET_IDX, %%LEN
4575 and %%RET_IDX, 3
4576 shr %%LEN, 2
4577 ;; At this stage:
4578 ;; %%LEN - min length
4579 ;; %%RET_IDX - lane index
4580
4581 ;; finalize puts returned job into RAX
4582 ;; arg1 - state
4583 ;; arg2 - min_lane_idx
4584 ;; arg3 - min_len
4585 %%_gcm_ooo_run:
4586 GCM_FINALIZE_x4 arg1, arg2, arg3, %%ENC_DEC
4587 ;; rax = finished job pointer
4588 %%_gcm_submit_return:
4589
4590 %endmacro ; GCM_SUBMIT_MB
4591 ;;; ===========================================================================
4592
4593 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4594 ; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
4595 ; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
4596 ; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
4597 ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
4598 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4599 %macro GCM_COMPLETE 6
4600 %define %%GDATA_KEY %1
4601 %define %%GDATA_CTX %2
4602 %define %%AUTH_TAG %3
4603 %define %%AUTH_TAG_LEN %4
4604 %define %%ENC_DEC %5
4605 %define %%INSTANCE_TYPE %6
4606 %define %%PLAIN_CYPH_LEN rax
4607
4608 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
4609 ;; Start AES as early as possible
4610 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
4611 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
4612
4613 %ifidn %%INSTANCE_TYPE, multi_call
4614 ;; If the GCM function is called as a single function call rather
4615 ;; than invoking the individual parts (init, update, finalize) we
4616 ;; can remove a write to read dependency on AadHash.
4617 vmovdqu xmm14, [%%GDATA_CTX + AadHash]
4618
4619 ;; Encrypt the final partial block. If we did this as a single call then
4620 ;; the partial block was handled in the main GCM_ENC_DEC macro.
4621 mov r12, [%%GDATA_CTX + PBlockLen]
4622 cmp r12, 0
4623
4624 je %%_partial_done
4625
4626 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
4627 vmovdqu [%%GDATA_CTX + AadHash], xmm14
4628
4629 %%_partial_done:
4630
4631 %endif
4632
4633 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
4634 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
4635
4636 shl r12, 3 ; convert into number of bits
4637 vmovd xmm15, r12d ; len(A) in xmm15
4638
4639         shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
4640 vmovq xmm1, %%PLAIN_CYPH_LEN
4641 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
4642 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
4643
4644 vpxor xmm14, xmm15
4645 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
4646 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
4647
4648 vpxor xmm9, xmm9, xmm14
4649
4650
4651 %%_return_T:
4652 mov r10, %%AUTH_TAG ; r10 = authTag
4653 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
4654
4655 cmp r11, 16
4656 je %%_T_16
4657
4658 cmp r11, 12
4659 je %%_T_12
4660
4661 cmp r11, 8
4662 je %%_T_8
4663
4664 simd_store_avx r10, xmm9, r11, r12, rax
4665 jmp %%_return_T_done
4666 %%_T_8:
4667 vmovq rax, xmm9
4668 mov [r10], rax
4669 jmp %%_return_T_done
4670 %%_T_12:
4671 vmovq rax, xmm9
4672 mov [r10], rax
4673 vpsrldq xmm9, xmm9, 8
4674 vmovd eax, xmm9
4675 mov [r10 + 8], eax
4676 jmp %%_return_T_done
4677 %%_T_16:
4678 vmovdqu [r10], xmm9
4679
4680 %%_return_T_done:
4681 %endmacro ; GCM_COMPLETE
4682
4683
4684 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4685 ;void aes_gcm_precomp_128_vaes_avx512 /
4686 ; aes_gcm_precomp_192_vaes_avx512 /
4687 ; aes_gcm_precomp_256_vaes_avx512
4688 ; (struct gcm_key_data *key_data)
4689 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4690 MKGLOBAL(FN_NAME(precomp,_),function,)
4691 FN_NAME(precomp,_):
4692 push r12
4693 push r13
4694 push r14
4695 push r15
4696
4697 mov r14, rsp
4698
4699
4700
4701 sub rsp, VARIABLE_OFFSET
4702 and rsp, ~63 ; align rsp to 64 bytes
4703
4704 %ifidn __OUTPUT_FORMAT__, win64
4705 ; only xmm6 needs to be maintained
4706 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
4707 %endif
4708
4709 vpxor xmm6, xmm6
4710 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
4711
4712 vpshufb xmm6, [rel SHUF_MASK]
4713 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
4714 vmovdqa xmm2, xmm6
4715 vpsllq xmm6, xmm6, 1
4716 vpsrlq xmm2, xmm2, 63
4717 vmovdqa xmm1, xmm2
4718 vpslldq xmm2, xmm2, 8
4719 vpsrldq xmm1, xmm1, 8
4720 vpor xmm6, xmm6, xmm2
4721 ;reduction
4722 vpshufd xmm2, xmm1, 00100100b
4723 vpcmpeqd xmm2, [rel TWOONE]
4724 vpand xmm2, xmm2, [rel POLY]
4725 vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
4726 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
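        ;; Illustrative note: the block above computes H' = (H << 1) mod g(x)
        ;; for the bit-reflected GHASH field: shift H left by one bit across
        ;; the full 128-bit value and, if the bit shifted out was set, xor in
        ;; the POLY constant; PRECOMPUTE then derives further powers of H'.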
4727 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
4728
4729
4730 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
4731
4732 %ifidn __OUTPUT_FORMAT__, win64
4733 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
4734 %endif
4735 mov rsp, r14
4736
4737 pop r15
4738 pop r14
4739 pop r13
4740 pop r12
4741 ret
4742
4743
4744 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4745 ;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
4746 ; (const struct gcm_key_data *key_data,
4747 ; struct gcm_context_data *context_data,
4748 ; u8 *iv,
4749 ; const u8 *aad,
4750 ; u64 aad_len);
4751 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4752 MKGLOBAL(FN_NAME(init,_),function,)
4753 FN_NAME(init,_):
4754 push r12
4755 push r13
4756 %ifidn __OUTPUT_FORMAT__, win64
4757 push r14
4758 push r15
4759 mov r14, rsp
4760 ; xmm6:xmm15 need to be maintained for Windows
4761 sub rsp, 1*16
4762 movdqu [rsp + 0*16], xmm6
4763 %endif
4764
4765 GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12
4766
4767 %ifidn __OUTPUT_FORMAT__, win64
4768 movdqu xmm6 , [rsp + 0*16]
4769 mov rsp, r14
4770 pop r15
4771 pop r14
4772 %endif
4773 pop r13
4774 pop r12
4775 ret
4776
4777
4778 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4779 ;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
4780 ; aes_gcm_enc_256_update_vaes_avx512
4781 ; (const struct gcm_key_data *key_data,
4782 ; struct gcm_context_data *context_data,
4783 ; u8 *out,
4784 ; const u8 *in,
4785 ; u64 plaintext_len);
4786 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4787 MKGLOBAL(FN_NAME(enc,_update_),function,)
4788 FN_NAME(enc,_update_):
4789
4790 FUNC_SAVE
4791
4792 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
4793
4794 FUNC_RESTORE
4795
4796 ret
4797
4798
4799 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4800 ;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
4801 ; aes_gcm_dec_256_update_vaes_avx512
4802 ; (const struct gcm_key_data *key_data,
4803 ; struct gcm_context_data *context_data,
4804 ; u8 *out,
4805 ; const u8 *in,
4806 ; u64 plaintext_len);
4807 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4808 MKGLOBAL(FN_NAME(dec,_update_),function,)
4809 FN_NAME(dec,_update_):
4810
4811 FUNC_SAVE
4812
4813 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
4814
4815 FUNC_RESTORE
4816 ret
4817
4818 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4819 ;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
4820 ; aes_gcm_enc_256_finalize_vaes_avx512
4821 ; (const struct gcm_key_data *key_data,
4822 ; struct gcm_context_data *context_data,
4823 ; u8 *auth_tag,
4824 ; u64 auth_tag_len);
4825 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4826 MKGLOBAL(FN_NAME(enc,_finalize_),function,)
4827 FN_NAME(enc,_finalize_):
4828
4829 push r12
4830
4831 %ifidn __OUTPUT_FORMAT__, win64
4832 ; xmm6:xmm15 need to be maintained for Windows
4833 sub rsp, 5*16
4834 vmovdqu [rsp + 0*16], xmm6
4835 vmovdqu [rsp + 1*16], xmm9
4836 vmovdqu [rsp + 2*16], xmm11
4837 vmovdqu [rsp + 3*16], xmm14
4838 vmovdqu [rsp + 4*16], xmm15
4839 %endif
4840 GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
4841
4842 %ifidn __OUTPUT_FORMAT__, win64
4843 vmovdqu xmm15, [rsp + 4*16]
4844 vmovdqu xmm14, [rsp + 3*16]
4845 vmovdqu xmm11, [rsp + 2*16]
4846 vmovdqu xmm9, [rsp + 1*16]
4847 vmovdqu xmm6, [rsp + 0*16]
4848 add rsp, 5*16
4849 %endif
4850
4851 pop r12
4852 ret
4853
4854
4855 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4856 ;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512
4857 ; aes_gcm_dec_256_finalize_vaes_avx512
4858 ; (const struct gcm_key_data *key_data,
4859 ; struct gcm_context_data *context_data,
4860 ; u8 *auth_tag,
4861 ; u64 auth_tag_len);
4862 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4863 MKGLOBAL(FN_NAME(dec,_finalize_),function,)
4864 FN_NAME(dec,_finalize_):
4865
4866 push r12
4867
4868 %ifidn __OUTPUT_FORMAT__, win64
4869 ; xmm6:xmm15 need to be maintained for Windows
4870 sub rsp, 5*16
4871 vmovdqu [rsp + 0*16], xmm6
4872 vmovdqu [rsp + 1*16], xmm9
4873 vmovdqu [rsp + 2*16], xmm11
4874 vmovdqu [rsp + 3*16], xmm14
4875 vmovdqu [rsp + 4*16], xmm15
4876 %endif
4877 GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
4878
4879 %ifidn __OUTPUT_FORMAT__, win64
4880 vmovdqu xmm15, [rsp + 4*16]
4881 vmovdqu xmm14, [rsp + 3*16]
4882 vmovdqu xmm11, [rsp + 2*16]
4883 vmovdqu xmm9, [rsp + 1*16]
4884 vmovdqu xmm6, [rsp + 0*16]
4885 add rsp, 5*16
4886 %endif
4887
4888 pop r12
4889 ret
4890
4891
4892 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4893 ;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
4894 ; (const struct gcm_key_data *key_data,
4895 ; struct gcm_context_data *context_data,
4896 ; u8 *out,
4897 ; const u8 *in,
4898 ; u64 plaintext_len,
4899 ; u8 *iv,
4900 ; const u8 *aad,
4901 ; u64 aad_len,
4902 ; u8 *auth_tag,
4903 ; u64 auth_tag_len);
4904 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4905 MKGLOBAL(FN_NAME(enc,_),function,)
4906 FN_NAME(enc,_):
4907
4908 FUNC_SAVE
4909
4910 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
4911
4912 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
4913
4914 GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
4915
4916 FUNC_RESTORE
4917
4918 ret
4919
4920 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4921 ;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
4922 ; (const struct gcm_key_data *key_data,
4923 ; struct gcm_context_data *context_data,
4924 ; u8 *out,
4925 ; const u8 *in,
4926 ; u64 plaintext_len,
4927 ; u8 *iv,
4928 ; const u8 *aad,
4929 ; u64 aad_len,
4930 ; u8 *auth_tag,
4931 ; u64 auth_tag_len);
4932 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4933 MKGLOBAL(FN_NAME(dec,_),function,)
4934 FN_NAME(dec,_):
4935
4936 FUNC_SAVE
4937
4938 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
4939
4940 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
4941
4942 GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
4943
4944 FUNC_RESTORE
4945
4946 ret
4947
4948 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4949 ;JOB_AES_HMAC *aes_gcm_enc_128_submit_vaes_avx512 / aes_gcm_enc_192_submit_vaes_avx512 /
4950 ;       aes_gcm_enc_256_submit_vaes_avx512
4951 ; (MB_MGR_GCM_OOO *state, JOB_AES_HMAC *job)
4952 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4953 MKGLOBAL(FN_NAME(enc,_submit_),function,internal)
4954 FN_NAME(enc,_submit_):
4955 FUNC_SAVE_AVX512
4956
4957 ;; arg1 - [in] state
4958 ;; arg2 - [in] job / [out] index
4959 ;; arg3 - [out] length
4960 GCM_SUBMIT_MB arg1, arg2, arg3, ENC
4961
4962 FUNC_RESTORE_AVX512
4963 ret
4964
4965 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4966 ;JOB_AES_HMAC *aes_gcm_enc_128_flush_vaes_avx512 / aes_gcm_enc_192_flush_vaes_avx512 /
4967 ; aes_gcm_enc_256_flush_vaes_avx512
4968 ; (MB_MGR_GCM_OOO *state)
4969 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4970 MKGLOBAL(FN_NAME(enc,_flush_),function,internal)
4971 FN_NAME(enc,_flush_):
4972 FUNC_SAVE_AVX512
4973
4974 ;; arg1 - [in] state
4975 ;; arg2 - [out] index
4976 ;; arg3 - [out] length
4977 GCM_FLUSH_MB arg1, arg2, arg3
4978
4979 ;; finalize puts returned job into RAX
4980 ;; arg1 - state
4981 ;; arg2 - min_lane_idx
4982 ;; arg3 - min_len
4983 GCM_FINALIZE_x4 arg1, arg2, arg3, ENC
4984
4985 FUNC_RESTORE_AVX512
4986 ret
4987
4988 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4989 ;JOB_AES_HMAC *aes_gcm_dec_128_submit_vaes_avx512 / aes_gcm_dec_192_submit_vaes_avx512 /
4990 ; aes_gcm_dec_256_submit_vaes_avx512
4991 ; (MB_MGR_GCM_OOO *state, JOB_AES_HMAC *job)
4992 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4993 MKGLOBAL(FN_NAME(dec,_submit_),function,internal)
4994 FN_NAME(dec,_submit_):
4995 FUNC_SAVE_AVX512
4996
4997 ;; arg1 - [in] state
4998 ;; arg2 - [in] job / [out] index
4999 ;; arg3 - [out] length
5000 GCM_SUBMIT_MB arg1, arg2, arg3, DEC
5001
5002 FUNC_RESTORE_AVX512
5003 ret
5004
5005 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5006 ;JOB_AES_HMAC *aes_gcm_dec_128_flush_vaes_avx512 / aes_gcm_dec_192_flush_vaes_avx512 /
5007 ; aes_gcm_dec_256_flush_vaes_avx512
5008 ; (MB_MGR_GCM_OOO *state)
5009 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5010 MKGLOBAL(FN_NAME(dec,_flush_),function,internal)
5011 FN_NAME(dec,_flush_):
5012 FUNC_SAVE_AVX512
5013
5014 ;; arg1 - [in] state
5015 ;; arg2 - [out] index
5016 ;; arg3 - [out] length
5017 GCM_FLUSH_MB arg1, arg2, arg3
5018
5019 ;; finalize puts returned job into RAX
5020 ;; arg1 - state
5021 ;; arg2 - min_lane_idx
5022 ;; arg3 - min_len
5023 GCM_FINALIZE_x4 arg1, arg2, arg3, DEC
5024
5025 FUNC_RESTORE_AVX512
5026 ret
5027
5028
5029 %ifdef LINUX
5030 section .note.GNU-stack noalloc noexec nowrite progbits
5031 %endif