;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
;
; References:
;       This code was derived and highly optimized from the code described in the paper:
;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
;
;       For the shift-based reductions used in this code, we used the method described in the paper:
;       Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
;
;
;
;
; Assumptions:
;
;
;
; iv:
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                        Salt  (From the SA)                    |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                     Initialization Vector                     |
;       |         (This is the sequence number from IPSec header)       |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x1                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;
;
; AAD:
;       AAD will be padded with 0 to the next 16-byte multiple
;       for example, assume AAD is a u32 vector
;
;       if AAD is 8 bytes:
;       AAD[3] = {A0, A1};
;       padded AAD in xmm register = {A1 A0 0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                           SPI (A1)                            |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                  32-bit Sequence Number (A0)                  |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 32-bit Sequence Number
;
;       if AAD is 12 bytes:
;       AAD[3] = {A0, A1, A2};
;       padded AAD in xmm register = {A2 A1 A0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                           SPI (A2)                            |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |              64-bit Extended Sequence Number {A1,A0}          |
;       |                                                               |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 64-bit Extended Sequence Number
;
;
; aadLen:
;       from the definition of the spec, aadLen must be a multiple of 4 bytes;
;       this code additionally supports any aadLen.
;
; TLen:
;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; throughout the code, one-tab and two-tab indentations are used: one tab is
; for the GHASH part, two tabs are for the AES part.
;

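; Note on the shift counts used below (a sketch of the math, not new logic):
; GHASH operates on bit-reflected 128-bit values, so reduction modulo
; x^128 + x^127 + x^126 + x^121 + 1 becomes the dword-shift pattern seen
; throughout this file: a first phase built from left shifts by 31, 30 and 25
; (the reflected images of x^127, x^126 and x^121) and a second phase built
; from right shifts by 1, 2 and 7, following the Gueron/Kounavis shift-based
; reduction referenced above.
;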
f67539c2
TL
%include "include/os.asm"
%include "include/reg_sizes.asm"
%include "include/clear_regs.asm"
%include "include/gcm_defines.asm"
%include "include/gcm_keys_sse_avx.asm"
%include "include/memcpy.asm"

%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_avx_gen2.asm!"
%endif
%endif
%endif

%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2
%define NROUNDS 9
%endif

%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2
%define NROUNDS 11
%endif

%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2
%define NROUNDS 13
%endif

default rel
; four registers are pushed onto the stack by FUNC_SAVE; STACK_OFFSET accounts for them
%define STACK_OFFSET 8*4

%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define TMP3    16*1    ; Temporary storage for AES State 3
%define TMP4    16*2    ; Temporary storage for AES State 4
%define TMP5    16*3    ; Temporary storage for AES State 5
%define TMP6    16*4    ; Temporary storage for AES State 6
%define TMP7    16*5    ; Temporary storage for AES State 7
%define TMP8    16*6    ; Temporary storage for AES State 8

%define LOCAL_STORAGE   16*7

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE     16*10
%else
        %define XMM_STORAGE     0
%endif

%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE

section .text
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
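; The macro uses one-level Karatsuba over GF(2): with A = a1:a0 and B = b1:b0,
; (a1^a0)*(b1^b0) = a1*b1 ^ a0*b0 ^ a1*b0 ^ a0*b1, so the middle term
; a1*b0 ^ a0*b1 is recovered from three vpclmulqdq results instead of
; computing four partial products.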
%macro GHASH_MUL 7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba
        vpshufd %%T2, %%GH, 01001110b
        vpshufd %%T3, %%HK, 01001110b
        vpxor   %%T2, %%T2, %%GH        ; %%T2 = (a1+a0)
        vpxor   %%T3, %%T3, %%HK        ; %%T3 = (b1+b0)

        vpclmulqdq %%T1, %%GH, %%HK, 0x11       ; %%T1 = a1*b1
        vpclmulqdq %%GH, %%HK, 0x00             ; %%GH = a0*b0
        vpclmulqdq %%T2, %%T3, 0x00             ; %%T2 = (a1+a0)*(b1+b0)
        vpxor   %%T2, %%T2, %%GH
        vpxor   %%T2, %%T2, %%T1        ; %%T2 = a0*b1+a1*b0

        vpslldq %%T3, %%T2, 8           ; shift-L %%T3 2 DWs
        vpsrldq %%T2, %%T2, 8           ; shift-R %%T2 2 DWs
        vpxor   %%GH, %%GH, %%T3
        vpxor   %%T1, %%T1, %%T2        ; <%%T1:%%GH> = %%GH x %%HK

        ;first phase of the reduction
        vpslld  %%T2, %%GH, 31          ; packed right shifting << 31
        vpslld  %%T3, %%GH, 30          ; packed right shifting << 30
        vpslld  %%T4, %%GH, 25          ; packed right shifting << 25

        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpsrldq %%T5, %%T2, 4           ; shift-R %%T5 1 DW

        vpslldq %%T2, %%T2, 12          ; shift-L %%T2 3 DWs
        vpxor   %%GH, %%GH, %%T2        ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction

        vpsrld  %%T2, %%GH, 1           ; packed left shifting >> 1
        vpsrld  %%T3, %%GH, 2           ; packed left shifting >> 2
        vpsrld  %%T4, %%GH, 7           ; packed left shifting >> 7
        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpxor   %%T2, %%T2, %%T5
        vpxor   %%GH, %%GH, %%T2
        vpxor   %%GH, %%GH, %%T1        ; the result is in %%GH


%endmacro
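
; Illustrative invocation (register choice is arbitrary): multiply the running
; hash in xmm0 by the hash key in xmm13, with xmm1-xmm5 as scratch; the fully
; reduced product is returned in xmm0:
;       GHASH_MUL xmm0, xmm13, xmm1, xmm2, xmm3, xmm4, xmm5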


%macro PRECOMPUTE 8
%define %%GDATA %1
%define %%HK    %2
%define %%T1    %3
%define %%T2    %4
%define %%T3    %5
%define %%T4    %6
%define %%T5    %7
%define %%T6    %8

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vmovdqa %%T5, %%HK

        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^2<<1 mod poly
        vmovdqu [%%GDATA + HashKey_2], %%T5                    ; [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_2_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^3<<1 mod poly
        vmovdqu [%%GDATA + HashKey_3], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_3_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^4<<1 mod poly
        vmovdqu [%%GDATA + HashKey_4], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_4_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^5<<1 mod poly
        vmovdqu [%%GDATA + HashKey_5], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_5_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^6<<1 mod poly
        vmovdqu [%%GDATA + HashKey_6], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_6_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^7<<1 mod poly
        vmovdqu [%%GDATA + HashKey_7], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_7_k], %%T1

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ; %%T5 = HashKey^8<<1 mod poly
        vmovdqu [%%GDATA + HashKey_8], %%T5
        vpshufd %%T1, %%T5, 01001110b
        vpxor   %%T1, %%T5
        vmovdqu [%%GDATA + HashKey_8_k], %%T1
%endmacro
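
; The table built above lets the 8-way loops below pair each data block with
; the matching power of the hash key, and HashKey_i_k supplies the Karatsuba
; operand (high half XOR low half of HashKey^i) precomputed, so at run time
; the middle product only needs the data-half XOR.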


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; READ_SMALL_DATA_INPUT: Packs an xmm register with data when the input is less than 16 bytes.
; Returns 0 if data has length 0.
; Input: The input data (INPUT), that data's length (LENGTH).
; Output: The packed xmm register (OUTPUT).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro READ_SMALL_DATA_INPUT 6
%define %%OUTPUT                %1 ; %%OUTPUT is an xmm register
%define %%INPUT                 %2
%define %%LENGTH                %3
%define %%END_READ_LOCATION     %4 ; All this and the lower inputs are temp registers
%define %%COUNTER               %5
%define %%TMP1                  %6

        vpxor   %%OUTPUT, %%OUTPUT
        mov     %%COUNTER, %%LENGTH
        mov     %%END_READ_LOCATION, %%INPUT
        add     %%END_READ_LOCATION, %%LENGTH
        xor     %%TMP1, %%TMP1


        cmp     %%COUNTER, 8
        jl      %%_byte_loop_2
        vpinsrq %%OUTPUT, [%%INPUT], 0          ;Read in 8 bytes if they exist
        je      %%_done

        sub     %%COUNTER, 8

%%_byte_loop_1:                 ;Read in data 1 byte at a time while data is left
        shl     %%TMP1, 8       ;This loop handles when 8 bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_1
        vpinsrq %%OUTPUT, %%TMP1, 1
        jmp     %%_done

%%_byte_loop_2:                 ;Read in data 1 byte at a time while data is left
        cmp     %%COUNTER, 0
        je      %%_done
        shl     %%TMP1, 8       ;This loop handles when no bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_2
        vpinsrq %%OUTPUT, %%TMP1, 0
%%_done:

%endmacro ; READ_SMALL_DATA_INPUT
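
; Worked example: for LENGTH = 13, the vpinsrq above loads bytes 0-7 into the
; low qword, then %%_byte_loop_1 walks backwards from byte 12 down to byte 8,
; shifting each byte into %%TMP1 so that byte 8 ends up in the low position,
; and inserts the assembled 5 bytes into the high qword. Bytes 13-15 of
; %%OUTPUT stay zero from the initial vpxor.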


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
; Output: The hash of the data (AAD_HASH).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro CALC_AAD_HASH 15
%define %%A_IN          %1
%define %%A_LEN         %2
%define %%AAD_HASH      %3
%define %%GDATA_KEY     %4
%define %%XTMP0         %5      ; xmm temp reg 0
%define %%XTMP1         %6      ; xmm temp reg 1
%define %%XTMP2         %7
%define %%XTMP3         %8
%define %%XTMP4         %9
%define %%XTMP5         %10     ; xmm temp reg 5
%define %%T1            %11     ; temp reg 1
%define %%T2            %12
%define %%T3            %13
%define %%T4            %14
%define %%T5            %15     ; temp reg 5


        mov     %%T1, %%A_IN            ; T1 = AAD
        mov     %%T2, %%A_LEN           ; T2 = aadLen
        vpxor   %%AAD_HASH, %%AAD_HASH

%%_get_AAD_loop128:
        cmp     %%T2, 128
        jl      %%_exit_AAD_loop128

        vmovdqu %%XTMP0, [%%T1 + 16*0]
        vpshufb %%XTMP0, [rel SHUF_MASK]

        vpxor   %%XTMP0, %%AAD_HASH

        vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
        vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11      ; %%T1 = a1*b1
        vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00      ; %%T2 = a0*b0
        vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01      ; %%T3 = a1*b0
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10      ; %%T4 = a0*b1
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4               ; %%T3 = a1*b0 + a0*b1

%assign i 1
%assign j 7
%rep 7
        vmovdqu %%XTMP0, [%%T1 + 16*i]
        vpshufb %%XTMP0, [rel SHUF_MASK]

        vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11      ; %%T1 = T1 + a1*b1
        vpxor   %%XTMP1, %%XTMP1, %%XTMP4

        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00      ; %%T2 = T2 + a0*b0
        vpxor   %%XTMP2, %%XTMP2, %%XTMP4

        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01      ; %%T3 = T3 + a1*b0 + a0*b1
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4
%assign i (i + 1)
%assign j (j - 1)
%endrep

        vpslldq %%XTMP4, %%XTMP3, 8     ; shift-L 2 DWs
        vpsrldq %%XTMP3, %%XTMP3, 8     ; shift-R 2 DWs
        vpxor   %%XTMP2, %%XTMP2, %%XTMP4
        vpxor   %%XTMP1, %%XTMP1, %%XTMP3       ; accumulate the results in %%XTMP1(M):%%XTMP2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqa %%XTMP5, [rel POLY2]
        vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
        vpslldq %%XTMP0, %%XTMP0, 8     ; shift-L xmm2 2 DWs
        vpxor   %%XTMP2, %%XTMP2, %%XTMP0       ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
        vpsrldq %%XTMP3, %%XTMP3, 4     ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
        vpslldq %%XTMP4, %%XTMP4, 4     ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   %%XTMP4, %%XTMP4, %%XTMP3       ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor   %%AAD_HASH, %%XTMP1, %%XTMP4    ; the result is in %%AAD_HASH

        sub     %%T2, 128
        je      %%_CALC_AAD_done

        add     %%T1, 128
        jmp     %%_get_AAD_loop128

%%_exit_AAD_loop128:
        cmp     %%T2, 16
        jl      %%_get_small_AAD_block

        ;; calculate hash_key position to start with
        mov     %%T3, %%T2
        and     %%T3, -16       ; 1 to 7 blocks possible here
        neg     %%T3
        add     %%T3, HashKey_1 + 16
        lea     %%T3, [%%GDATA_KEY + %%T3]

        vmovdqu %%XTMP0, [%%T1]
        vpshufb %%XTMP0, [rel SHUF_MASK]

        vpxor   %%XTMP0, %%AAD_HASH

        vmovdqu %%XTMP5, [%%T3]
        vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11      ; %%T1 = a1*b1
        vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00      ; %%T2 = a0*b0
        vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01      ; %%T3 = a1*b0
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10      ; %%T4 = a0*b1
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4               ; %%T3 = a1*b0 + a0*b1

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16        ; move to next data block
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce

%%_AAD_blocks:
        vmovdqu %%XTMP0, [%%T1]
        vpshufb %%XTMP0, [rel SHUF_MASK]

        vmovdqu %%XTMP5, [%%T3]
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11      ; %%T1 = T1 + a1*b1
        vpxor   %%XTMP1, %%XTMP1, %%XTMP4

        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00      ; %%T2 = T2 + a0*b0
        vpxor   %%XTMP2, %%XTMP2, %%XTMP4

        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01      ; %%T3 = T3 + a1*b0 + a0*b1
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4
        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
        vpxor   %%XTMP3, %%XTMP3, %%XTMP4

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce
        jmp     %%_AAD_blocks

%%_AAD_reduce:
        vpslldq %%XTMP4, %%XTMP3, 8     ; shift-L 2 DWs
        vpsrldq %%XTMP3, %%XTMP3, 8     ; shift-R 2 DWs
        vpxor   %%XTMP2, %%XTMP2, %%XTMP4
        vpxor   %%XTMP1, %%XTMP1, %%XTMP3       ; accumulate the results in %%XTMP1(M):%%XTMP2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqa %%XTMP5, [rel POLY2]
        vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
        vpslldq %%XTMP0, %%XTMP0, 8     ; shift-L xmm2 2 DWs
        vpxor   %%XTMP2, %%XTMP2, %%XTMP0       ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
        vpsrldq %%XTMP3, %%XTMP3, 4     ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
        vpslldq %%XTMP4, %%XTMP4, 4     ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   %%XTMP4, %%XTMP4, %%XTMP3       ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor   %%AAD_HASH, %%XTMP1, %%XTMP4    ; the result is in %%AAD_HASH

        or      %%T2, %%T2
        je      %%_CALC_AAD_done
11fdf7f2
TL

%%_get_small_AAD_block:
        vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
        READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
        ;byte-reflect the AAD data
        vpshufb %%XTMP1, [rel SHUF_MASK]
        vpxor   %%AAD_HASH, %%XTMP1
        GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH

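; Note on the structure above: rather than calling GHASH_MUL once per 16-byte
; AAD block, the loops multiply each block by a descending power of the hash
; key (HashKey_8 down to HashKey_1), XOR-accumulate the partial products, and
; perform a single POLY2-based reduction at the end. Algebraically this is
; H^8*B0 + H^7*B1 + ... + H*B7, i.e. the same GHASH value computed with one
; reduction instead of eight.
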
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles the encryption/decryption and authentication of partial blocks
; carried between update calls.
; Requires the input data be at least 1 byte long.
; Input:
; GDATA_KEY - struct gcm_key_data *
; GDATA_CTX - struct gcm_context_data *
; PLAIN_CYPH_IN - input text
; PLAIN_CYPH_LEN - input text length
; DATA_OFFSET - the current data offset
; ENC_DEC - whether encoding or decoding
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK 8
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%DATA_OFFSET           %6
%define %%AAD_HASH              %7
%define %%ENC_DEC               %8
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        je      %%_partial_block_done           ;Leave Macro if no partial blocks

        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over-reading
        jl      %%_fewer_than_16_bytes
        VXLDR   xmm1, [%%PLAIN_CYPH_IN]         ;If more than 16 bytes of data, just fill the xmm register
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15

%%_data_read:                                   ;Finished reading in data


        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey]      ;xmm9 = my_ctx_data.partial_block_enc_key
        vmovdqu xmm13, [%%GDATA_KEY + HashKey]

        lea     r12, [SHIFT_MASK]

        cmp     r13, rax
        add     r12, r13        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
        vmovdqu xmm2, [r12]     ; get the appropriate shuffle mask
        vpshufb xmm9, xmm2      ;shift right r13 bytes

%ifidn %%ENC_DEC, DEC
        vmovdqa xmm3, xmm1
        vpxor   xmm9, xmm1      ; Cyphertext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
        jge     %%_no_extra_mask_1      ;Determine if the partial block is not being filled and shift the mask accordingly
        sub     r12, r15
%%_no_extra_mask_1:

        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        vpand   xmm3, xmm1
        vpshufb xmm3, [SHUF_MASK]
        vpshufb xmm3, xmm2
        vpxor   %%AAD_HASH, xmm3


        cmp     r15, 0
        jl      %%_partial_incomplete_1

        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6    ;GHASH computation for the last <16 Byte block
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_dec_done
%%_partial_incomplete_1:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_dec_done:
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH

%else
        vpxor   xmm9, xmm1      ; Plaintext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
        jge     %%_no_extra_mask_2      ;Determine if the partial block is not being filled and shift the mask accordingly
        sub     r12, r15
%%_no_extra_mask_2:

        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        vpshufb xmm9, [SHUF_MASK]
        vpshufb xmm9, xmm2
        vpxor   %%AAD_HASH, xmm9

        cmp     r15, 0
        jl      %%_partial_incomplete_2

        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6    ;GHASH computation for the last <16 Byte block
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_encode_done
%%_partial_incomplete_2:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_encode_done:
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH

        vpshufb xmm9, [SHUF_MASK]       ; shuffle xmm9 back to output as ciphertext
        vpshufb xmm9, xmm2
%endif


        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; output encrypted Bytes
        cmp     r15, 0
        jl      %%_partial_fill
        mov     r12, r13
        mov     r13, 16
        sub     r13, r12        ; Set r13 to be the number of bytes to write out
        jmp     %%_count_set
%%_partial_fill:
        mov     r13, %%PLAIN_CYPH_LEN
%%_count_set:
        vmovq   rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        vpsrldq xmm9, xmm9, 8
        vmovq   rax, xmm9
        sub     r13, 8
%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%%_partial_block_done:
%endmacro ; PARTIAL_BLOCK
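
; How the masking above works: x86 has no byte-granular variable shift for
; xmm registers, so the code indexes into the SHIFT_MASK byte table (defined
; with the other GCM constants) to build a vpshufb control that shifts the
; saved keystream by r13 bytes, and into ALL_F to build a vpand mask that
; clears the bytes belonging to the previous partial block.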


; if a = number of total plaintext bytes
; b = floor(a/16)
; %%num_initial_blocks = b mod 8;
; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
; Updated AAD_HASH is returned in %%T3

%macro INITIAL_BLOCKS 24
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%LENGTH                %5
%define %%DATA_OFFSET           %6
%define %%num_initial_blocks    %7      ; can be 0, 1, 2, 3, 4, 5, 6 or 7
%define %%T1            %8
%define %%HASH_KEY      %9
%define %%T3            %10
%define %%T4            %11
%define %%T5            %12
%define %%CTR           %13
%define %%XMM1          %14
%define %%XMM2          %15
%define %%XMM3          %16
%define %%XMM4          %17
%define %%XMM5          %18
%define %%XMM6          %19
%define %%XMM7          %20
%define %%XMM8          %21
%define %%T6            %22
%define %%T_key         %23
%define %%ENC_DEC       %24

%assign i (8-%%num_initial_blocks)
        vmovdqu reg(i), %%XMM8  ; move AAD_HASH to temp reg
        ; start AES for %%num_initial_blocks blocks
        vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0


%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa reg(i), %%CTR
        vpshufb reg(i), [SHUF_MASK]     ; perform a 16Byte swap
%assign i (i+1)
%endrep

        vmovdqu %%T_key, [%%GDATA_KEY+16*0]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
        vpxor   reg(i), %%T_key
%assign i (i+1)
%endrep

%assign j 1
%rep NROUNDS
        vmovdqu %%T_key, [%%GDATA_KEY+16*j]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
        vaesenc reg(i), %%T_key
%assign i (i+1)
%endrep

%assign j (j+1)
%endrep ; NROUNDS


        vmovdqu %%T_key, [%%GDATA_KEY+16*j]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
        vaesenclast reg(i), %%T_key
%assign i (i+1)
%endrep

%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        vpxor   reg(i), %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)      ; write back ciphertext for %%num_initial_blocks blocks
        add     %%DATA_OFFSET, 16
        %ifidn  %%ENC_DEC, DEC
        vmovdqa reg(i), %%T1
        %endif
        vpshufb reg(i), [SHUF_MASK]     ; prepare ciphertext for GHASH computations
%assign i (i+1)
%endrep


%assign i (8-%%num_initial_blocks)
%assign j (9-%%num_initial_blocks)

%rep %%num_initial_blocks
        vpxor   reg(j), reg(i)
        GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6     ; apply GHASH on %%num_initial_blocks blocks
%assign i (i+1)
%assign j (j+1)
%endrep
        ; %%XMM8 has the current Hash Value
        vmovdqa %%T3, %%XMM8

        cmp     %%LENGTH, 128
        jl      %%_initial_blocks_done

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and
; store cipher/plain text.
; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
; - combine current GHASH value into block 0 (XMM1)

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM1, %%CTR
        vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM2, %%CTR
        vpshufb %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM3, %%CTR
        vpshufb %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM4, %%CTR
        vpshufb %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM5, %%CTR
        vpshufb %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM6, %%CTR
        vpshufb %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM7, %%CTR
        vpshufb %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap

        vpaddd  %%CTR, [ONE]            ; INCR Y0
        vmovdqa %%XMM8, %%CTR
        vpshufb %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

        vmovdqu %%T_key, [%%GDATA_KEY+16*0]
        vpxor   %%XMM1, %%T_key
        vpxor   %%XMM2, %%T_key
        vpxor   %%XMM3, %%T_key
        vpxor   %%XMM4, %%T_key
        vpxor   %%XMM5, %%T_key
        vpxor   %%XMM6, %%T_key
        vpxor   %%XMM7, %%T_key
        vpxor   %%XMM8, %%T_key


%assign i 1
%rep NROUNDS
        vmovdqu %%T_key, [%%GDATA_KEY+16*i]
        vaesenc %%XMM1, %%T_key
        vaesenc %%XMM2, %%T_key
        vaesenc %%XMM3, %%T_key
        vaesenc %%XMM4, %%T_key
        vaesenc %%XMM5, %%T_key
        vaesenc %%XMM6, %%T_key
        vaesenc %%XMM7, %%T_key
        vaesenc %%XMM8, %%T_key
%assign i (i+1)
%endrep


        vmovdqu %%T_key, [%%GDATA_KEY+16*i]
        vaesenclast %%XMM1, %%T_key
        vaesenclast %%XMM2, %%T_key
        vaesenclast %%XMM3, %%T_key
        vaesenclast %%XMM4, %%T_key
        vaesenclast %%XMM5, %%T_key
        vaesenclast %%XMM6, %%T_key
        vaesenclast %%XMM7, %%T_key
        vaesenclast %%XMM8, %%T_key

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
        vpxor   %%XMM1, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM1, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
        vpxor   %%XMM2, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM2, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
        vpxor   %%XMM3, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM3, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
        vpxor   %%XMM4, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM4, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
        vpxor   %%XMM5, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM5, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
        vpxor   %%XMM6, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM6, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
        vpxor   %%XMM7, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM7, %%T1
        %endif

        VXLDR   %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
        vpxor   %%XMM8, %%T1
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
        %ifidn  %%ENC_DEC, DEC
        vmovdqa %%XMM8, %%T1
        %endif

        add     %%DATA_OFFSET, 128

        vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
        vpxor   %%XMM1, %%T3            ; combine GHASHed value with the corresponding ciphertext
        vpshufb %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_initial_blocks_done:


%endmacro
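
; Design note: the serial loop above disposes of LENGTH mod 128 bytes so the
; main loop always sees whole 8-block chunks; each initial block is GHASHed
; immediately with GHASH_MUL, while the 8 freshly primed counter blocks defer
; their GHASH to the pipelined GHASH_8_ENCRYPT_8_PARALLEL macro below.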


; encrypt 8 blocks at a time
; ghash the 8 previously encrypted ciphertext blocks
; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
; r11 is the data offset value
%macro GHASH_8_ENCRYPT_8_PARALLEL 22
%define %%GDATA                 %1
%define %%CYPH_PLAIN_OUT        %2
%define %%PLAIN_CYPH_IN         %3
%define %%DATA_OFFSET           %4
%define %%T1    %5
%define %%T2    %6
%define %%T3    %7
%define %%T4    %8
%define %%T5    %9
%define %%T6    %10
%define %%CTR   %11
%define %%XMM1  %12
%define %%XMM2  %13
%define %%XMM3  %14
%define %%XMM4  %15
%define %%XMM5  %16
%define %%XMM6  %17
%define %%XMM7  %18
%define %%XMM8  %19
%define %%T7    %20
%define %%loop_idx      %21
%define %%ENC_DEC       %22

        vmovdqa %%T2, %%XMM1
        vmovdqu [rsp + TMP2], %%XMM2
        vmovdqu [rsp + TMP3], %%XMM3
        vmovdqu [rsp + TMP4], %%XMM4
        vmovdqu [rsp + TMP5], %%XMM5
        vmovdqu [rsp + TMP6], %%XMM6
        vmovdqu [rsp + TMP7], %%XMM7
        vmovdqu [rsp + TMP8], %%XMM8

%ifidn %%loop_idx, in_order
        vpaddd  %%XMM1, %%CTR, [ONE]    ; INCR CNT
        vpaddd  %%XMM2, %%XMM1, [ONE]
        vpaddd  %%XMM3, %%XMM2, [ONE]
        vpaddd  %%XMM4, %%XMM3, [ONE]
        vpaddd  %%XMM5, %%XMM4, [ONE]
        vpaddd  %%XMM6, %%XMM5, [ONE]
        vpaddd  %%XMM7, %%XMM6, [ONE]
        vpaddd  %%XMM8, %%XMM7, [ONE]
        vmovdqa %%CTR, %%XMM8

        vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
%else
        vpaddd  %%XMM1, %%CTR, [ONEf]   ; INCR CNT
        vpaddd  %%XMM2, %%XMM1, [ONEf]
        vpaddd  %%XMM3, %%XMM2, [ONEf]
        vpaddd  %%XMM4, %%XMM3, [ONEf]
        vpaddd  %%XMM5, %%XMM4, [ONEf]
        vpaddd  %%XMM6, %%XMM5, [ONEf]
        vpaddd  %%XMM7, %%XMM6, [ONEf]
        vpaddd  %%XMM8, %%XMM7, [ONEf]
        vmovdqa %%CTR, %%XMM8
%endif



        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu %%T1, [%%GDATA + 16*0]
        vpxor   %%XMM1, %%T1
        vpxor   %%XMM2, %%T1
        vpxor   %%XMM3, %%T1
        vpxor   %%XMM4, %%T1
        vpxor   %%XMM5, %%T1
        vpxor   %%XMM6, %%T1
        vpxor   %%XMM7, %%T1
        vpxor   %%XMM8, %%T1

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu %%T1, [%%GDATA + 16*1]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1


        vmovdqu %%T1, [%%GDATA + 16*2]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu %%T5, [%%GDATA + HashKey_8]
        vpclmulqdq %%T4, %%T2, %%T5, 0x11       ; %%T4 = a1*b1
        vpclmulqdq %%T7, %%T2, %%T5, 0x00       ; %%T7 = a0*b0

        vpshufd %%T6, %%T2, 01001110b
        vpxor   %%T6, %%T2

        vmovdqu %%T5, [%%GDATA + HashKey_8_k]
        vpclmulqdq %%T6, %%T6, %%T5, 0x00


        vmovdqu %%T1, [%%GDATA + 16*3]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        vmovdqu %%T1, [rsp + TMP2]
        vmovdqu %%T5, [%%GDATA + HashKey_7]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_7_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3

        vmovdqu %%T1, [%%GDATA + 16*4]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqu %%T1, [rsp + TMP3]
        vmovdqu %%T5, [%%GDATA + HashKey_6]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_6_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3

        vmovdqu %%T1, [%%GDATA + 16*5]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1


        vmovdqu %%T1, [rsp + TMP4]
        vmovdqu %%T5, [%%GDATA + HashKey_5]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_5_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3

        vmovdqu %%T1, [%%GDATA + 16*6]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        vmovdqu %%T1, [rsp + TMP5]
        vmovdqu %%T5, [%%GDATA + HashKey_4]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_4_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3


        vmovdqu %%T1, [%%GDATA + 16*7]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        vmovdqu %%T1, [rsp + TMP6]
        vmovdqu %%T5, [%%GDATA + HashKey_3]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_3_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3

        vmovdqu %%T1, [%%GDATA + 16*8]
        vaesenc %%XMM1, %%T1
        vaesenc %%XMM2, %%T1
        vaesenc %%XMM3, %%T1
        vaesenc %%XMM4, %%T1
        vaesenc %%XMM5, %%T1
        vaesenc %%XMM6, %%T1
        vaesenc %%XMM7, %%T1
        vaesenc %%XMM8, %%T1

        vmovdqu %%T1, [rsp + TMP7]
        vmovdqu %%T5, [%%GDATA + HashKey_2]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_2_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu %%T5, [%%GDATA + 16*9]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T1, [rsp + TMP8]
        vmovdqu %%T5, [%%GDATA + HashKey]
        vpclmulqdq %%T3, %%T1, %%T5, 0x11
        vpxor   %%T4, %%T4, %%T3
        vpclmulqdq %%T3, %%T1, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T3

        vpshufd %%T3, %%T1, 01001110b
        vpxor   %%T3, %%T1
        vmovdqu %%T5, [%%GDATA + HashKey_k]
        vpclmulqdq %%T3, %%T3, %%T5, 0x10
        vpxor   %%T6, %%T6, %%T3

        vpxor   %%T6, %%T4
        vpxor   %%T6, %%T7

%ifdef GCM128_MODE
        vmovdqu %%T5, [%%GDATA + 16*10]
%endif
%ifdef GCM192_MODE
        vmovdqu %%T5, [%%GDATA + 16*10]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*11]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*12]
%endif
%ifdef GCM256_MODE
        vmovdqu %%T5, [%%GDATA + 16*10]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*11]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*12]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*13]
        vaesenc %%XMM1, %%T5
        vaesenc %%XMM2, %%T5
        vaesenc %%XMM3, %%T5
        vaesenc %%XMM4, %%T5
        vaesenc %%XMM5, %%T5
        vaesenc %%XMM6, %%T5
        vaesenc %%XMM7, %%T5
        vaesenc %%XMM8, %%T5

        vmovdqu %%T5, [%%GDATA + 16*14]
%endif

%assign i 0
%assign j 1
%rep 8

%ifidn %%ENC_DEC, ENC
%ifdef NT_LD
        VXLDR   %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
        vpxor   %%T2, %%T2, %%T5
%else
        vpxor   %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
%endif ; NT_LD
        vaesenclast reg(j), reg(j), %%T2
%else
        VXLDR   %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
        vpxor   %%T2, %%T2, %%T5
        vaesenclast %%T3, reg(j), %%T2
        vpxor   reg(j), %%T2, %%T5      ; recover the ciphertext block for GHASH (%%T2 is C XOR last round key)
        VXSTR   [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
%endif ; %%ENC_DEC

%assign i (i+1)
%assign j (j+1)
%endrep

        vpslldq %%T3, %%T6, 8           ; shift-L %%T3 2 DWs
        vpsrldq %%T6, %%T6, 8           ; shift-R %%T6 2 DWs
        vpxor   %%T7, %%T3
        vpxor   %%T6, %%T4              ; accumulate the results in %%T6:%%T7


        ;first phase of the reduction

        vpslld  %%T2, %%T7, 31          ; packed right shifting << 31
        vpslld  %%T3, %%T7, 30          ; packed right shifting << 30
        vpslld  %%T4, %%T7, 25          ; packed right shifting << 25

        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpsrldq %%T1, %%T2, 4           ; shift-R %%T1 1 DW

        vpslldq %%T2, %%T2, 12          ; shift-L %%T2 3 DWs
        vpxor   %%T7, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        %ifidn %%ENC_DEC, ENC
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7  ; Write to the Ciphertext buffer
        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8  ; Write to the Ciphertext buffer
        %endif

        ;second phase of the reduction

        vpsrld  %%T2, %%T7, 1           ; packed left shifting >> 1
        vpsrld  %%T3, %%T7, 2           ; packed left shifting >> 2
        vpsrld  %%T4, %%T7, 7           ; packed left shifting >> 7
        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpxor   %%T2, %%T2, %%T1
        vpxor   %%T7, %%T7, %%T2
        vpxor   %%T6, %%T6, %%T7        ; the result is in %%T6



        vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
        vpshufb %%XMM2, [SHUF_MASK]
        vpshufb %%XMM3, [SHUF_MASK]
        vpshufb %%XMM4, [SHUF_MASK]
        vpshufb %%XMM5, [SHUF_MASK]
        vpshufb %%XMM6, [SHUF_MASK]
        vpshufb %%XMM7, [SHUF_MASK]
        vpshufb %%XMM8, [SHUF_MASK]


        vpxor   %%XMM1, %%T6

%endmacro
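
; Design note: the macro interleaves the AES rounds for the current 8 counter
; blocks with the carry-less multiplies of the previous 8 ciphertext blocks
; (held in %%T2 and TMP2-TMP8), so the vaesenc and vpclmulqdq latencies hide
; each other; the ciphertext stores are likewise slotted between the two
; reduction phases.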


; GHASH the last 8 ciphertext blocks.
; %%GDATA is GCM key data
%macro GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16
        ;; Karatsuba Method


        vpshufd %%T2, %%XMM1, 01001110b
        vpxor   %%T2, %%XMM1
        vmovdqu %%T5, [%%GDATA + HashKey_8]
        vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
        vpclmulqdq %%T7, %%XMM1, %%T5, 0x00

        vmovdqu %%T3, [%%GDATA + HashKey_8_k]
        vpclmulqdq %%XMM1, %%T2, %%T3, 0x00


        ;;;;;;;;;;;;;;;;;;;;;;


        vpshufd %%T2, %%XMM2, 01001110b
        vpxor   %%T2, %%XMM2
        vmovdqu %%T5, [%%GDATA + HashKey_7]
        vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_7_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;


        vpshufd %%T2, %%XMM3, 01001110b
        vpxor   %%T2, %%XMM3
        vmovdqu %%T5, [%%GDATA + HashKey_6]
        vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_6_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;


        vpshufd %%T2, %%XMM4, 01001110b
        vpxor   %%T2, %%XMM4
        vmovdqu %%T5, [%%GDATA + HashKey_5]
        vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_5_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vpshufd %%T2, %%XMM5, 01001110b
        vpxor   %%T2, %%XMM5
        vmovdqu %%T5, [%%GDATA + HashKey_4]
        vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_4_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vpshufd %%T2, %%XMM6, 01001110b
        vpxor   %%T2, %%XMM6
        vmovdqu %%T5, [%%GDATA + HashKey_3]

        vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_3_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vpshufd %%T2, %%XMM7, 01001110b
        vpxor   %%T2, %%XMM7
        vmovdqu %%T5, [%%GDATA + HashKey_2]
        vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_2_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00
        vpxor   %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vpshufd %%T2, %%XMM8, 01001110b
        vpxor   %%T2, %%XMM8
        vmovdqu %%T5, [%%GDATA + HashKey]
        vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
        vpxor   %%T6, %%T6, %%T4

        vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
        vpxor   %%T7, %%T7, %%T4

        vmovdqu %%T3, [%%GDATA + HashKey_k]
        vpclmulqdq %%T2, %%T2, %%T3, 0x00

        vpxor   %%XMM1, %%XMM1, %%T2
        vpxor   %%XMM1, %%XMM1, %%T6
        vpxor   %%T2, %%XMM1, %%T7




        vpslldq %%T4, %%T2, 8
        vpsrldq %%T2, %%T2, 8

        vpxor   %%T7, %%T4
        vpxor   %%T6, %%T2      ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;first phase of the reduction

        vpslld  %%T2, %%T7, 31          ; packed right shifting << 31
        vpslld  %%T3, %%T7, 30          ; packed right shifting << 30
        vpslld  %%T4, %%T7, 25          ; packed right shifting << 25

        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpsrldq %%T1, %%T2, 4           ; shift-R %%T1 1 DW

        vpslldq %%T2, %%T2, 12          ; shift-L %%T2 3 DWs
        vpxor   %%T7, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction

        vpsrld  %%T2, %%T7, 1           ; packed left shifting >> 1
        vpsrld  %%T3, %%T7, 2           ; packed left shifting >> 2
        vpsrld  %%T4, %%T7, 7           ; packed left shifting >> 7
        vpxor   %%T2, %%T2, %%T3        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpxor   %%T2, %%T2, %%T1
        vpxor   %%T7, %%T7, %%T2
        vpxor   %%T6, %%T6, %%T7        ; the result is in %%T6


%endmacro
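
; Note: each block above costs three vpclmulqdq: two full products against
; HashKey_i plus one Karatsuba middle product against the stored HashKey_i_k
; halves; everything is XOR-accumulated into <%%T6:%%T7> and reduced once.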


; Encryption of a single block
; %%GDATA is GCM key data
%macro ENCRYPT_SINGLE_BLOCK 2
%define %%GDATA %1
%define %%XMM0  %2

        vpxor   %%XMM0, [%%GDATA+16*0]
%assign i 1
%rep NROUNDS
        vaesenc %%XMM0, [%%GDATA+16*i]
%assign i (i+1)
%endrep ; NROUNDS
        vaesenclast %%XMM0, [%%GDATA+16*i]
%endmacro
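
; Illustrative use (as in GCM_ENC_DEC below): with the counter block Yn in
; xmm9, "ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9" replaces xmm9 with E(K, Yn),
; the keystream block used for the final partial block.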


;; Start of Stack Setup

%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ;the number of pushes times 8 must equal STACK_OFFSET
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        vmovdqu [rsp + LOCAL_STORAGE + 0*16], xmm6
        vmovdqu [rsp + LOCAL_STORAGE + 1*16], xmm7
        vmovdqu [rsp + LOCAL_STORAGE + 2*16], xmm8
        vmovdqu [rsp + LOCAL_STORAGE + 3*16], xmm9
        vmovdqu [rsp + LOCAL_STORAGE + 4*16], xmm10
        vmovdqu [rsp + LOCAL_STORAGE + 5*16], xmm11
        vmovdqu [rsp + LOCAL_STORAGE + 6*16], xmm12
        vmovdqu [rsp + LOCAL_STORAGE + 7*16], xmm13
        vmovdqu [rsp + LOCAL_STORAGE + 8*16], xmm14
        vmovdqu [rsp + LOCAL_STORAGE + 9*16], xmm15
%endif
%endmacro


%macro FUNC_RESTORE 0

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_avx_asm
%endif
%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
        vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
        vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
        vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
        vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
        vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
        vmovdqu xmm9,  [rsp + LOCAL_STORAGE + 3*16]
        vmovdqu xmm8,  [rsp + LOCAL_STORAGE + 2*16]
        vmovdqu xmm7,  [rsp + LOCAL_STORAGE + 1*16]
        vmovdqu xmm6,  [rsp + LOCAL_STORAGE + 0*16]
%endif

;; Required for Update/GCM_ENC
        mov     rsp, r14
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
; IV, Additional Authentication data (A_IN), Additional Data length (A_LEN)
; Output: GDATA_CTX updated with the hash of A_IN (AadHash) and its other fields initialized.
; Clobbers rax, r10-r13, and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_INIT 5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm0

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        vpxor   xmm2, xmm3
        mov     r10, %%A_LEN

        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad_hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0
        mov     r10, %%IV
        vmovdqa xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        vpinsrq xmm2, [r10], 0
        vpinsrd xmm2, [r10+8], 2
        vmovdqu [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        vpshufb xmm2, [rel SHUF_MASK]

        vmovdqu [%%GDATA_CTX + CurCount], xmm2  ; ctx_data.current_counter = iv
%endmacro
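
; This realizes the counter layout from the header diagram: the 12 IV bytes
; fill bytes 0-11 and the ONEf constant supplies the 0x1 in the final 32-bit
; word, so OrigIV holds Y0 = IV || 0x00000001 and the vpshufb converts it to
; the byte order used by the vpaddd counter increments.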


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT.
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
; and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_ENC_DEC 6
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%ENC_DEC               %6
%define %%DATA_OFFSET           r11

; Macro flow:
; calculate the number of 16-byte blocks in the message
; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process eight 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_multiple_of_16_bytes

1704 xor %%DATA_OFFSET, %%DATA_OFFSET
1705%ifidn __OUTPUT_FORMAT__, win64
1706 mov rax, %%PLAIN_CYPH_LEN
1707 add [%%GDATA_CTX + InLen], rax ; Update length of data processed
1708%else
1709 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
1710%endif
1711 vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
1712 vmovdqu xmm8, [%%GDATA_CTX + AadHash]
1713
1714
1715 PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
1716
1717
1718 mov r13, %%PLAIN_CYPH_LEN
1719 sub r13, %%DATA_OFFSET
1720 mov r10, r13 ; save the amount of data left to process in r10
1721 and r13, -16 ; r13 = r13 - (r13 mod 16)
1722
1723 mov r12, r13
1724 shr r12, 4
1725 and r12, 7
1726
1727 jz %%_initial_num_blocks_is_0
1728
1729 cmp r12, 7
1730 je %%_initial_num_blocks_is_7
1731 cmp r12, 6
1732 je %%_initial_num_blocks_is_6
1733 cmp r12, 5
1734 je %%_initial_num_blocks_is_5
1735 cmp r12, 4
1736 je %%_initial_num_blocks_is_4
1737 cmp r12, 3
1738 je %%_initial_num_blocks_is_3
1739 cmp r12, 2
1740 je %%_initial_num_blocks_is_2
1741
1742 jmp %%_initial_num_blocks_is_1
1743
1744%%_initial_num_blocks_is_7:
1745 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1746 sub r13, 16*7
1747 jmp %%_initial_blocks_encrypted
1748
1749%%_initial_num_blocks_is_6:
1750 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1751 sub r13, 16*6
1752 jmp %%_initial_blocks_encrypted
1753
1754%%_initial_num_blocks_is_5:
1755 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1756 sub r13, 16*5
1757 jmp %%_initial_blocks_encrypted
1758
1759%%_initial_num_blocks_is_4:
1760 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1761 sub r13, 16*4
1762 jmp %%_initial_blocks_encrypted
1763
1764
1765%%_initial_num_blocks_is_3:
1766 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1767 sub r13, 16*3
1768 jmp %%_initial_blocks_encrypted
1769%%_initial_num_blocks_is_2:
1770 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1771 sub r13, 16*2
1772 jmp %%_initial_blocks_encrypted
1773
1774%%_initial_num_blocks_is_1:
1775 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1776 sub r13, 16
1777 jmp %%_initial_blocks_encrypted
1778
1779%%_initial_num_blocks_is_0:
1780 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1781
1782
1783%%_initial_blocks_encrypted:
1784 cmp r13, 0
1785 je %%_zero_cipher_left
1786
1787 sub r13, 128
1788 je %%_eight_cipher_left
1789
1790
1791
1792
1793 vmovd r15d, xmm9
1794 and r15d, 255
1795 vpshufb xmm9, [SHUF_MASK]
1796
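; r15d tracks the low byte of the counter: if eight increments cannot carry
; out of that byte, the out_order path below skips the per-iteration byte
; swap of the counter; otherwise the in_order path swaps the counter to
; big-endian first so the carry propagates across bytes. Roughly (sketch):
;
; if (ctr_low_byte <= 255 - 8)   /* no carry out of the low byte */
;     encrypt_by_8_out_of_order();
; else                           /* carry possible: swap, add, swap back */
;     encrypt_by_8_in_order();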
1797
1798%%_encrypt_by_8_new:
1799 cmp r15d, 255-8
1800 jg %%_encrypt_by_8
1801
1802
1803
1804 add r15b, 8
1805 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
1806 add %%DATA_OFFSET, 128
1807 sub r13, 128
1808 jne %%_encrypt_by_8_new
1809
1810 vpshufb xmm9, [SHUF_MASK]
1811 jmp %%_eight_cipher_left
1812
1813%%_encrypt_by_8:
1814 vpshufb xmm9, [SHUF_MASK]
1815 add r15b, 8
1816 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
1817 vpshufb xmm9, [SHUF_MASK]
1818 add %%DATA_OFFSET, 128
1819 sub r13, 128
1820 jne %%_encrypt_by_8_new
1821
1822 vpshufb xmm9, [SHUF_MASK]
1823
1824
1825
1826
1827%%_eight_cipher_left:
1828 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
1829
1830
1831%%_zero_cipher_left:
1832        vmovdqu [%%GDATA_CTX + AadHash], xmm14          ; ctx_data.aad_hash = xmm14
1833 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
1834
1835 mov r13, r10
1836 and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
1837
1838 je %%_multiple_of_16_bytes
1839
1840        mov     [%%GDATA_CTX + PBlockLen], r13          ; ctx_data.partial_block_length = r13
1841        ; handle the last <16 Byte block separately
1842
1843 vpaddd xmm9, [ONE] ; INCR CNT to get Yn
1844        vmovdqu [%%GDATA_CTX + CurCount], xmm9          ; ctx_data.current_counter = xmm9
1845 vpshufb xmm9, [SHUF_MASK]
1846 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
1847 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
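;
; E(K, Yn) is kept in the context so a later update call can resume this
; partial block via PARTIAL_BLOCK. The tail bytes handled below combine as
; (sketch; off/rem/ekyn are hypothetical names):
;
; for (i = 0; i < rem; i++)              /* rem = r13 = len % 16 */
;     out[off + i] = in[off + i] ^ ekyn[i];
;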
1848
1849 cmp %%PLAIN_CYPH_LEN, 16
1850 jge %%_large_enough_update
1851
1852 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1853 READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
1854 lea r12, [SHIFT_MASK + 16]
1855 sub r12, r13
1856 jmp %%_data_read
1857
1858%%_large_enough_update:
1859 sub %%DATA_OFFSET, 16
1860 add %%DATA_OFFSET, r13
1861
1862 vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
1863
1864 sub %%DATA_OFFSET, r13
1865 add %%DATA_OFFSET, 16
1866
1867
1868 lea r12, [SHIFT_MASK + 16]
1869 sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
1870
1871 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
1872 vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
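;
; The SHIFT_MASK trick above: the 16-byte load ends exactly at the end of the
; input, so only the top r13 bytes of xmm1 are new data; vpshufb with the
; offset mask moves them down to the low lanes. A scalar model (sketch only):
;
; memmove(blk, blk + (16 - rem), rem);   /* rem = r13 */
;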
1873%%_data_read:
1874%ifidn %%ENC_DEC, DEC
1875 vmovdqa xmm2, xmm1
1876 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1877 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1878 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1879 vpand xmm2, xmm1
1880 vpshufb xmm2, [SHUF_MASK]
1881 vpxor xmm14, xmm2
1882 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1883
1884%else
1885 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1886 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1887 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1888 vpshufb xmm9, [SHUF_MASK]
1889 vpxor xmm14, xmm9
1890 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1891
1892 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
1893%endif
1894
1895 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1896 ; output r13 Bytes
1897 vmovq rax, xmm9
1898 cmp r13, 8
1899 jle %%_less_than_8_bytes_left
1900
1901 mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
1902 add %%DATA_OFFSET, 8
1903 vpsrldq xmm9, xmm9, 8
1904 vmovq rax, xmm9
1905 sub r13, 8
1906
1907%%_less_than_8_bytes_left:
1908 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
1909 add %%DATA_OFFSET, 1
1910 shr rax, 8
1911 sub r13, 1
1912 jne %%_less_than_8_bytes_left
1913 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1914
1915%%_multiple_of_16_bytes:
1916
1917
1918
1919%endmacro
1920
1921
1922;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1923; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
1924; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
1925; whether encrypting or decrypting (ENC_DEC).
1926; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
1927; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
1928;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1929%macro GCM_COMPLETE 5
1930%define %%GDATA_KEY %1
1931%define %%GDATA_CTX %2
1932%define %%AUTH_TAG %3
1933%define %%AUTH_TAG_LEN %4
1934%define %%ENC_DEC %5
1935%define %%PLAIN_CYPH_LEN rax
1936
1937 mov r12, [%%GDATA_CTX + PBlockLen]
1938 vmovdqu xmm14, [%%GDATA_CTX + AadHash]
1939 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
1940
1941 cmp r12, 0
1942
1943 je %%_partial_done
1944
1945 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
1946 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1947
1948%%_partial_done:
1949
1950 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
1951 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
1952
1953 shl r12, 3 ; convert into number of bits
1954 vmovd xmm15, r12d ; len(A) in xmm15
1955
1956        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
1957 vmovq xmm1, %%PLAIN_CYPH_LEN
1958 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
1959 vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
1960
1961 vpxor xmm14, xmm15
1962 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
1963 vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
1964
1965 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
1966
1967 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
1968
1969 vpxor xmm9, xmm14
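;
; At this point xmm9 holds the full 16-byte tag. In GCM terms (with both
; lengths in bits, as assembled into xmm15 above):
;
;   S = GHASH_H(padded AAD || padded ciphertext || len(A) || len(C))
;   T = E(K, Y0) XOR S
;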
1970
1971
1972%%_return_T:
1973 mov r10, %%AUTH_TAG ; r10 = authTag
1974 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
1975
1976 cmp r11, 16
1977 je %%_T_16
1978
1979 cmp r11, 12
1980 je %%_T_12
1981
1982 cmp r11, 8
1983 je %%_T_8
1984
1985 simd_store_avx r10, xmm9, r11, r12, rax
1986 jmp %%_return_T_done
1987%%_T_8:
1988 vmovq rax, xmm9
1989 mov [r10], rax
1990 jmp %%_return_T_done
1991%%_T_12:
1992 vmovq rax, xmm9
1993 mov [r10], rax
1994 vpsrldq xmm9, xmm9, 8
1995 vmovd eax, xmm9
1996 mov [r10 + 8], eax
1997 jmp %%_return_T_done
1998%%_T_16:
1999 vmovdqu [r10], xmm9
2000
2001%%_return_T_done:
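;
; The stores above are plain truncation of the 16-byte tag; in scalar terms
; (sketch): memcpy(auth_tag, T, auth_tag_len) for auth_tag_len in [1, 16],
; where 8-, 12- and 16-byte tags take the fast paths and any other length
; goes through simd_store_avx.
;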
2002
2003%ifdef SAFE_DATA
2004 ;; Clear sensitive data from context structure
2005 vpxor xmm0, xmm0
2006 vmovdqu [%%GDATA_CTX + AadHash], xmm0
2007 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
2008%endif
2009%endmacro ; GCM_COMPLETE
2010
2011
2012;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2013;void aes_gcm_precomp_128_avx_gen2
2014; (struct gcm_key_data *key_data);
2015;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2016MKGLOBAL(FN_NAME(precomp,_),function,)
2017FN_NAME(precomp,_):
2018
2019%ifdef SAFE_PARAM
2020 ;; Check key_data != NULL
2021 cmp arg1, 0
2022 jz exit_precomp
2023%endif
2024
2025 push r12
2026 push r13
2027 push r14
2028 push r15
2029
2030 mov r14, rsp
2031
2032
2033
2034 sub rsp, VARIABLE_OFFSET
2035 and rsp, ~63 ; align rsp to 64 bytes
2036
2037%ifidn __OUTPUT_FORMAT__, win64
2038 ; only xmm6 needs to be maintained
2039 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2040%endif
2041
2042 vpxor xmm6, xmm6
2043 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
2044
2045 vpshufb xmm6, [SHUF_MASK]
2046 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
2047 vmovdqa xmm2, xmm6
2048 vpsllq xmm6, 1
2049 vpsrlq xmm2, 63
2050 vmovdqa xmm1, xmm2
2051 vpslldq xmm2, xmm2, 8
2052 vpsrldq xmm1, xmm1, 8
2053 vpor xmm6, xmm2
2054 ;reduction
2055 vpshufd xmm2, xmm1, 00100100b
2056 vpcmpeqd xmm2, [TWOONE]
2057 vpand xmm2, [POLY]
2058 vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
2059 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2060 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
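;
; A scalar model of the HashKey<<1 mod poly step above (an illustrative
; sketch, not part of this library; uint64_t from <stdint.h>):
;
; typedef struct { uint64_t lo, hi; } u128;
;
; static u128 hashkey_shl1_mod_poly(u128 h)
; {
;     int carry = (int)(h.hi >> 63);      /* bit shifted out of x^127 */
;     h.hi = (h.hi << 1) | (h.lo >> 63);  /* 128-bit left shift by 1  */
;     h.lo <<= 1;
;     if (carry) {                        /* reduce with POLY         */
;         h.hi ^= 0xC200000000000000ULL;
;         h.lo ^= 0x0000000000000001ULL;
;     }
;     return h;
; }
;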
2061
2062
2063 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
2064
2065%ifidn __OUTPUT_FORMAT__, win64
2066 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2067%endif
2068 mov rsp, r14
2069
2070 pop r15
2071 pop r14
2072 pop r13
2073 pop r12
2074
2075%ifdef SAFE_DATA
2076 clear_scratch_gps_asm
2077 clear_scratch_xmms_avx_asm
2078%endif
2079exit_precomp:
2080
2081 ret
2082
2083;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2084;void aes_gcm_init_128_avx_gen2(
2085; const struct gcm_key_data *key_data,
2086; struct gcm_context_data *context_data,
2087; u8 *iv,
2088; const u8 *aad,
2089; u64 aad_len);
2090;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2091MKGLOBAL(FN_NAME(init,_),function,)
2092FN_NAME(init,_):
2093 push r12
2094 push r13
2095%ifidn __OUTPUT_FORMAT__, win64
2096 push r14
2097 push r15
2098 mov r14, rsp
2099 ; xmm6:xmm15 need to be maintained for Windows
2100 sub rsp, 1*16
2101 movdqu [rsp + 0*16], xmm6
2102%endif
2103
2104%ifdef SAFE_PARAM
2105 ;; Check key_data != NULL
2106 cmp arg1, 0
2107 jz exit_init
2108
2109 ;; Check context_data != NULL
2110 cmp arg2, 0
2111 jz exit_init
2112
2113 ;; Check IV != NULL
2114 cmp arg3, 0
2115 jz exit_init
2116
2117 ;; Check if aad_len == 0
2118 cmp arg5, 0
2119 jz skip_aad_check_init
2120
2121 ;; Check aad != NULL (aad_len != 0)
2122 cmp arg4, 0
2123 jz exit_init
2124
2125skip_aad_check_init:
2126%endif
2127 GCM_INIT arg1, arg2, arg3, arg4, arg5
2128
2129%ifdef SAFE_DATA
2130 clear_scratch_gps_asm
2131 clear_scratch_xmms_avx_asm
2132%endif
2133exit_init:
2134
2135%ifidn __OUTPUT_FORMAT__, win64
2136 movdqu xmm6 , [rsp + 0*16]
2137 mov rsp, r14
2138 pop r15
2139 pop r14
2140%endif
2141 pop r13
2142 pop r12
2143 ret
2144
2145
2146;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2147;void aes_gcm_enc_128_update_avx_gen2(
2148; const struct gcm_key_data *key_data,
2149; struct gcm_context_data *context_data,
2150; u8 *out,
2151; const u8 *in,
2152; u64 plaintext_len);
2153;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2154MKGLOBAL(FN_NAME(enc,_update_),function,)
2155FN_NAME(enc,_update_):
2156
2157 FUNC_SAVE
2158
2159%ifdef SAFE_PARAM
2160 ;; Check key_data != NULL
2161 cmp arg1, 0
2162 jz exit_update_enc
2163
2164 ;; Check context_data != NULL
2165 cmp arg2, 0
2166 jz exit_update_enc
2167
2168 ;; Check if plaintext_len == 0
2169 cmp arg5, 0
2170 jz skip_in_out_check_update_enc
2171
2172 ;; Check out != NULL (plaintext_len != 0)
2173 cmp arg3, 0
2174 jz exit_update_enc
2175
2176 ;; Check in != NULL (plaintext_len != 0)
2177 cmp arg4, 0
2178 jz exit_update_enc
2179
2180skip_in_out_check_update_enc:
2181%endif
2182 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
2183
2184exit_update_enc:
2185 FUNC_RESTORE
2186
2187 ret
2188
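;
; Typical streaming use of the init/update/finalize entry points (a sketch;
; buffers and lengths are illustrative, and key_data is assumed to have been
; prepared beforehand, e.g. AES key expansion plus the precomp routine above):
;
; struct gcm_key_data key;
; struct gcm_context_data ctx;
;
; aes_gcm_init_128_avx_gen2(&key, &ctx, iv, aad, aad_len);
; aes_gcm_enc_128_update_avx_gen2(&key, &ctx, out, in, n1);
; aes_gcm_enc_128_update_avx_gen2(&key, &ctx, out + n1, in + n1, n2);
; aes_gcm_enc_128_finalize_avx_gen2(&key, &ctx, tag, tag_len); /* tag_len <= 16 */
;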
2189
2190;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2191;void aes_gcm_dec_128_update_avx_gen2(
2192; const struct gcm_key_data *key_data,
2193; struct gcm_context_data *context_data,
2194; u8 *out,
2195; const u8 *in,
2196; u64 plaintext_len);
2197;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2198MKGLOBAL(FN_NAME(dec,_update_),function,)
2199FN_NAME(dec,_update_):
2200
2201 FUNC_SAVE
2202
2203%ifdef SAFE_PARAM
2204 ;; Check key_data != NULL
2205 cmp arg1, 0
2206 jz exit_update_dec
2207
2208 ;; Check context_data != NULL
2209 cmp arg2, 0
2210 jz exit_update_dec
2211
2212 ;; Check if plaintext_len == 0
2213 cmp arg5, 0
2214 jz skip_in_out_check_update_dec
2215
2216 ;; Check out != NULL (plaintext_len != 0)
2217 cmp arg3, 0
2218 jz exit_update_dec
2219
2220 ;; Check in != NULL (plaintext_len != 0)
2221 cmp arg4, 0
2222 jz exit_update_dec
2223
2224skip_in_out_check_update_dec:
2225%endif
2226
2227 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
2228
2229exit_update_dec:
2230 FUNC_RESTORE
2231
2232 ret
2233
2234
2235;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2236;void aes_gcm_enc_128_finalize_avx_gen2(
2237; const struct gcm_key_data *key_data,
2238; struct gcm_context_data *context_data,
2239; u8 *auth_tag,
2240; u64 auth_tag_len);
2241;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2242MKGLOBAL(FN_NAME(enc,_finalize_),function,)
2243FN_NAME(enc,_finalize_):
2244
2245%ifdef SAFE_PARAM
2246 ;; Check key_data != NULL
2247 cmp arg1, 0
2248 jz exit_enc_fin
2249
2250 ;; Check context_data != NULL
2251 cmp arg2, 0
2252 jz exit_enc_fin
2253
2254 ;; Check auth_tag != NULL
2255 cmp arg3, 0
2256 jz exit_enc_fin
2257
2258 ;; Check auth_tag_len == 0 or > 16
2259 cmp arg4, 0
2260 jz exit_enc_fin
2261
2262 cmp arg4, 16
2263 ja exit_enc_fin
2264%endif
2265 push r12
2266
2267%ifidn __OUTPUT_FORMAT__, win64
2268 ; xmm6:xmm15 need to be maintained for Windows
2269 sub rsp, 5*16
2270 vmovdqu [rsp + 0*16],xmm6
2271 vmovdqu [rsp + 1*16],xmm9
2272 vmovdqu [rsp + 2*16],xmm11
2273 vmovdqu [rsp + 3*16],xmm14
2274 vmovdqu [rsp + 4*16],xmm15
2275%endif
2276 GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
2277
2278%ifidn __OUTPUT_FORMAT__, win64
2279 vmovdqu xmm15 , [rsp + 4*16]
2280 vmovdqu xmm14 , [rsp + 3*16]
2281 vmovdqu xmm11 , [rsp + 2*16]
2282 vmovdqu xmm9 , [rsp + 1*16]
2283 vmovdqu xmm6 , [rsp + 0*16]
2284 add rsp, 5*16
2285%endif
2286
2287 pop r12
2288
2289%ifdef SAFE_DATA
2290 clear_scratch_gps_asm
2291 clear_scratch_xmms_avx_asm
2292%endif
2293exit_enc_fin:
2294 ret
2295
2296
2297;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2298;void aes_gcm_dec_128_finalize_avx_gen2(
2299; const struct gcm_key_data *key_data,
2300; struct gcm_context_data *context_data,
2301; u8 *auth_tag,
2302; u64 auth_tag_len);
2303;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2304MKGLOBAL(FN_NAME(dec,_finalize_),function,)
2305FN_NAME(dec,_finalize_):
2306
2307%ifdef SAFE_PARAM
2308 ;; Check key_data != NULL
2309 cmp arg1, 0
2310 jz exit_dec_fin
2311
2312 ;; Check context_data != NULL
2313 cmp arg2, 0
2314 jz exit_dec_fin
2315
2316 ;; Check auth_tag != NULL
2317 cmp arg3, 0
2318 jz exit_dec_fin
2319
2320 ;; Check auth_tag_len == 0 or > 16
2321 cmp arg4, 0
2322 jz exit_dec_fin
2323
2324 cmp arg4, 16
2325 ja exit_dec_fin
2326%endif
2327
2328 push r12
2329
2330%ifidn __OUTPUT_FORMAT__, win64
2331 ; xmm6:xmm15 need to be maintained for Windows
2332 sub rsp, 5*16
2333 vmovdqu [rsp + 0*16],xmm6
2334 vmovdqu [rsp + 1*16],xmm9
2335 vmovdqu [rsp + 2*16],xmm11
2336 vmovdqu [rsp + 3*16],xmm14
2337 vmovdqu [rsp + 4*16],xmm15
2338%endif
2339 GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
2340
2341%ifidn __OUTPUT_FORMAT__, win64
2342 vmovdqu xmm15 , [rsp + 4*16]
2343 vmovdqu xmm14 , [rsp + 3*16]
2344 vmovdqu xmm11 , [rsp + 2*16]
2345 vmovdqu xmm9 , [rsp + 1*16]
2346 vmovdqu xmm6 , [rsp + 0*16]
2347 add rsp, 5*16
2348%endif
2349
2350 pop r12
2351
2352%ifdef SAFE_DATA
2353 clear_scratch_gps_asm
2354 clear_scratch_xmms_avx_asm
2355%endif
2356exit_dec_fin:
2357 ret
2358
2359
2360;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2361;void aes_gcm_enc_128_avx_gen2(
2362; const struct gcm_key_data *key_data,
2363; struct gcm_context_data *context_data,
2364; u8 *out,
2365; const u8 *in,
2366; u64 plaintext_len,
2367; u8 *iv,
2368; const u8 *aad,
2369; u64 aad_len,
2370; u8 *auth_tag,
2371; u64 auth_tag_len);
2372;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2373MKGLOBAL(FN_NAME(enc,_),function,)
2374FN_NAME(enc,_):
2375
2376 FUNC_SAVE
2377
2378%ifdef SAFE_PARAM
2379 ;; Check key_data != NULL
2380 cmp arg1, 0
2381 jz exit_enc
2382
2383 ;; Check context_data != NULL
2384 cmp arg2, 0
2385 jz exit_enc
2386
2387 ;; Check IV != NULL
2388 cmp arg6, 0
2389 jz exit_enc
2390
2391 ;; Check auth_tag != NULL
2392 cmp arg9, 0
2393 jz exit_enc
2394
2395 ;; Check auth_tag_len == 0 or > 16
2396 cmp arg10, 0
2397 jz exit_enc
2398
2399 cmp arg10, 16
2400 ja exit_enc
2401
2402 ;; Check if plaintext_len == 0
2403 cmp arg5, 0
2404 jz skip_in_out_check_enc
2405
2406 ;; Check out != NULL (plaintext_len != 0)
2407 cmp arg3, 0
2408 jz exit_enc
2409
2410 ;; Check in != NULL (plaintext_len != 0)
2411 cmp arg4, 0
2412 jz exit_enc
2413
2414skip_in_out_check_enc:
2415 ;; Check if aad_len == 0
2416 cmp arg8, 0
2417 jz skip_aad_check_enc
2418
2419 ;; Check aad != NULL (aad_len != 0)
2420 cmp arg7, 0
2421 jz exit_enc
2422
2423skip_aad_check_enc:
2424%endif
2425 GCM_INIT arg1, arg2, arg6, arg7, arg8
2426
2427 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
2428
2429 GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
2430
2431exit_enc:
2432 FUNC_RESTORE
2433
2434 ret
2435
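;
; Single-shot equivalent of the init/update/finalize sequence (sketch;
; arguments mirror the prototype above):
;
; aes_gcm_enc_128_avx_gen2(&key, &ctx, out, in, plaintext_len,
;                          iv, aad, aad_len, tag, tag_len);
;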
2436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2437;void aes_gcm_dec_128_avx_gen2(
2438; const struct gcm_key_data *key_data,
2439; struct gcm_context_data *context_data,
2440; u8 *out,
2441; const u8 *in,
2442; u64 plaintext_len,
2443; u8 *iv,
2444; const u8 *aad,
2445; u64 aad_len,
2446; u8 *auth_tag,
2447; u64 auth_tag_len);
2448;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2449MKGLOBAL(FN_NAME(dec,_),function,)
2450FN_NAME(dec,_):
2451
2452 FUNC_SAVE
2453
2454%ifdef SAFE_PARAM
2455 ;; Check key_data != NULL
2456 cmp arg1, 0
2457 jz exit_dec
2458
2459 ;; Check context_data != NULL
2460 cmp arg2, 0
2461 jz exit_dec
2462
2463 ;; Check IV != NULL
2464 cmp arg6, 0
2465 jz exit_dec
2466
2467 ;; Check auth_tag != NULL
2468 cmp arg9, 0
2469 jz exit_dec
2470
2471 ;; Check auth_tag_len == 0 or > 16
2472 cmp arg10, 0
2473 jz exit_dec
2474
2475 cmp arg10, 16
2476 ja exit_dec
2477
2478 ;; Check if plaintext_len == 0
2479 cmp arg5, 0
2480 jz skip_in_out_check_dec
2481
2482 ;; Check out != NULL (plaintext_len != 0)
2483 cmp arg3, 0
2484 jz exit_dec
2485
2486 ;; Check in != NULL (plaintext_len != 0)
2487 cmp arg4, 0
2488 jz exit_dec
2489
2490skip_in_out_check_dec:
2491 ;; Check if aad_len == 0
2492 cmp arg8, 0
2493 jz skip_aad_check_dec
2494
2495 ;; Check aad != NULL (aad_len != 0)
2496 cmp arg7, 0
2497 jz exit_dec
2498
2499skip_aad_check_dec:
2500%endif
2501
2502 GCM_INIT arg1, arg2, arg6, arg7, arg8
2503
2504 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
2505
2506 GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
2507
2508exit_dec:
2509 FUNC_RESTORE
2510
2511 ret
2512
2513%ifdef LINUX
2514section .note.GNU-stack noalloc noexec nowrite progbits
2515%endif