1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2018, Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
5; modification, are permitted provided that the following conditions
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;
31; Authors:
32; Erdinc Ozturk
33; Vinodh Gopal
34; James Guilford
35;
36;
37; References:
38; This code was derived and highly optimized from the code described in the paper:
39; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010.
40; The details of the implementation are explained in:
41; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
42;
43;
44;
45;
46; Assumptions:
47;
48;
49;
50; iv:
51; 0 1 2 3
52; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
53; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
54; | Salt (From the SA) |
55; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
56; | Initialization Vector |
57; | (This is the sequence number from IPSec header) |
58; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
59; | 0x1 |
60; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
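;
; A minimal sketch of how a caller could assemble this 16-byte counter block
; (salt_ptr, iv_ptr and ctr_one are illustrative labels, not defined in this
; file):
;
;       vmovd    xmm0, [salt_ptr]             ; bytes  0..3  = salt
;       vpinsrd  xmm0, xmm0, [iv_ptr], 1      ; bytes  4..7  = IV[0..3]
;       vpinsrd  xmm0, xmm0, [iv_ptr + 4], 2  ; bytes  8..11 = IV[4..7]
;       vpinsrd  xmm0, xmm0, [ctr_one], 3     ; bytes 12..15 = 0x1 (big-endian)
;
; where ctr_one would be the dword constant 0x01000000, i.e. the value 1
; stored big-endian.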
61;
62;
63;
64; AAD:
65; AAD will be padded with 0 to the next 16-byte multiple
66; for example, assume AAD is a u32 vector
67;
68; if AAD is 8 bytes:
69; AAD[2] = {A0, A1};
70; padded AAD in xmm register = {A1 A0 0 0}
71;
72; 0 1 2 3
73; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
74; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
75; | SPI (A1) |
76; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
77; | 32-bit Sequence Number (A0) |
78; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
79; | 0x0 |
80; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81;
82; AAD Format with 32-bit Sequence Number
83;
84; if AAD is 12 bytes:
85; AAD[3] = {A0, A1, A2};
86; padded AAD in xmm register = {A2 A1 A0 0}
87;
88; 0 1 2 3
89; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
90; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
91; | SPI (A2) |
92; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93; | 64-bit Extended Sequence Number {A1,A0} |
94; | |
95; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
96; | 0x0 |
97; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98;
99; AAD Format with 64-bit Extended Sequence Number
100;
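;
; A minimal sketch of the zero-padding for the 12-byte case above (aad_ptr is
; an illustrative label, not defined in this file):
;
;       vpxor    xmm0, xmm0, xmm0               ; clear, so pad bytes are 0
;       vpinsrq  xmm0, xmm0, [aad_ptr], 0       ; bytes 0..7  = {A0, A1}
;       vpinsrd  xmm0, xmm0, [aad_ptr + 8], 2   ; bytes 8..11 = A2
;                                               ; bytes 12..15 stay 0x0
;
; In this file the equivalent packing is done by READ_SMALL_DATA_INPUT, and
; the block is byte-reflected with SHUF_MASK before it enters GHASH.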
101;
102; aadLen:
103; From the definition of the spec, aadLen must be a multiple of 4 bytes.
104; The code additionally supports aadLen of any length.
105;
106; TLen:
107; From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
108;
109; poly = x^128 + x^127 + x^126 + x^121 + 1
110; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
111;
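; Each GHASH multiplication below is computed schoolbook-style on 64-bit
; halves with four carry-less multiplies; writing an operand as
; a = a1*x^64 + a0 (addition here is XOR in GF(2)):
;
;     a*b = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
;
; which maps onto the vpclmulqdq immediates 0x11, 0x01, 0x10 and 0x00 used
; throughout this file; the 256-bit product is then folded back to 128 bits
; by the two-phase reduction against POLY2.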
112
113%include "os.asm"
114%include "reg_sizes.asm"
115%include "gcm_defines.asm"
116%include "memcpy.asm"
117
118%ifndef GCM128_MODE
119%ifndef GCM192_MODE
120%ifndef GCM256_MODE
121%error "No GCM mode selected for gcm_avx_gen4.asm!"
122%endif
123%endif
124%endif
125
126;; Decide on AES-GCM key size to compile for
127%ifdef GCM128_MODE
128%define NROUNDS 9
129%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4
130%endif
131
132%ifdef GCM192_MODE
133%define NROUNDS 11
134%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4
135%endif
136
137%ifdef GCM256_MODE
138%define NROUNDS 13
139%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4
140%endif
141
142section .text
143default rel
144
145; need to push 4 registers onto the stack; stack accesses below are offset by 8*4 (STACK_OFFSET) to account for them
146%define STACK_OFFSET 8*4
147
148%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
149%define TMP3 16*1 ; Temporary storage for AES State 3
150%define TMP4 16*2 ; Temporary storage for AES State 4
151%define TMP5 16*3 ; Temporary storage for AES State 5
152%define TMP6 16*4 ; Temporary storage for AES State 6
153%define TMP7 16*5 ; Temporary storage for AES State 7
154%define TMP8 16*6 ; Temporary storage for AES State 8
155
156%define LOCAL_STORAGE 16*7
157
158%ifidn __OUTPUT_FORMAT__, win64
159 %define XMM_STORAGE 16*10
160%else
161 %define XMM_STORAGE 0
162%endif
163
164%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
165
166;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
167; Utility Macros
168;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
169
170;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
171; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
172; Input: A and B (128-bits each, bit-reflected)
173; Output: C = A*B*x mod poly, (i.e. >>1 )
174; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
175; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
176;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
177%macro GHASH_MUL 7
178%define %%GH %1 ; 16 Bytes
179%define %%HK %2 ; 16 Bytes
180%define %%T1 %3
181%define %%T2 %4
182%define %%T3 %5
183%define %%T4 %6
184%define %%T5 %7
185 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
186
187 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
188 vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
189 vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
190 vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
191 vpxor %%GH, %%GH, %%T3
192
193
194 vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
195 vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
196
197 vpxor %%T1, %%T1, %%T3
198 vpxor %%GH, %%GH, %%T2
199
200 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
201 ;first phase of the reduction
202 vmovdqa %%T3, [rel POLY2]
203
204 vpclmulqdq %%T2, %%T3, %%GH, 0x01
205 vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
206
207 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
208 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
209 ;second phase of the reduction
210 vpclmulqdq %%T2, %%T3, %%GH, 0x00
211 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
212
213 vpclmulqdq %%GH, %%T3, %%GH, 0x10
214 vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
215
216 vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
217 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
218 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
219
220%endmacro
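
; Illustrative use (register choice is arbitrary): one GHASH update of an
; accumulator in xmm0 by a hash key (already <<1 mod poly) in xmm1, with
; xmm2-xmm6 as scratch:
;
;       GHASH_MUL  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6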
221
222
223; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
224; functions, but are kept to allow users to switch cpu architectures between calls
225; of pre, init, update, and finalize.
226%macro PRECOMPUTE 8
227%define %%GDATA %1
228%define %%HK %2
229%define %%T1 %3
230%define %%T2 %4
231%define %%T3 %5
232%define %%T4 %6
233%define %%T5 %7
234%define %%T6 %8
235
236 ; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
237 vmovdqa %%T5, %%HK
238
239 vpshufd %%T1, %%T5, 01001110b
240 vpxor %%T1, %%T5
241 vmovdqu [%%GDATA + HashKey_k], %%T1
242
243 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
244 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
245 vpshufd %%T1, %%T5, 01001110b
246 vpxor %%T1, %%T5
247 vmovdqu [%%GDATA + HashKey_2_k], %%T1
248
249 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
250 vmovdqu [%%GDATA + HashKey_3], %%T5
251 vpshufd %%T1, %%T5, 01001110b
252 vpxor %%T1, %%T5
253 vmovdqu [%%GDATA + HashKey_3_k], %%T1
254
255 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
256 vmovdqu [%%GDATA + HashKey_4], %%T5
257 vpshufd %%T1, %%T5, 01001110b
258 vpxor %%T1, %%T5
259 vmovdqu [%%GDATA + HashKey_4_k], %%T1
260
261 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
262 vmovdqu [%%GDATA + HashKey_5], %%T5
263 vpshufd %%T1, %%T5, 01001110b
264 vpxor %%T1, %%T5
265 vmovdqu [%%GDATA + HashKey_5_k], %%T1
266
267 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
268 vmovdqu [%%GDATA + HashKey_6], %%T5
269 vpshufd %%T1, %%T5, 01001110b
270 vpxor %%T1, %%T5
271 vmovdqu [%%GDATA + HashKey_6_k], %%T1
272
273 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
274 vmovdqu [%%GDATA + HashKey_7], %%T5
275 vpshufd %%T1, %%T5, 01001110b
276 vpxor %%T1, %%T5
277 vmovdqu [%%GDATA + HashKey_7_k], %%T1
278
279 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
280 vmovdqu [%%GDATA + HashKey_8], %%T5
281 vpshufd %%T1, %%T5, 01001110b
282 vpxor %%T1, %%T5
283 vmovdqu [%%GDATA + HashKey_8_k], %%T1
284%endmacro
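
; Illustrative use (register/pointer choice is arbitrary): with the key
; structure pointer in rax and HashKey<<1 mod poly already loaded into xmm6,
; fill the HashKey_2..HashKey_8 powers and the _k entries:
;
;       PRECOMPUTE  rax, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5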
285
286
287;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
288; READ_SMALL_DATA_INPUT: Packs an xmm register with data when the input is less than 16 bytes.
289; Returns 0 if data has length 0.
290; Input: The input data (INPUT), that data's length (LENGTH).
291; Output: The packed xmm register (OUTPUT).
292;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293%macro READ_SMALL_DATA_INPUT 6
294%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
295%define %%INPUT %2
296%define %%LENGTH %3
297%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
298%define %%COUNTER %5
299%define %%TMP1 %6
300
301 vpxor %%OUTPUT, %%OUTPUT
302 mov %%COUNTER, %%LENGTH
303 mov %%END_READ_LOCATION, %%INPUT
304 add %%END_READ_LOCATION, %%LENGTH
305 xor %%TMP1, %%TMP1
306
307
308 cmp %%COUNTER, 8
309 jl %%_byte_loop_2
310 vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
311 je %%_done
312
313 sub %%COUNTER, 8
314
315%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
316 shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
317 dec %%END_READ_LOCATION
318 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
319 dec %%COUNTER
320 jg %%_byte_loop_1
321 vpinsrq %%OUTPUT, %%TMP1, 1
322 jmp %%_done
323
324%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
325 ;; NOTE: in current implementation check for zero length is obsolete here.
326 ;; The adequate checks are done by callers of this macro.
327 ;; cmp %%COUNTER, 0
328 ;; je %%_done
329 shl %%TMP1, 8 ;This loop handles when no bytes were already read in
330 dec %%END_READ_LOCATION
331 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
332 dec %%COUNTER
333 jg %%_byte_loop_2
334 vpinsrq %%OUTPUT, %%TMP1, 0
335%%_done:
336
337%endmacro ; READ_SMALL_DATA_INPUT
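
; Illustrative use (register choice is arbitrary): pack up to 15 bytes from
; [r10], with the length in r13 and rax, r12, r15 as temporaries, into xmm1:
;
;       READ_SMALL_DATA_INPUT  xmm1, r10, r13, rax, r12, r15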
338
339
340;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
341; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
342; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
343; Output: The hash of the data (AAD_HASH).
344;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
345%macro CALC_AAD_HASH 15
346%define %%A_IN %1
347%define %%A_LEN %2
348%define %%AAD_HASH %3
349%define %%GDATA_KEY %4
350%define %%XTMP0 %5 ; xmm temp reg 0
351%define %%XTMP1 %6 ; xmm temp reg 1
352%define %%XTMP2 %7
353%define %%XTMP3 %8
354%define %%XTMP4 %9
355%define %%XTMP5 %10 ; xmm temp reg 5
356%define %%T1 %11 ; temp reg 1
357%define %%T2 %12
358%define %%T3 %13
359%define %%T4 %14
360%define %%T5 %15 ; temp reg 5
361
362
363 mov %%T1, %%A_IN ; T1 = AAD
364 mov %%T2, %%A_LEN ; T2 = aadLen
365 vpxor %%AAD_HASH, %%AAD_HASH
366
367%%_get_AAD_loop128:
368 cmp %%T2, 128
369 jl %%_exit_AAD_loop128
370
371 vmovdqu %%XTMP0, [%%T1 + 16*0]
372 vpshufb %%XTMP0, [rel SHUF_MASK]
373
374 vpxor %%XTMP0, %%AAD_HASH
375
376 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
377 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
378 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
379 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
380 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
381 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
382
383%assign i 1
384%assign j 7
385%rep 7
386 vmovdqu %%XTMP0, [%%T1 + 16*i]
387 vpshufb %%XTMP0, [rel SHUF_MASK]
388
389 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
390 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
391 vpxor %%XTMP1, %%XTMP1, %%XTMP4
392
393 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
394 vpxor %%XTMP2, %%XTMP2, %%XTMP4
395
396 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
397 vpxor %%XTMP3, %%XTMP3, %%XTMP4
398 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
399 vpxor %%XTMP3, %%XTMP3, %%XTMP4
400%assign i (i + 1)
401%assign j (j - 1)
402%endrep
403
404 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
405 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
406 vpxor %%XTMP2, %%XTMP2, %%XTMP4
407 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
408
409 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
410 ;first phase of the reduction
411 vmovdqa %%XTMP5, [rel POLY2]
412 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
413 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
414 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
415
416 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
417 ;second phase of the reduction
418 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
419 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
420
421 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
422 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
423
424 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
425 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
426 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
427
428 sub %%T2, 128
429 je %%_CALC_AAD_done
430
431 add %%T1, 128
432 jmp %%_get_AAD_loop128
433
434%%_exit_AAD_loop128:
435 cmp %%T2, 16
436 jl %%_get_small_AAD_block
437
438 ;; calculate hash_key position to start with
439 mov %%T3, %%T2
440 and %%T3, -16 ; 1 to 7 blocks possible here
441 neg %%T3
442 add %%T3, HashKey_1 + 16
443 lea %%T3, [%%GDATA_KEY + %%T3]
444
445 vmovdqu %%XTMP0, [%%T1]
446 vpshufb %%XTMP0, [rel SHUF_MASK]
447
448 vpxor %%XTMP0, %%AAD_HASH
449
450 vmovdqu %%XTMP5, [%%T3]
451 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
452 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
453 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
454 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
455 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
456
457 add %%T3, 16 ; move to next hashkey
458 add %%T1, 16 ; move to next data block
459 sub %%T2, 16
460 cmp %%T2, 16
461 jl %%_AAD_reduce
462
463%%_AAD_blocks:
464 vmovdqu %%XTMP0, [%%T1]
465 vpshufb %%XTMP0, [rel SHUF_MASK]
466
467 vmovdqu %%XTMP5, [%%T3]
468 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
469 vpxor %%XTMP1, %%XTMP1, %%XTMP4
470
471 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
472 vpxor %%XTMP2, %%XTMP2, %%XTMP4
473
474 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
475 vpxor %%XTMP3, %%XTMP3, %%XTMP4
476 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
477 vpxor %%XTMP3, %%XTMP3, %%XTMP4
478
479 add %%T3, 16 ; move to next hashkey
480 add %%T1, 16
481 sub %%T2, 16
482 cmp %%T2, 16
483 jl %%_AAD_reduce
484 jmp %%_AAD_blocks
485
486%%_AAD_reduce:
487 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
488 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
489 vpxor %%XTMP2, %%XTMP2, %%XTMP4
490 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
491
492 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
493 ;first phase of the reduction
494 vmovdqa %%XTMP5, [rel POLY2]
495 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
496 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
497 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
498
499 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
500 ;second phase of the reduction
501 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
502 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
503
504 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
505 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
506
507 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
508 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
509 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
510
511 or %%T2, %%T2
512 je %%_CALC_AAD_done
513
514%%_get_small_AAD_block:
515 vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
516 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
517 ;byte-reflect the AAD data
518 vpshufb %%XTMP1, [rel SHUF_MASK]
519 vpxor %%AAD_HASH, %%XTMP1
520 GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
521
522%%_CALC_AAD_done:
523
524%endmacro ; CALC_AAD_HASH
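
; Illustrative use (register choice is arbitrary): hash the AAD at [r10], of
; length r11, into xmm14, with the key data pointer in rax, xmm0-xmm5 as xmm
; temporaries and r12, r13, r14, r15, rbx as GP temporaries:
;
;       CALC_AAD_HASH  r10, r11, xmm14, rax, \
;                      xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, \
;                      r12, r13, r14, r15, rbx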
525
526
527
528;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
529; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
530; Requires that the input data be at least 1 byte long.
531; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
532; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
533; and whether encoding or decoding (ENC_DEC)
534; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
535; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
536;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
537%macro PARTIAL_BLOCK 8
538%define %%GDATA_KEY %1
539%define %%GDATA_CTX %2
540%define %%CYPH_PLAIN_OUT %3
541%define %%PLAIN_CYPH_IN %4
542%define %%PLAIN_CYPH_LEN %5
543%define %%DATA_OFFSET %6
544%define %%AAD_HASH %7
545%define %%ENC_DEC %8
546
547 mov r13, [%%GDATA_CTX + PBlockLen]
548 cmp r13, 0
549 je %%_partial_block_done ;Leave Macro if no partial blocks
550
551 cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
552 jl %%_fewer_than_16_bytes
553 VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
554 jmp %%_data_read
555
556%%_fewer_than_16_bytes:
557 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
558 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
559
560%%_data_read: ;Finished reading in data
561
562
563 vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
564 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
565
566 lea r12, [rel SHIFT_MASK]
567
568 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
569 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
570 vpshufb xmm9, xmm2 ;shift right r13 bytes
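        ;; Note: SHIFT_MASK and ALL_F are tables defined in gcm_defines.asm;
        ;; indexing SHIFT_MASK by r13 yields a vpshufb pattern that shifts
        ;; the block right by r13 bytes, and [r12 + ALL_F - SHIFT_MASK]
        ;; selects a byte mask of 0xFF's followed by 0x00's, used below to
        ;; zero the bytes that lie outside the partial block.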
571
572%ifidn %%ENC_DEC, DEC
573 vmovdqa xmm3, xmm1
574 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
575
576 mov r15, %%PLAIN_CYPH_LEN
577 add r15, r13
578 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
579 jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift the mask accordingly
580 sub r12, r15
581%%_no_extra_mask_1:
582
583 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
584 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
585
586 vpand xmm3, xmm1
587 vpshufb xmm3, [rel SHUF_MASK]
588 vpshufb xmm3, xmm2
589 vpxor %%AAD_HASH, xmm3
590
591
592 cmp r15,0
593 jl %%_partial_incomplete_1
594
595 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
596 xor rax,rax
597 mov [%%GDATA_CTX + PBlockLen], rax
598 jmp %%_dec_done
599%%_partial_incomplete_1:
600%ifidn __OUTPUT_FORMAT__, win64
601 mov rax, %%PLAIN_CYPH_LEN
602 add [%%GDATA_CTX + PBlockLen], rax
603%else
604 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
605%endif
606%%_dec_done:
607 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
608
609%else
610 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
611
612 mov r15, %%PLAIN_CYPH_LEN
613 add r15, r13
614 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
615 jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift the mask accordingly
616 sub r12, r15
617%%_no_extra_mask_2:
618
619 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
620 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
621
622 vpshufb xmm9, [rel SHUF_MASK]
623 vpshufb xmm9, xmm2
624 vpxor %%AAD_HASH, xmm9
625
626 cmp r15,0
627 jl %%_partial_incomplete_2
628
629 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
630 xor rax,rax
631 mov [%%GDATA_CTX + PBlockLen], rax
632 jmp %%_encode_done
633%%_partial_incomplete_2:
634%ifidn __OUTPUT_FORMAT__, win64
635 mov rax, %%PLAIN_CYPH_LEN
636 add [%%GDATA_CTX + PBlockLen], rax
637%else
638 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
639%endif
640%%_encode_done:
641 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
642
643 vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
644 vpshufb xmm9, xmm2
645%endif
646
647
648 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
649 ; output encrypted Bytes
650 cmp r15,0
651 jl %%_partial_fill
652 mov r12, r13
653 mov r13, 16
654 sub r13, r12 ; Set r13 to be the number of bytes to write out
655 jmp %%_count_set
656%%_partial_fill:
657 mov r13, %%PLAIN_CYPH_LEN
658%%_count_set:
659 vmovq rax, xmm9
660 cmp r13, 8
661 jle %%_less_than_8_bytes_left
662
663 mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
664 add %%DATA_OFFSET, 8
665 vpsrldq xmm9, xmm9, 8
666 vmovq rax, xmm9
667 sub r13, 8
668%%_less_than_8_bytes_left:
669 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
670 add %%DATA_OFFSET, 1
671 shr rax, 8
672 sub r13, 1
673 jne %%_less_than_8_bytes_left
674 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
675
676%%_partial_block_done:
677%endmacro ; PARTIAL_BLOCK
678
679
680%macro GHASH_SINGLE_MUL 9
681%define %%GDATA %1
682%define %%HASHKEY %2
683%define %%CIPHER %3
684%define %%STATE_11 %4
685%define %%STATE_00 %5
686%define %%STATE_MID %6
687%define %%T1 %7
688%define %%T2 %8
689%define %%FIRST %9
690
691 vmovdqu %%T1, [%%GDATA + %%HASHKEY]
692%ifidn %%FIRST, first
693 vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
694 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
695 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
696 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
697 vpxor %%STATE_MID, %%STATE_MID, %%T2
698%else
699 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
700 vpxor %%STATE_11, %%STATE_11, %%T2
701
702 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
703 vpxor %%STATE_00, %%STATE_00, %%T2
704
705 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
706 vpxor %%STATE_MID, %%STATE_MID, %%T2
707
708 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
709 vpxor %%STATE_MID, %%STATE_MID, %%T2
710%endif
711
712%endmacro
713
714; if a = number of total plaintext bytes
715; b = floor(a/16)
716; %%num_initial_blocks = b mod 8;
717; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
718; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
719; Updated AAD_HASH is returned in %%T3
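; Worked example: for a = 200 plaintext bytes, b = floor(200/16) = 12 full
; blocks, so %%num_initial_blocks = 12 mod 8 = 4; the remaining 8 full blocks
; then go through the 8-wide GHASH_8_ENCRYPT_8_PARALLEL loop and the final
; 8 bytes are handled as a partial block.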
720
721%macro INITIAL_BLOCKS 23
722%define %%GDATA_KEY %1
723%define %%CYPH_PLAIN_OUT %2
724%define %%PLAIN_CYPH_IN %3
725%define %%LENGTH %4
726%define %%DATA_OFFSET %5
727%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
728%define %%T1 %7
729%define %%T2 %8
730%define %%T3 %9
731%define %%T4 %10
732%define %%T5 %11
733%define %%CTR %12
734%define %%XMM1 %13
735%define %%XMM2 %14
736%define %%XMM3 %15
737%define %%XMM4 %16
738%define %%XMM5 %17
739%define %%XMM6 %18
740%define %%XMM7 %19
741%define %%XMM8 %20
742%define %%T6 %21
743%define %%T_key %22
744%define %%ENC_DEC %23
745
746%assign i (8-%%num_initial_blocks)
747 ;; Move AAD_HASH to temp reg
748 vmovdqu %%T2, %%XMM8
749 ;; Start AES for %%num_initial_blocks blocks
750 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
751
752%assign i (9-%%num_initial_blocks)
753%rep %%num_initial_blocks
754 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
755 vmovdqa reg(i), %%CTR
756 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
757%assign i (i+1)
758%endrep
759
760%if(%%num_initial_blocks>0)
761vmovdqu %%T_key, [%%GDATA_KEY+16*0]
762%assign i (9-%%num_initial_blocks)
763%rep %%num_initial_blocks
764 vpxor reg(i),reg(i),%%T_key
765%assign i (i+1)
766%endrep
767
768%assign j 1
769%rep NROUNDS
770vmovdqu %%T_key, [%%GDATA_KEY+16*j]
771%assign i (9-%%num_initial_blocks)
772%rep %%num_initial_blocks
773 vaesenc reg(i),%%T_key
774%assign i (i+1)
775%endrep
776
777%assign j (j+1)
778%endrep
779
780
781vmovdqu %%T_key, [%%GDATA_KEY+16*j]
782%assign i (9-%%num_initial_blocks)
783%rep %%num_initial_blocks
784 vaesenclast reg(i),%%T_key
785%assign i (i+1)
786%endrep
787
788%endif ; %if(%%num_initial_blocks>0)
789
790
791
792%assign i (9-%%num_initial_blocks)
793%rep %%num_initial_blocks
794 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
795 vpxor reg(i), reg(i), %%T1
796 ;; Write back ciphertext for %%num_initial_blocks blocks
797 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
798 add %%DATA_OFFSET, 16
799 %ifidn %%ENC_DEC, DEC
800 vmovdqa reg(i), %%T1
801 %endif
802 ;; Prepare ciphertext for GHASH computations
803 vpshufb reg(i), [rel SHUF_MASK]
804%assign i (i+1)
805%endrep
806
807
808;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
809
810%assign i (9-%%num_initial_blocks)
811%if(%%num_initial_blocks>0)
812 vmovdqa %%T3, reg(i)
813%assign i (i+1)
814%endif
815%if(%%num_initial_blocks>1)
816%rep %%num_initial_blocks-1
817 vmovdqu [rsp + TMP %+ i], reg(i)
818%assign i (i+1)
819%endrep
820%endif
821 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
822 ;; HashKey_i_k holds the XORed values of the low and high parts of
823 ;; HashKey_i
824 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
825 vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
826 vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
827 vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
828 vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
829 vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
830 vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
831 vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
832 vmovdqa %%CTR, %%XMM8
833
834 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
835 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
836 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
837 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
838 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
839 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
840 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
841 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
842
843 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
844 vpxor %%XMM1, %%XMM1, %%T_key
845 vpxor %%XMM2, %%XMM2, %%T_key
846 vpxor %%XMM3, %%XMM3, %%T_key
847 vpxor %%XMM4, %%XMM4, %%T_key
848 vpxor %%XMM5, %%XMM5, %%T_key
849 vpxor %%XMM6, %%XMM6, %%T_key
850 vpxor %%XMM7, %%XMM7, %%T_key
851 vpxor %%XMM8, %%XMM8, %%T_key
852
853%assign i (8-%%num_initial_blocks)
854%assign j (9-%%num_initial_blocks)
855%assign k (%%num_initial_blocks)
856
857%define %%T4_2 %%T4
858%if(%%num_initial_blocks>0)
859 ;; Hash in AES state
860 ;; T2 - incoming AAD hash
861 vpxor %%T2, %%T3
862
863 ;; GDATA, HASHKEY, CIPHER,
864 ;; STATE_11, STATE_00, STATE_MID, T1, T2
865 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
866 %%T1, %%T4, %%T6, %%T5, %%T3, first
867%endif
868
869 vmovdqu %%T_key, [%%GDATA_KEY+16*1]
870 vaesenc %%XMM1, %%T_key
871 vaesenc %%XMM2, %%T_key
872 vaesenc %%XMM3, %%T_key
873 vaesenc %%XMM4, %%T_key
874 vaesenc %%XMM5, %%T_key
875 vaesenc %%XMM6, %%T_key
876 vaesenc %%XMM7, %%T_key
877 vaesenc %%XMM8, %%T_key
878
879 vmovdqu %%T_key, [%%GDATA_KEY+16*2]
880 vaesenc %%XMM1, %%T_key
881 vaesenc %%XMM2, %%T_key
882 vaesenc %%XMM3, %%T_key
883 vaesenc %%XMM4, %%T_key
884 vaesenc %%XMM5, %%T_key
885 vaesenc %%XMM6, %%T_key
886 vaesenc %%XMM7, %%T_key
887 vaesenc %%XMM8, %%T_key
888
889%assign i (i+1)
890%assign j (j+1)
891%assign k (k-1)
892%if(%%num_initial_blocks>1)
893 ;; GDATA, HASHKEY, CIPHER,
894 ;; STATE_11, STATE_00, STATE_MID, T1, T2
895 vmovdqu %%T2, [rsp + TMP %+ j]
896 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
897 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
898%endif
899
900 vmovdqu %%T_key, [%%GDATA_KEY+16*3]
901 vaesenc %%XMM1, %%T_key
902 vaesenc %%XMM2, %%T_key
903 vaesenc %%XMM3, %%T_key
904 vaesenc %%XMM4, %%T_key
905 vaesenc %%XMM5, %%T_key
906 vaesenc %%XMM6, %%T_key
907 vaesenc %%XMM7, %%T_key
908 vaesenc %%XMM8, %%T_key
909
910 vmovdqu %%T_key, [%%GDATA_KEY+16*4]
911 vaesenc %%XMM1, %%T_key
912 vaesenc %%XMM2, %%T_key
913 vaesenc %%XMM3, %%T_key
914 vaesenc %%XMM4, %%T_key
915 vaesenc %%XMM5, %%T_key
916 vaesenc %%XMM6, %%T_key
917 vaesenc %%XMM7, %%T_key
918 vaesenc %%XMM8, %%T_key
919
920%assign i (i+1)
921%assign j (j+1)
922%assign k (k-1)
923%if(%%num_initial_blocks>2)
924 ;; GDATA, HASHKEY, CIPHER,
925 ;; STATE_11, STATE_00, STATE_MID, T1, T2
926 vmovdqu %%T2, [rsp + TMP %+ j]
927 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
928 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
929%endif
930
931%assign i (i+1)
932%assign j (j+1)
933%assign k (k-1)
934%if(%%num_initial_blocks>3)
935 ;; GDATA, HASHKEY, CIPHER,
936 ;; STATE_11, STATE_00, STATE_MID, T1, T2
937 vmovdqu %%T2, [rsp + TMP %+ j]
938 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
939 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
940%endif
941
942 vmovdqu %%T_key, [%%GDATA_KEY+16*5]
943 vaesenc %%XMM1, %%T_key
944 vaesenc %%XMM2, %%T_key
945 vaesenc %%XMM3, %%T_key
946 vaesenc %%XMM4, %%T_key
947 vaesenc %%XMM5, %%T_key
948 vaesenc %%XMM6, %%T_key
949 vaesenc %%XMM7, %%T_key
950 vaesenc %%XMM8, %%T_key
951
952 vmovdqu %%T_key, [%%GDATA_KEY+16*6]
953 vaesenc %%XMM1, %%T_key
954 vaesenc %%XMM2, %%T_key
955 vaesenc %%XMM3, %%T_key
956 vaesenc %%XMM4, %%T_key
957 vaesenc %%XMM5, %%T_key
958 vaesenc %%XMM6, %%T_key
959 vaesenc %%XMM7, %%T_key
960 vaesenc %%XMM8, %%T_key
961
962%assign i (i+1)
963%assign j (j+1)
964%assign k (k-1)
965%if(%%num_initial_blocks>4)
966 ;; GDATA, HASHKEY, CIPHER,
967 ;; STATE_11, STATE_00, STATE_MID, T1, T2
968 vmovdqu %%T2, [rsp + TMP %+ j]
969 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
970 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
971%endif
972
973 vmovdqu %%T_key, [%%GDATA_KEY+16*7]
974 vaesenc %%XMM1, %%T_key
975 vaesenc %%XMM2, %%T_key
976 vaesenc %%XMM3, %%T_key
977 vaesenc %%XMM4, %%T_key
978 vaesenc %%XMM5, %%T_key
979 vaesenc %%XMM6, %%T_key
980 vaesenc %%XMM7, %%T_key
981 vaesenc %%XMM8, %%T_key
982
983 vmovdqu %%T_key, [%%GDATA_KEY+16*8]
984 vaesenc %%XMM1, %%T_key
985 vaesenc %%XMM2, %%T_key
986 vaesenc %%XMM3, %%T_key
987 vaesenc %%XMM4, %%T_key
988 vaesenc %%XMM5, %%T_key
989 vaesenc %%XMM6, %%T_key
990 vaesenc %%XMM7, %%T_key
991 vaesenc %%XMM8, %%T_key
992
993%assign i (i+1)
994%assign j (j+1)
995%assign k (k-1)
996%if(%%num_initial_blocks>5)
997 ;; GDATA, HASHKEY, CIPHER,
998 ;; STATE_11, STATE_00, STATE_MID, T1, T2
999 vmovdqu %%T2, [rsp + TMP %+ j]
1000 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1001 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
1002%endif
1003
1004 vmovdqu %%T_key, [%%GDATA_KEY+16*9]
1005 vaesenc %%XMM1, %%T_key
1006 vaesenc %%XMM2, %%T_key
1007 vaesenc %%XMM3, %%T_key
1008 vaesenc %%XMM4, %%T_key
1009 vaesenc %%XMM5, %%T_key
1010 vaesenc %%XMM6, %%T_key
1011 vaesenc %%XMM7, %%T_key
1012 vaesenc %%XMM8, %%T_key
1013
1014%ifndef GCM128_MODE
1015 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
1016 vaesenc %%XMM1, %%T_key
1017 vaesenc %%XMM2, %%T_key
1018 vaesenc %%XMM3, %%T_key
1019 vaesenc %%XMM4, %%T_key
1020 vaesenc %%XMM5, %%T_key
1021 vaesenc %%XMM6, %%T_key
1022 vaesenc %%XMM7, %%T_key
1023 vaesenc %%XMM8, %%T_key
1024%endif
1025
1026%assign i (i+1)
1027%assign j (j+1)
1028%assign k (k-1)
1029%if(%%num_initial_blocks>6)
1030 ;; GDATA, HASHKEY, CIPHER,
1031 ;; STATE_11, STATE_00, STATE_MID, T1, T2
1032 vmovdqu %%T2, [rsp + TMP %+ j]
1033 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1034 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
1035%endif
1036
1037%ifdef GCM128_MODE
1038 vmovdqu %%T_key, [%%GDATA_KEY+16*10]
1039 vaesenclast %%XMM1, %%T_key
1040 vaesenclast %%XMM2, %%T_key
1041 vaesenclast %%XMM3, %%T_key
1042 vaesenclast %%XMM4, %%T_key
1043 vaesenclast %%XMM5, %%T_key
1044 vaesenclast %%XMM6, %%T_key
1045 vaesenclast %%XMM7, %%T_key
1046 vaesenclast %%XMM8, %%T_key
1047%endif
1048
1049%ifdef GCM192_MODE
1050 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
1051 vaesenc %%XMM1, %%T_key
1052 vaesenc %%XMM2, %%T_key
1053 vaesenc %%XMM3, %%T_key
1054 vaesenc %%XMM4, %%T_key
1055 vaesenc %%XMM5, %%T_key
1056 vaesenc %%XMM6, %%T_key
1057 vaesenc %%XMM7, %%T_key
1058 vaesenc %%XMM8, %%T_key
1059
1060 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
1061 vaesenclast %%XMM1, %%T_key
1062 vaesenclast %%XMM2, %%T_key
1063 vaesenclast %%XMM3, %%T_key
1064 vaesenclast %%XMM4, %%T_key
1065 vaesenclast %%XMM5, %%T_key
1066 vaesenclast %%XMM6, %%T_key
1067 vaesenclast %%XMM7, %%T_key
1068 vaesenclast %%XMM8, %%T_key
1069%endif
1070%ifdef GCM256_MODE
1071 vmovdqu %%T_key, [%%GDATA_KEY+16*11]
1072 vaesenc %%XMM1, %%T_key
1073 vaesenc %%XMM2, %%T_key
1074 vaesenc %%XMM3, %%T_key
1075 vaesenc %%XMM4, %%T_key
1076 vaesenc %%XMM5, %%T_key
1077 vaesenc %%XMM6, %%T_key
1078 vaesenc %%XMM7, %%T_key
1079 vaesenc %%XMM8, %%T_key
1080
1081 vmovdqu %%T_key, [%%GDATA_KEY+16*12]
1082 vaesenc %%XMM1, %%T_key
1083 vaesenc %%XMM2, %%T_key
1084 vaesenc %%XMM3, %%T_key
1085 vaesenc %%XMM4, %%T_key
1086 vaesenc %%XMM5, %%T_key
1087 vaesenc %%XMM6, %%T_key
1088 vaesenc %%XMM7, %%T_key
1089 vaesenc %%XMM8, %%T_key
1090%endif
1091
1092%assign i (i+1)
1093%assign j (j+1)
1094%assign k (k-1)
1095%if(%%num_initial_blocks>7)
1096 ;; GDATA, HASHKEY, CIPHER,
1097 ;; STATE_11, STATE_00, STATE_MID, T1, T2
1098 vmovdqu %%T2, [rsp + TMP %+ j]
1099 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1100 %%T1, %%T4, %%T6, %%T5, %%T3, not_first
1101%endif
1102
1103%ifdef GCM256_MODE ; GCM256
1104 vmovdqu %%T_key, [%%GDATA_KEY+16*13]
1105 vaesenc %%XMM1, %%T_key
1106 vaesenc %%XMM2, %%T_key
1107 vaesenc %%XMM3, %%T_key
1108 vaesenc %%XMM4, %%T_key
1109 vaesenc %%XMM5, %%T_key
1110 vaesenc %%XMM6, %%T_key
1111 vaesenc %%XMM7, %%T_key
1112 vaesenc %%XMM8, %%T_key
1113
1114 vmovdqu %%T_key, [%%GDATA_KEY+16*14]
1115 vaesenclast %%XMM1, %%T_key
1116 vaesenclast %%XMM2, %%T_key
1117 vaesenclast %%XMM3, %%T_key
1118 vaesenclast %%XMM4, %%T_key
1119 vaesenclast %%XMM5, %%T_key
1120 vaesenclast %%XMM6, %%T_key
1121 vaesenclast %%XMM7, %%T_key
1122 vaesenclast %%XMM8, %%T_key
1123%endif ; GCM256 mode
1124
1125%if(%%num_initial_blocks>0)
1126 vpsrldq %%T3, %%T6, 8 ; shift-R %%T6 2 DWs
1127 vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs
1128 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1129 vpxor %%T4, %%T6, %%T4
1130
1131 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1132 ; First phase of the reduction
1133 vmovdqa %%T3, [rel POLY2]
1134
1135 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1136 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
1137
1138 ;; First phase of the reduction complete
1139 vpxor %%T4, %%T4, %%T2
1140
1141 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1142 ; Second phase of the reduction
1143 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1144 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1145 vpsrldq %%T2, %%T2, 4
1146
1147 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1148 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1149 vpslldq %%T4, %%T4, 4
1150 ;; Second phase of the reduction complete
1151 vpxor %%T4, %%T4, %%T2
1152 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1153 ; The result is in %%T3
1154 vpxor %%T3, %%T1, %%T4
1155%else
1156 ;; The hash should end up in T3
1157 vmovdqa %%T3, %%T2
1158%endif
1159
1160 ;; Final hash is now in T3
1161%if %%num_initial_blocks > 0
1162 ;; NOTE: obsolete in case %%num_initial_blocks = 0
1163 sub %%LENGTH, 16*%%num_initial_blocks
1164%endif
1165
1166 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
1167 vpxor %%XMM1, %%XMM1, %%T1
1168 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
1169 %ifidn %%ENC_DEC, DEC
1170 vmovdqa %%XMM1, %%T1
1171 %endif
1172
1173 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
1174 vpxor %%XMM2, %%XMM2, %%T1
1175 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
1176 %ifidn %%ENC_DEC, DEC
1177 vmovdqa %%XMM2, %%T1
1178 %endif
1179
1180 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
1181 vpxor %%XMM3, %%XMM3, %%T1
1182 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
1183 %ifidn %%ENC_DEC, DEC
1184 vmovdqa %%XMM3, %%T1
1185 %endif
1186
1187 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
1188 vpxor %%XMM4, %%XMM4, %%T1
1189 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
1190 %ifidn %%ENC_DEC, DEC
1191 vmovdqa %%XMM4, %%T1
1192 %endif
1193
1194 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
1195 vpxor %%XMM5, %%XMM5, %%T1
1196 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
1197 %ifidn %%ENC_DEC, DEC
1198 vmovdqa %%XMM5, %%T1
1199 %endif
1200
1201 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
1202 vpxor %%XMM6, %%XMM6, %%T1
1203 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
1204 %ifidn %%ENC_DEC, DEC
1205 vmovdqa %%XMM6, %%T1
1206 %endif
1207
1208 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
1209 vpxor %%XMM7, %%XMM7, %%T1
1210 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
1211 %ifidn %%ENC_DEC, DEC
1212 vmovdqa %%XMM7, %%T1
1213 %endif
1214
1215%if %%num_initial_blocks > 0
1216 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
1217 ;; This macro is executed for length 128 and up,
1218 ;; zero length is checked in GCM_ENC_DEC.
1219 ;; If the last block is partial then the xor will be done later
1220 ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
1221 ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
1222 cmp %%LENGTH, 128
1223 jl %%_initial_skip_last_word_write
1224%endif
1225 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
1226 vpxor %%XMM8, %%XMM8, %%T1
1227 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
1228 %ifidn %%ENC_DEC, DEC
1229 vmovdqa %%XMM8, %%T1
1230 %endif
1231
1232 ;; Update %%LENGTH with the number of blocks processed
1233 sub %%LENGTH, 16
1234 add %%DATA_OFFSET, 16
1235%%_initial_skip_last_word_write:
1236 sub %%LENGTH, 128-16
1237 add %%DATA_OFFSET, 128-16
1238
1239 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
1240 ;; Combine GHASHed value with the corresponding ciphertext
1241 vpxor %%XMM1, %%XMM1, %%T3
1242 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
1243 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
1244 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
1245 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
1246 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
1247 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
1248 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
1249
1250;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1251
1252%%_initial_blocks_done:
1253
1254
1255%endmacro
1256
1257;;; INITIAL_BLOCKS macro with support for a partial final block.
1258;;; num_initial_blocks is expected to include the partial final block
1259;;; in the count.
1260%macro INITIAL_BLOCKS_PARTIAL 25
1261%define %%GDATA_KEY %1
1262%define %%GDATA_CTX %2
1263%define %%CYPH_PLAIN_OUT %3
1264%define %%PLAIN_CYPH_IN %4
1265%define %%LENGTH %5
1266%define %%DATA_OFFSET %6
1267%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
1268%define %%T1 %8
1269%define %%T2 %9
1270%define %%T3 %10
1271%define %%T4 %11
1272%define %%T5 %12
1273%define %%CTR %13
1274%define %%XMM1 %14
1275%define %%XMM2 %15
1276%define %%XMM3 %16
1277%define %%XMM4 %17
1278%define %%XMM5 %18
1279%define %%XMM6 %19
1280%define %%XMM7 %20
1281%define %%XMM8 %21
1282%define %%T6 %22
1283%define %%T_key %23
1284%define %%ENC_DEC %24
1285%define %%INSTANCE_TYPE %25
1286
1287%assign i (8-%%num_initial_blocks)
1288 ;; Move AAD_HASH to temp reg
1289 vmovdqu %%T2, %%XMM8
1290 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
1291
1292%assign i (9-%%num_initial_blocks)
1293%rep %%num_initial_blocks
1294 ;; Compute AES counters
1295 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
1296 vmovdqa reg(i), %%CTR
1297 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
1298%assign i (i+1)
1299%endrep
1300
1301vmovdqu %%T_key, [%%GDATA_KEY+16*0]
1302%assign i (9-%%num_initial_blocks)
1303%rep %%num_initial_blocks
1304 ; Start AES for %%num_initial_blocks blocks
1305 vpxor reg(i),reg(i),%%T_key
1306%assign i (i+1)
1307%endrep
1308
1309%assign j 1
1310%rep NROUNDS
1311vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1312%assign i (9-%%num_initial_blocks)
1313%rep %%num_initial_blocks
1314 vaesenc reg(i),%%T_key
1315%assign i (i+1)
1316%endrep
1317
1318%assign j (j+1)
1319%endrep
1320
1321
1322vmovdqu %%T_key, [%%GDATA_KEY+16*j]
1323%assign i (9-%%num_initial_blocks)
1324%rep %%num_initial_blocks
1325 vaesenclast reg(i),%%T_key
1326%assign i (i+1)
1327%endrep
1328
1329;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1330;;; Hash all but the last block of data
1331;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1332
1333%assign i (9-%%num_initial_blocks)
1334%rep %%num_initial_blocks-1
1335 ;; Encrypt the message for all but the last block
1336 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1337 vpxor reg(i), reg(i), %%T1
1338 ;; write back ciphertext for %%num_initial_blocks blocks
1339 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1340 add %%DATA_OFFSET, 16
1341 %ifidn %%ENC_DEC, DEC
1342 vmovdqa reg(i), %%T1
1343 %endif
1344 ;; Prepare ciphertext for GHASH computations
1345 vpshufb reg(i), [rel SHUF_MASK]
1346%assign i (i+1)
1347%endrep
1348
1349 ;; The final block of data may be <16B
1350 sub %%LENGTH, 16*(%%num_initial_blocks-1)
1351
1352%if %%num_initial_blocks < 8
1353 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
1354 ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
1355 cmp %%LENGTH, 16
1356 jl %%_small_initial_partial_block
1357
1358;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1359;;; Handle a full length final block - encrypt and hash all blocks
1360;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1361
1362 sub %%LENGTH, 16
1363 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1364
1365 ;; Encrypt the message
1366 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1367 vpxor reg(i), reg(i), %%T1
1368 ;; write back ciphertext for %%num_initial_blocks blocks
1369 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1370 add %%DATA_OFFSET, 16
1371 %ifidn %%ENC_DEC, DEC
1372 vmovdqa reg(i), %%T1
1373 %endif
1374 ;; Prepare ciphertext for GHASH computations
1375 vpshufb reg(i), [rel SHUF_MASK]
1376
1377 ;; Hash all of the data
1378%assign i (8-%%num_initial_blocks)
1379%assign j (9-%%num_initial_blocks)
1380%assign k (%%num_initial_blocks)
1381%assign last_block_to_hash 0
1382
1383%if(%%num_initial_blocks>last_block_to_hash)
1384 ;; Hash in AES state
1385 vpxor %%T2, reg(j)
1386
1387 ;; T2 - incoming AAD hash
1388 ;; reg(i) holds ciphertext
1389 ;; T5 - hash key
1390 ;; T6 - updated xor
1391 ;; reg(1)/xmm1 should now be available for tmp use
1392 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1393 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1394 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1395 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1396 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1397 vpxor %%T6, %%T6, %%T5
1398%endif
1399
1400%assign i (i+1)
1401%assign j (j+1)
1402%assign k (k-1)
1403%assign rep_count (%%num_initial_blocks-1)
1404%rep rep_count
1405
1406 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1407 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1408 vpxor %%T1, %%T1, %%T3
1409
1410 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1411 vpxor %%T4, %%T4, %%T3
1412
1413 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1414 vpxor %%T6, %%T6, %%T3
1415
1416 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1417 vpxor %%T6, %%T6, %%T3
1418
1419%assign i (i+1)
1420%assign j (j+1)
1421%assign k (k-1)
1422%endrep
1423
1424 ;; Record that a reduction is needed
1425 mov r12, 1
1426
1427 jmp %%_small_initial_compute_hash
1428
1429
1430%endif ; %if %%num_initial_blocks < 8
1431
1432%%_small_initial_partial_block:
1433
1434;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1435;;; Handle ghash for a <16B final block
1436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1437
1438 ;; In this case, if it's a single call to encrypt, we can
1439 ;; hash all of the data, but if it's an init / update / finalize
1440 ;; series of calls, we need to leave the last block if it's
1441 ;; less than a full block of data.
1442
1443 mov [%%GDATA_CTX + PBlockLen], %%LENGTH
1444 vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
1445 ;; Handle a partial final block
1446 ;; GDATA, KEY, T1, T2
1447 ;; r13 - length
1448 ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
1449 ;; NOTE: could be replaced with %%LENGTH but at this point
1450 ;; %%LENGTH is always less than 16.
1451 ;; No PLAIN_CYPH_LEN argument available in this macro.
1452 ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
1453 vpshufb reg(i), [rel SHUF_MASK]
1454
1455%ifidn %%INSTANCE_TYPE, multi_call
1456%assign i (8-%%num_initial_blocks)
1457%assign j (9-%%num_initial_blocks)
1458%assign k (%%num_initial_blocks-1)
1459%assign last_block_to_hash 1
1460%else
1461%assign i (8-%%num_initial_blocks)
1462%assign j (9-%%num_initial_blocks)
1463%assign k (%%num_initial_blocks)
1464%assign last_block_to_hash 0
1465%endif
1466
1467%if(%%num_initial_blocks>last_block_to_hash)
1468 ;; Record that a reduction is needed
1469 mov r12, 1
1470 ;; Hash in AES state
1471 vpxor %%T2, reg(j)
1472
1473 ;; T2 - incoming AAD hash
1474 ;; reg(i) holds ciphertext
1475 ;; T5 - hash key
1476 ;; T6 - updated xor
1477 ;; reg(1)/xmm1 should now be available for tmp use
1478 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1479 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1480 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
1481 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1482 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1483 vpxor %%T6, %%T6, %%T5
1484%else
1485 ;; Record that a reduction is not needed -
1486 ;; In this case no hashes are computed because there
1487 ;; is only one initial block and it is < 16B in length.
1488 mov r12, 0
1489%endif
1490
1491%assign i (i+1)
1492%assign j (j+1)
1493%assign k (k-1)
1494%ifidn %%INSTANCE_TYPE, multi_call
1495%assign rep_count (%%num_initial_blocks-2)
1496%%_multi_call_hash:
1497%else
1498%assign rep_count (%%num_initial_blocks-1)
1499%endif
1500
1501%if rep_count < 0
1502 ;; quick fix for negative rep_count (to be investigated)
1503%assign rep_count 0
1504%endif
1505
1506%rep rep_count
1507
1508 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1509 vpclmulqdq %%T3, reg(j), %%T5, 0x11
1510 vpxor %%T1, %%T1, %%T3
1511
1512 vpclmulqdq %%T3, reg(j), %%T5, 0x00
1513 vpxor %%T4, %%T4, %%T3
1514
1515 vpclmulqdq %%T3, reg(j), %%T5, 0x01
1516 vpxor %%T6, %%T6, %%T3
1517
1518 vpclmulqdq %%T3, reg(j), %%T5, 0x10
1519 vpxor %%T6, %%T6, %%T3
1520
1521%assign i (i+1)
1522%assign j (j+1)
1523%assign k (k-1)
1524%endrep
1525
1526%%_small_initial_compute_hash:
1527
1528;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1529;;; Ghash reduction
1530;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1531
1532%if(%%num_initial_blocks=1)
1533%ifidn %%INSTANCE_TYPE, multi_call
1534 ;; We only need to check if a reduction is needed if
1535 ;; initial_blocks == 1 and init/update/final is being used.
1536 ;; In this case we may just have a partial block, and that
1537 ;; gets hashed in finalize.
1538 cmp r12, 0
1539 je %%_no_reduction_needed
1540%endif
1541%endif
1542
1543 vpsrldq %%T3, %%T6, 8 ; shift-R %%T6 2 DWs
1544 vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs
1545 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
1546 vpxor %%T4, %%T6, %%T4
1547
1548 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1549 ;; First phase of the reduction
1550 vmovdqa %%T3, [rel POLY2]
1551
1552 vpclmulqdq %%T2, %%T3, %%T4, 0x01
1553 ;; shift-L xmm2 2 DWs
1554 vpslldq %%T2, %%T2, 8
1555 vpxor %%T4, %%T4, %%T2
1556
1557 ;; First phase of the reduction complete
1558 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1559 ;; Second phase of the reduction
1560
1561 vpclmulqdq %%T2, %%T3, %%T4, 0x00
1562 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1563 vpsrldq %%T2, %%T2, 4
1564
1565 vpclmulqdq %%T4, %%T3, %%T4, 0x10
1566 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1567 vpslldq %%T4, %%T4, 4
1568
1569 vpxor %%T4, %%T4, %%T2
1570 ;; Second phase of the reduction complete
1571 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1572 vpxor %%T3, %%T1, %%T4
1573
1574%ifidn %%INSTANCE_TYPE, multi_call
1575 ;; If using init/update/finalize, we need to xor any partial block data
1576 ;; into the hash.
1577%if %%num_initial_blocks > 1
1578 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
1579%if %%num_initial_blocks != 8
1580 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
1581 cmp qword [%%GDATA_CTX + PBlockLen], 0
1582 je %%_no_partial_block_xor
1583%endif ; %%num_initial_blocks != 8
1584 vpxor %%T3, %%T3, reg(8)
1585%%_no_partial_block_xor:
1586%endif ; %%num_initial_blocks > 1
1587%endif ; %%INSTANCE_TYPE, multi_call
1588
1589%if(%%num_initial_blocks=1)
1590%ifidn %%INSTANCE_TYPE, multi_call
1591 ;; NOTE: %%_no_reduction_needed case only valid for
1592 ;; multi_call with initial_blocks = 1.
1593 ;; Look for comment above around '_no_reduction_needed'
1594 ;; The jmp below is obsolete as the code will fall through.
1595
1596 ;; The result is in %%T3
1597 jmp %%_after_reduction
1598
1599%%_no_reduction_needed:
1600 ;; The hash should end up in T3. The only way we should get here is if
1601 ;; there is a partial block of data, so xor that into the hash.
1602 vpxor %%T3, %%T2, reg(8)
1603%endif ; %%INSTANCE_TYPE = multi_call
1604%endif ; %%num_initial_blocks=1
1605
1606%%_after_reduction:
1607 ;; Final hash is now in T3
1608
1609%endmacro ; INITIAL_BLOCKS_PARTIAL
1610
1611
1612
1613; encrypt 8 blocks at a time
1614; ghash the 8 previously encrypted ciphertext blocks
1615; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
1616; %%DATA_OFFSET is the data offset value
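; The AES rounds for the current 8 counter blocks are interleaved with the
; GHASH (vpclmulqdq) work on the previous 8 ciphertext blocks so the two
; instruction streams hide each other's latency (the aggregated-reduction
; scheme from the papers cited at the top of this file).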
1617%macro GHASH_8_ENCRYPT_8_PARALLEL 23
1618%define %%GDATA %1
1619%define %%CYPH_PLAIN_OUT %2
1620%define %%PLAIN_CYPH_IN %3
1621%define %%DATA_OFFSET %4
1622%define %%T1 %5
1623%define %%T2 %6
1624%define %%T3 %7
1625%define %%T4 %8
1626%define %%T5 %9
1627%define %%T6 %10
1628%define %%CTR %11
1629%define %%XMM1 %12
1630%define %%XMM2 %13
1631%define %%XMM3 %14
1632%define %%XMM4 %15
1633%define %%XMM5 %16
1634%define %%XMM6 %17
1635%define %%XMM7 %18
1636%define %%XMM8 %19
1637%define %%T7 %20
1638%define %%loop_idx %21
1639%define %%ENC_DEC %22
1640%define %%FULL_PARTIAL %23
1641
1642 vmovdqa %%T2, %%XMM1
1643 vmovdqu [rsp + TMP2], %%XMM2
1644 vmovdqu [rsp + TMP3], %%XMM3
1645 vmovdqu [rsp + TMP4], %%XMM4
1646 vmovdqu [rsp + TMP5], %%XMM5
1647 vmovdqu [rsp + TMP6], %%XMM6
1648 vmovdqu [rsp + TMP7], %%XMM7
1649 vmovdqu [rsp + TMP8], %%XMM8
1650
1651%ifidn %%loop_idx, in_order
1652 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
1653 vmovdqa %%T5, [rel TWO]
1654 vpaddd %%XMM2, %%CTR, %%T5
1655 vpaddd %%XMM3, %%XMM1, %%T5
1656 vpaddd %%XMM4, %%XMM2, %%T5
1657 vpaddd %%XMM5, %%XMM3, %%T5
1658 vpaddd %%XMM6, %%XMM4, %%T5
1659 vpaddd %%XMM7, %%XMM5, %%T5
1660 vpaddd %%XMM8, %%XMM6, %%T5
1661 vmovdqa %%CTR, %%XMM8
1662
1663 vmovdqa %%T5, [rel SHUF_MASK]
1664 vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
1665 vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
1666 vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
1667 vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
1668 vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
1669 vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
1670 vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
1671 vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
1672%else
1673 vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
1674 vmovdqa %%T5, [rel TWOf]
1675 vpaddd %%XMM2, %%CTR, %%T5
1676 vpaddd %%XMM3, %%XMM1, %%T5
1677 vpaddd %%XMM4, %%XMM2, %%T5
1678 vpaddd %%XMM5, %%XMM3, %%T5
1679 vpaddd %%XMM6, %%XMM4, %%T5
1680 vpaddd %%XMM7, %%XMM5, %%T5
1681 vpaddd %%XMM8, %%XMM6, %%T5
1682 vmovdqa %%CTR, %%XMM8
1683%endif
1684
1685
1686
1687 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1688
1689 vmovdqu %%T1, [%%GDATA + 16*0]
1690 vpxor %%XMM1, %%XMM1, %%T1
1691 vpxor %%XMM2, %%XMM2, %%T1
1692 vpxor %%XMM3, %%XMM3, %%T1
1693 vpxor %%XMM4, %%XMM4, %%T1
1694 vpxor %%XMM5, %%XMM5, %%T1
1695 vpxor %%XMM6, %%XMM6, %%T1
1696 vpxor %%XMM7, %%XMM7, %%T1
1697 vpxor %%XMM8, %%XMM8, %%T1
1698
1699 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1700
1701 vmovdqu %%T1, [%%GDATA + 16*1]
1702 vaesenc %%XMM1, %%T1
1703 vaesenc %%XMM2, %%T1
1704 vaesenc %%XMM3, %%T1
1705 vaesenc %%XMM4, %%T1
1706 vaesenc %%XMM5, %%T1
1707 vaesenc %%XMM6, %%T1
1708 vaesenc %%XMM7, %%T1
1709 vaesenc %%XMM8, %%T1
1710
1711
1712 vmovdqu %%T1, [%%GDATA + 16*2]
1713 vaesenc %%XMM1, %%T1
1714 vaesenc %%XMM2, %%T1
1715 vaesenc %%XMM3, %%T1
1716 vaesenc %%XMM4, %%T1
1717 vaesenc %%XMM5, %%T1
1718 vaesenc %%XMM6, %%T1
1719 vaesenc %%XMM7, %%T1
1720 vaesenc %%XMM8, %%T1
1721
1722 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1723
1724 vmovdqu %%T5, [%%GDATA + HashKey_8]
1725 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1726 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
1727 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
1728 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
1729 vpxor %%T6, %%T6, %%T5
1730
1731 vmovdqu %%T1, [%%GDATA + 16*3]
1732 vaesenc %%XMM1, %%T1
1733 vaesenc %%XMM2, %%T1
1734 vaesenc %%XMM3, %%T1
1735 vaesenc %%XMM4, %%T1
1736 vaesenc %%XMM5, %%T1
1737 vaesenc %%XMM6, %%T1
1738 vaesenc %%XMM7, %%T1
1739 vaesenc %%XMM8, %%T1
1740
1741 vmovdqu %%T1, [rsp + TMP2]
1742 vmovdqu %%T5, [%%GDATA + HashKey_7]
1743 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1744 vpxor %%T4, %%T4, %%T3
1745
1746 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1747 vpxor %%T7, %%T7, %%T3
1748
1749 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1750 vpxor %%T6, %%T6, %%T3
1751
1752 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1753 vpxor %%T6, %%T6, %%T3
1754
1755 vmovdqu %%T1, [%%GDATA + 16*4]
1756 vaesenc %%XMM1, %%T1
1757 vaesenc %%XMM2, %%T1
1758 vaesenc %%XMM3, %%T1
1759 vaesenc %%XMM4, %%T1
1760 vaesenc %%XMM5, %%T1
1761 vaesenc %%XMM6, %%T1
1762 vaesenc %%XMM7, %%T1
1763 vaesenc %%XMM8, %%T1
1764
1765 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1766 vmovdqu %%T1, [rsp + TMP3]
1767 vmovdqu %%T5, [%%GDATA + HashKey_6]
1768 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1769 vpxor %%T4, %%T4, %%T3
1770
1771 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1772 vpxor %%T7, %%T7, %%T3
1773
1774 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1775 vpxor %%T6, %%T6, %%T3
1776
1777 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1778 vpxor %%T6, %%T6, %%T3
1779
1780 vmovdqu %%T1, [%%GDATA + 16*5]
1781 vaesenc %%XMM1, %%T1
1782 vaesenc %%XMM2, %%T1
1783 vaesenc %%XMM3, %%T1
1784 vaesenc %%XMM4, %%T1
1785 vaesenc %%XMM5, %%T1
1786 vaesenc %%XMM6, %%T1
1787 vaesenc %%XMM7, %%T1
1788 vaesenc %%XMM8, %%T1
1789
1790
1791 vmovdqu %%T1, [rsp + TMP4]
1792 vmovdqu %%T5, [%%GDATA + HashKey_5]
1793 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1794 vpxor %%T4, %%T4, %%T3
1795
1796 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1797 vpxor %%T7, %%T7, %%T3
1798
1799 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1800 vpxor %%T6, %%T6, %%T3
1801
1802 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1803 vpxor %%T6, %%T6, %%T3
1804
1805 vmovdqu %%T1, [%%GDATA + 16*6]
1806 vaesenc %%XMM1, %%T1
1807 vaesenc %%XMM2, %%T1
1808 vaesenc %%XMM3, %%T1
1809 vaesenc %%XMM4, %%T1
1810 vaesenc %%XMM5, %%T1
1811 vaesenc %%XMM6, %%T1
1812 vaesenc %%XMM7, %%T1
1813 vaesenc %%XMM8, %%T1
1814
1815 vmovdqu %%T1, [rsp + TMP5]
1816 vmovdqu %%T5, [%%GDATA + HashKey_4]
1817 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1818 vpxor %%T4, %%T4, %%T3
1819
1820 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1821 vpxor %%T7, %%T7, %%T3
1822
1823 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1824 vpxor %%T6, %%T6, %%T3
1825
1826 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1827 vpxor %%T6, %%T6, %%T3
1828
1829 vmovdqu %%T1, [%%GDATA + 16*7]
1830 vaesenc %%XMM1, %%T1
1831 vaesenc %%XMM2, %%T1
1832 vaesenc %%XMM3, %%T1
1833 vaesenc %%XMM4, %%T1
1834 vaesenc %%XMM5, %%T1
1835 vaesenc %%XMM6, %%T1
1836 vaesenc %%XMM7, %%T1
1837 vaesenc %%XMM8, %%T1
1838
1839 vmovdqu %%T1, [rsp + TMP6]
1840 vmovdqu %%T5, [%%GDATA + HashKey_3]
1841 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1842 vpxor %%T4, %%T4, %%T3
1843
1844 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1845 vpxor %%T7, %%T7, %%T3
1846
1847 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1848 vpxor %%T6, %%T6, %%T3
1849
1850 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1851 vpxor %%T6, %%T6, %%T3
1852
1853 vmovdqu %%T1, [%%GDATA + 16*8]
1854 vaesenc %%XMM1, %%T1
1855 vaesenc %%XMM2, %%T1
1856 vaesenc %%XMM3, %%T1
1857 vaesenc %%XMM4, %%T1
1858 vaesenc %%XMM5, %%T1
1859 vaesenc %%XMM6, %%T1
1860 vaesenc %%XMM7, %%T1
1861 vaesenc %%XMM8, %%T1
1862
1863 vmovdqu %%T1, [rsp + TMP7]
1864 vmovdqu %%T5, [%%GDATA + HashKey_2]
1865 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1866 vpxor %%T4, %%T4, %%T3
1867
1868 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1869 vpxor %%T7, %%T7, %%T3
1870
1871 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1872 vpxor %%T6, %%T6, %%T3
1873
1874 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1875 vpxor %%T6, %%T6, %%T3
1876
1877 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1878
1879 vmovdqu %%T5, [%%GDATA + 16*9]
1880 vaesenc %%XMM1, %%T5
1881 vaesenc %%XMM2, %%T5
1882 vaesenc %%XMM3, %%T5
1883 vaesenc %%XMM4, %%T5
1884 vaesenc %%XMM5, %%T5
1885 vaesenc %%XMM6, %%T5
1886 vaesenc %%XMM7, %%T5
1887 vaesenc %%XMM8, %%T5
1888
1889 vmovdqu %%T1, [rsp + TMP8]
1890 vmovdqu %%T5, [%%GDATA + HashKey]
1891
1892
1893 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1894 vpxor %%T7, %%T7, %%T3
1895
1896 vpclmulqdq %%T3, %%T1, %%T5, 0x01
1897 vpxor %%T6, %%T6, %%T3
1898
1899 vpclmulqdq %%T3, %%T1, %%T5, 0x10
1900 vpxor %%T6, %%T6, %%T3
1901
1902 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1903 vpxor %%T1, %%T4, %%T3
1904
1905
1906 vmovdqu %%T5, [%%GDATA + 16*10]
1907 %ifndef GCM128_MODE ; GCM192 or GCM256
1908 vaesenc %%XMM1, %%T5
1909 vaesenc %%XMM2, %%T5
1910 vaesenc %%XMM3, %%T5
1911 vaesenc %%XMM4, %%T5
1912 vaesenc %%XMM5, %%T5
1913 vaesenc %%XMM6, %%T5
1914 vaesenc %%XMM7, %%T5
1915 vaesenc %%XMM8, %%T5
1916
1917 vmovdqu %%T5, [%%GDATA + 16*11]
1918 vaesenc %%XMM1, %%T5
1919 vaesenc %%XMM2, %%T5
1920 vaesenc %%XMM3, %%T5
1921 vaesenc %%XMM4, %%T5
1922 vaesenc %%XMM5, %%T5
1923 vaesenc %%XMM6, %%T5
1924 vaesenc %%XMM7, %%T5
1925 vaesenc %%XMM8, %%T5
1926
1927 vmovdqu %%T5, [%%GDATA + 16*12]
1928%endif
1929%ifdef GCM256_MODE
1930 vaesenc %%XMM1, %%T5
1931 vaesenc %%XMM2, %%T5
1932 vaesenc %%XMM3, %%T5
1933 vaesenc %%XMM4, %%T5
1934 vaesenc %%XMM5, %%T5
1935 vaesenc %%XMM6, %%T5
1936 vaesenc %%XMM7, %%T5
1937 vaesenc %%XMM8, %%T5
1938
1939 vmovdqu %%T5, [%%GDATA + 16*13]
1940 vaesenc %%XMM1, %%T5
1941 vaesenc %%XMM2, %%T5
1942 vaesenc %%XMM3, %%T5
1943 vaesenc %%XMM4, %%T5
1944 vaesenc %%XMM5, %%T5
1945 vaesenc %%XMM6, %%T5
1946 vaesenc %%XMM7, %%T5
1947 vaesenc %%XMM8, %%T5
1948
1949 vmovdqu %%T5, [%%GDATA + 16*14]
1950%endif ; GCM256
1951
1952%assign i 0
1953%assign j 1
1954%rep 8
1955
1956 ;; SNP TBD: This is pretty ugly - consider whether just XORing the
1957        ;; data in after vaesenclast is simpler and equally performant. The
1958        ;; change would also have to ripple through partial block and ghash_mul_8.
1959%ifidn %%FULL_PARTIAL, full
1960 %ifdef NT_LD
1961 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1962 vpxor %%T2, %%T2, %%T5
1963 %else
1964 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1965 %endif
1966
1967 %ifidn %%ENC_DEC, ENC
1968 vaesenclast reg(j), reg(j), %%T2
1969 %else
1970 vaesenclast %%T3, reg(j), %%T2
1971 vpxor reg(j), %%T2, %%T5
1972 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1973 %endif
1974
1975%else
1976 ; Don't read the final data during partial block processing
1977 %ifdef NT_LD
1978 %if (i<7)
1979 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1980 vpxor %%T2, %%T2, %%T5
1981 %else
1982 ;; Stage the key directly in T2 rather than hash it with plaintext
1983 vmovdqu %%T2, %%T5
1984 %endif
1985 %else
1986 %if (i<7)
1987 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1988 %else
1989 ;; Stage the key directly in T2 rather than hash it with plaintext
1990 vmovdqu %%T2, %%T5
1991 %endif
1992 %endif
1993
1994 %ifidn %%ENC_DEC, ENC
1995 vaesenclast reg(j), reg(j), %%T2
1996 %else
1997 %if (i<7)
1998 vaesenclast %%T3, reg(j), %%T2
1999 vpxor reg(j), %%T2, %%T5
2000 ;; Do not read the data since it could fault
2001 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
2002 %else
2003 vaesenclast reg(j), reg(j), %%T2
2004 %endif
2005 %endif
2006%endif
2007
2008%assign i (i+1)
2009%assign j (j+1)
2010%endrep
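        ;; A sketch of the fused final-round XOR used in the loop above:
        ;; XORing the data block into the last round key (%%T2 = data ^ %%T5)
        ;; makes vaesenclast produce state ^ %%T5 ^ data, i.e. the final
        ;; AddRoundKey and the CTR keystream XOR in a single instruction.
        ;; For ENC, reg(j) is then the ciphertext directly. For DEC, the
        ;; plaintext goes out via %%T3 while reg(j) is rebuilt as the
        ;; ciphertext (%%T2 ^ %%T5), since GHASH must be fed ciphertext;
        ;; the final partial lane instead keeps the raw keystream for
        ;; ENCRYPT_FINAL_PARTIAL_BLOCK.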
2011
2012
2013;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2014
2015
2016 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
2017        vpsrldq %%T6, %%T6, 8                           ; shift-R %%T6 2 DWs
2018 vpxor %%T7, %%T7, %%T3
2019 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
2020
2021
2022
2023 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2024 ;first phase of the reduction
2025        vmovdqa %%T3, [rel POLY2]
2026
2027 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2028        vpslldq %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
2029
2030 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2031 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2032
2033 %ifidn %%ENC_DEC, ENC
2034 ; Write to the Ciphertext buffer
2035 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
2036 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
2037 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
2038 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
2039 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
2040 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
2041 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
2042 %ifidn %%FULL_PARTIAL, full
2043 ;; Avoid writing past the buffer if handling a partial block
2044 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
2045 %endif
2046 %endif
2047
2048
2049;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2050 ;second phase of the reduction
2051 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2052        vpsrldq %%T2, %%T2, 4                           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2053
2054 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2055        vpslldq %%T4, %%T4, 4                           ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2056
2057 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2058 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2059 vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
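        ;; A sketch of the reduction just performed: <%%T1:%%T7> held the
        ;; 256-bit carry-less product of the accumulated GHASH multiplies.
        ;; The two phases fold the low half (%%T7) into the high half with
        ;; two multiplications by the POLY2 constant, reducing the product
        ;; modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in its
        ;; bit-reflected representation and leaving the 128-bit result
        ;; in %%T1.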
2060
2061 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
2062 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
2063 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
2064 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
2065 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
2066 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
2067 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
2068 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
2069
2070
2071 vpxor %%XMM1, %%T1
2072
2073
2074%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
2075
2076
2077; GHASH the last 8 ciphertext blocks.
2078%macro GHASH_LAST_8 16
2079%define %%GDATA %1
2080%define %%T1 %2
2081%define %%T2 %3
2082%define %%T3 %4
2083%define %%T4 %5
2084%define %%T5 %6
2085%define %%T6 %7
2086%define %%T7 %8
2087%define %%XMM1 %9
2088%define %%XMM2 %10
2089%define %%XMM3 %11
2090%define %%XMM4 %12
2091%define %%XMM5 %13
2092%define %%XMM6 %14
2093%define %%XMM7 %15
2094%define %%XMM8 %16
2095
2096 ;; Karatsuba Method
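        ;; A sketch of the decomposition used below: with a = a1:a0 and
        ;; b = b1:b0 (64-bit halves), each 128x128 carry-less multiply
        ;; costs only three vpclmulqdq ops:
        ;;     H = a1*b1, L = a0*b0, M = (a1^a0)*(b1^b0)
        ;; where the middle term is M ^ H ^ L. %%T6/%%T7 accumulate H/L
        ;; over the 8 blocks, %%XMM1 accumulates M, and the single
        ;; H ^ L correction is applied once at the end.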
2097
2098 vmovdqu %%T5, [%%GDATA + HashKey_8]
2099
2100 vpshufd %%T2, %%XMM1, 01001110b
2101 vpshufd %%T3, %%T5, 01001110b
2102 vpxor %%T2, %%T2, %%XMM1
2103 vpxor %%T3, %%T3, %%T5
2104
2105 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2106 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2107
2108 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2109
2110 ;;;;;;;;;;;;;;;;;;;;;;
2111
2112 vmovdqu %%T5, [%%GDATA + HashKey_7]
2113 vpshufd %%T2, %%XMM2, 01001110b
2114 vpshufd %%T3, %%T5, 01001110b
2115 vpxor %%T2, %%T2, %%XMM2
2116 vpxor %%T3, %%T3, %%T5
2117
2118 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2119 vpxor %%T6, %%T6, %%T4
2120
2121 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2122 vpxor %%T7, %%T7, %%T4
2123
2124 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2125
2126 vpxor %%XMM1, %%XMM1, %%T2
2127
2128 ;;;;;;;;;;;;;;;;;;;;;;
2129
2130 vmovdqu %%T5, [%%GDATA + HashKey_6]
2131 vpshufd %%T2, %%XMM3, 01001110b
2132 vpshufd %%T3, %%T5, 01001110b
2133 vpxor %%T2, %%T2, %%XMM3
2134 vpxor %%T3, %%T3, %%T5
2135
2136 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2137 vpxor %%T6, %%T6, %%T4
2138
2139 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2140 vpxor %%T7, %%T7, %%T4
2141
2142 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2143
2144 vpxor %%XMM1, %%XMM1, %%T2
2145
2146 ;;;;;;;;;;;;;;;;;;;;;;
2147
2148 vmovdqu %%T5, [%%GDATA + HashKey_5]
2149 vpshufd %%T2, %%XMM4, 01001110b
2150 vpshufd %%T3, %%T5, 01001110b
2151 vpxor %%T2, %%T2, %%XMM4
2152 vpxor %%T3, %%T3, %%T5
2153
2154 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2155 vpxor %%T6, %%T6, %%T4
2156
2157 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2158 vpxor %%T7, %%T7, %%T4
2159
2160 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2161
2162 vpxor %%XMM1, %%XMM1, %%T2
2163
2164 ;;;;;;;;;;;;;;;;;;;;;;
2165
2166 vmovdqu %%T5, [%%GDATA + HashKey_4]
2167 vpshufd %%T2, %%XMM5, 01001110b
2168 vpshufd %%T3, %%T5, 01001110b
2169 vpxor %%T2, %%T2, %%XMM5
2170 vpxor %%T3, %%T3, %%T5
2171
2172 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2173 vpxor %%T6, %%T6, %%T4
2174
2175 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2176 vpxor %%T7, %%T7, %%T4
2177
2178 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2179
2180 vpxor %%XMM1, %%XMM1, %%T2
2181
2182 ;;;;;;;;;;;;;;;;;;;;;;
2183
2184 vmovdqu %%T5, [%%GDATA + HashKey_3]
2185 vpshufd %%T2, %%XMM6, 01001110b
2186 vpshufd %%T3, %%T5, 01001110b
2187 vpxor %%T2, %%T2, %%XMM6
2188 vpxor %%T3, %%T3, %%T5
2189
2190 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2191 vpxor %%T6, %%T6, %%T4
2192
2193 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2194 vpxor %%T7, %%T7, %%T4
2195
2196 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2197
2198 vpxor %%XMM1, %%XMM1, %%T2
2199
2200 ;;;;;;;;;;;;;;;;;;;;;;
2201
2202 vmovdqu %%T5, [%%GDATA + HashKey_2]
2203 vpshufd %%T2, %%XMM7, 01001110b
2204 vpshufd %%T3, %%T5, 01001110b
2205 vpxor %%T2, %%T2, %%XMM7
2206 vpxor %%T3, %%T3, %%T5
2207
2208 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2209 vpxor %%T6, %%T6, %%T4
2210
2211 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2212 vpxor %%T7, %%T7, %%T4
2213
2214 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2215
2216 vpxor %%XMM1, %%XMM1, %%T2
2217
2218 ;;;;;;;;;;;;;;;;;;;;;;
2219
2220 vmovdqu %%T5, [%%GDATA + HashKey]
2221 vpshufd %%T2, %%XMM8, 01001110b
2222 vpshufd %%T3, %%T5, 01001110b
2223 vpxor %%T2, %%T2, %%XMM8
2224 vpxor %%T3, %%T3, %%T5
2225
2226 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
2227 vpxor %%T6, %%T6, %%T4
2228
2229 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
2230 vpxor %%T7, %%T7, %%T4
2231
2232 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2233
2234 vpxor %%XMM1, %%XMM1, %%T2
2235 vpxor %%XMM1, %%XMM1, %%T6
2236 vpxor %%T2, %%XMM1, %%T7
2237
2238
2239
2240
2241 vpslldq %%T4, %%T2, 8
2242 vpsrldq %%T2, %%T2, 8
2243
2244 vpxor %%T7, %%T7, %%T4
2245 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2246
2247 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2248 ;first phase of the reduction
2249        vmovdqa %%T3, [rel POLY2]
2250
2251 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2252        vpslldq %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
2253
2254 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2255 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2256
2257
2258 ;second phase of the reduction
2259 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2260 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2261
2262 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2263 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2264
2265 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2266 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2267 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2268%endmacro
2269
2270
2271; GHASH the last 7 ciphertext blocks.
2272%macro GHASH_LAST_7 15
2273%define %%GDATA %1
2274%define %%T1 %2
2275%define %%T2 %3
2276%define %%T3 %4
2277%define %%T4 %5
2278%define %%T5 %6
2279%define %%T6 %7
2280%define %%T7 %8
2281%define %%XMM1 %9
2282%define %%XMM2 %10
2283%define %%XMM3 %11
2284%define %%XMM4 %12
2285%define %%XMM5 %13
2286%define %%XMM6 %14
2287%define %%XMM7 %15
2288
2289 ;; Karatsuba Method
2290
2291 vmovdqu %%T5, [%%GDATA + HashKey_7]
2292
2293 vpshufd %%T2, %%XMM1, 01001110b
2294 vpshufd %%T3, %%T5, 01001110b
2295 vpxor %%T2, %%T2, %%XMM1
2296 vpxor %%T3, %%T3, %%T5
2297
2298 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
2299 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
2300
2301 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
2302
2303 ;;;;;;;;;;;;;;;;;;;;;;
2304
2305 vmovdqu %%T5, [%%GDATA + HashKey_6]
2306 vpshufd %%T2, %%XMM2, 01001110b
2307 vpshufd %%T3, %%T5, 01001110b
2308 vpxor %%T2, %%T2, %%XMM2
2309 vpxor %%T3, %%T3, %%T5
2310
2311 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
2312 vpxor %%T6, %%T6, %%T4
2313
2314 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
2315 vpxor %%T7, %%T7, %%T4
2316
2317 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2318
2319 vpxor %%XMM1, %%XMM1, %%T2
2320
2321 ;;;;;;;;;;;;;;;;;;;;;;
2322
2323 vmovdqu %%T5, [%%GDATA + HashKey_5]
2324 vpshufd %%T2, %%XMM3, 01001110b
2325 vpshufd %%T3, %%T5, 01001110b
2326 vpxor %%T2, %%T2, %%XMM3
2327 vpxor %%T3, %%T3, %%T5
2328
2329 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
2330 vpxor %%T6, %%T6, %%T4
2331
2332 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
2333 vpxor %%T7, %%T7, %%T4
2334
2335 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2336
2337 vpxor %%XMM1, %%XMM1, %%T2
2338
2339 ;;;;;;;;;;;;;;;;;;;;;;
2340
2341 vmovdqu %%T5, [%%GDATA + HashKey_4]
2342 vpshufd %%T2, %%XMM4, 01001110b
2343 vpshufd %%T3, %%T5, 01001110b
2344 vpxor %%T2, %%T2, %%XMM4
2345 vpxor %%T3, %%T3, %%T5
2346
2347 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
2348 vpxor %%T6, %%T6, %%T4
2349
2350 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
2351 vpxor %%T7, %%T7, %%T4
2352
2353 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2354
2355 vpxor %%XMM1, %%XMM1, %%T2
2356
2357 ;;;;;;;;;;;;;;;;;;;;;;
2358
2359 vmovdqu %%T5, [%%GDATA + HashKey_3]
2360 vpshufd %%T2, %%XMM5, 01001110b
2361 vpshufd %%T3, %%T5, 01001110b
2362 vpxor %%T2, %%T2, %%XMM5
2363 vpxor %%T3, %%T3, %%T5
2364
2365 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
2366 vpxor %%T6, %%T6, %%T4
2367
2368 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
2369 vpxor %%T7, %%T7, %%T4
2370
2371 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2372
2373 vpxor %%XMM1, %%XMM1, %%T2
2374
2375 ;;;;;;;;;;;;;;;;;;;;;;
2376
2377 vmovdqu %%T5, [%%GDATA + HashKey_2]
2378 vpshufd %%T2, %%XMM6, 01001110b
2379 vpshufd %%T3, %%T5, 01001110b
2380 vpxor %%T2, %%T2, %%XMM6
2381 vpxor %%T3, %%T3, %%T5
2382
2383 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
2384 vpxor %%T6, %%T6, %%T4
2385
2386 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
2387 vpxor %%T7, %%T7, %%T4
2388
2389 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2390
2391 vpxor %%XMM1, %%XMM1, %%T2
2392
2393 ;;;;;;;;;;;;;;;;;;;;;;
2394
2395 vmovdqu %%T5, [%%GDATA + HashKey_1]
2396 vpshufd %%T2, %%XMM7, 01001110b
2397 vpshufd %%T3, %%T5, 01001110b
2398 vpxor %%T2, %%T2, %%XMM7
2399 vpxor %%T3, %%T3, %%T5
2400
2401 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
2402 vpxor %%T6, %%T6, %%T4
2403
2404 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
2405 vpxor %%T7, %%T7, %%T4
2406
2407 vpclmulqdq %%T2, %%T2, %%T3, 0x00
2408
2409 vpxor %%XMM1, %%XMM1, %%T2
2410
2411 ;;;;;;;;;;;;;;;;;;;;;;
2412
2413 vpxor %%XMM1, %%XMM1, %%T6
2414 vpxor %%T2, %%XMM1, %%T7
2415
2416
2417
2418
2419 vpslldq %%T4, %%T2, 8
2420 vpsrldq %%T2, %%T2, 8
2421
2422 vpxor %%T7, %%T7, %%T4
2423 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2424
2425 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2426 ;first phase of the reduction
2427        vmovdqa %%T3, [rel POLY2]
2428
2429 vpclmulqdq %%T2, %%T3, %%T7, 0x01
2430        vpslldq %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
2431
2432 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
2433 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2434
2435
2436 ;second phase of the reduction
2437 vpclmulqdq %%T2, %%T3, %%T7, 0x00
2438 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2439
2440 vpclmulqdq %%T4, %%T3, %%T7, 0x10
2441 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2442
2443 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
2444 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2445 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
2446%endmacro
2447
2448
2449
2450;;; Handle encryption of the final partial block
2451;;; IN:
2452;;; r13 - Number of bytes to read
2453;;; MODIFIES:
2454;;; KEY - Key for encrypting the partial block
2455;;; HASH - Current hash value
2456;;; SMASHES:
2457;;; r10, r12, r15, rax
2458;;; T1, T2
2459;;; Note:
2460;;; PLAIN_CYPH_LEN, %6, is passed only to determine
2461;;; if buffer is big enough to do a 16 byte read & shift.
2462;;; 'LT16' is passed here only if buffer is known to be smaller
2463;;; than 16 bytes.
2464;;; Any other value passed here will result in 16 byte read
2465;;; code path.
2466;;; TBD: Remove HASH from the instantiation
2467%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
2468%define %%KEY %1
2469%define %%T1 %2
2470%define %%T2 %3
2471%define %%CYPH_PLAIN_OUT %4
2472%define %%PLAIN_CYPH_IN %5
2473%define %%PLAIN_CYPH_LEN %6
2474%define %%ENC_DEC %7
2475%define %%DATA_OFFSET %8
2476
2477        ;; NOTE: type of read tuned based on the %%PLAIN_CYPH_LEN setting
2478%ifidn %%PLAIN_CYPH_LEN, LT16
2479 ;; Handle the case where the message is < 16 bytes
2480 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
2481
2482 ;; T1 - packed output
2483 ;; r10 - input data address
2484 ;; r13 - input data length
2485 ;; r12, r15, rax - temp registers
2486 READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax
2487
2488 lea r12, [SHIFT_MASK + 16]
2489 sub r12, r13
2490%else
2491 ;; Handle the case where the message is >= 16 bytes
2492 sub %%DATA_OFFSET, 16
2493 add %%DATA_OFFSET, r13
2494        ;; Read in the last <16 byte block
2495 vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
2496 sub %%DATA_OFFSET, r13
2497 add %%DATA_OFFSET, 16
2498
2499 lea r12, [SHIFT_MASK + 16]
2500 ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
2501 ;; (r13 is the number of bytes in plaintext mod 16)
2502 sub r12, r13
2503 ;; Get the appropriate shuffle mask
2504 vmovdqu %%T2, [r12]
2505 ;; shift right 16-r13 bytes
2506 vpshufb %%T1, %%T2
2507%endif ; %%PLAIN_CYPH_LEN, LT16
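        ;; Illustrative example of the read & shift above: for r13 = 5 the
        ;; vmovdqu reads the 16 bytes ending at the last input byte, so the
        ;; 5 message bytes occupy the top of %%T1. The shuffle mask taken
        ;; from [SHIFT_MASK + 16 - r13] moves them to the bottom of %%T1;
        ;; the stale upper 16-r13 bytes are masked off further below with
        ;; the ALL_F based mask.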
2508
2509 ;; At this point T1 contains the partial block data
2510%ifidn %%ENC_DEC, DEC
2511 ;; Plaintext XOR E(K, Yn)
2512 ;; Set aside the ciphertext
2513 vmovdqa %%T2, %%T1
2514 vpxor %%KEY, %%KEY, %%T1
2515 ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
2516 vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
2517 ;; Mask out top 16-r13 bytes of ciphertext
2518 vpand %%KEY, %%KEY, %%T1
2519
2520 ;; Prepare the ciphertext for the hash
2521 ;; mask out top 16-r13 bytes of the plaintext
2522 vpand %%T2, %%T2, %%T1
2523%else
2524 ;; Plaintext XOR E(K, Yn)
2525 vpxor %%KEY, %%KEY, %%T1
2526 ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
2527 vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
2528 ;; Mask out top 16-r13 bytes of %%KEY
2529 vpand %%KEY, %%KEY, %%T1
2530%endif
2531
2532 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2533 ;; Output r13 Bytes
2534 vmovq rax, %%KEY
2535 cmp r13, 8
2536 jle %%_less_than_8_bytes_left
2537
2538 mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
2539 add %%DATA_OFFSET, 8
2540 vpsrldq %%T1, %%KEY, 8
2541 vmovq rax, %%T1
2542 sub r13, 8
2543
2544%%_less_than_8_bytes_left:
2545 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
2546 add %%DATA_OFFSET, 1
2547 shr rax, 8
2548 sub r13, 1
2549 jne %%_less_than_8_bytes_left
2550 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
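        ;; Illustrative example of the store sequence above: for r13 = 13
        ;; the qword store writes bytes 0-7, then the byte loop emits the
        ;; remaining 5 bytes one at a time, shifting rax right by 8 after
        ;; each store. Exactly r13 bytes are written, so the output buffer
        ;; is never overrun.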
2551
2552%ifidn %%ENC_DEC, DEC
2553 ;; If decrypt, restore the ciphertext into %%KEY
2554 vmovdqu %%KEY, %%T2
2555%endif
2556%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
2557
2558
2559
2560; Encryption of a single block
2561%macro ENCRYPT_SINGLE_BLOCK 2
2562%define %%GDATA %1
2563%define %%XMM0 %2
2564
2565 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
2566%assign i 1
2567%rep NROUNDS
2568 vaesenc %%XMM0, [%%GDATA+16*i]
2569%assign i (i+1)
2570%endrep
2571 vaesenclast %%XMM0, [%%GDATA+16*i]
2572%endmacro
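;; A single AES encryption above is: whitening XOR with round key 0,
;; NROUNDS middle rounds (vaesenc) and one final round (vaesenclast) with
;; round key NROUNDS+1. NROUNDS is assumed to be 9/11/13 for
;; AES-128/192/256, i.e. 16*(NROUNDS+2) bytes of round keys at %%GDATA.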
2573
2574
2575;; Start of Stack Setup
2576
2577%macro FUNC_SAVE 0
2578 ;; Required for Update/GMC_ENC
2579 ;the number of pushes must equal STACK_OFFSET
2580 push r12
2581 push r13
2582 push r14
2583 push r15
2584 mov r14, rsp
2585
2586 sub rsp, VARIABLE_OFFSET
2587 and rsp, ~63
2588
2589%ifidn __OUTPUT_FORMAT__, win64
2590 ; xmm6:xmm15 need to be maintained for Windows
2591 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2592 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
2593 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
2594 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
2595 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
2596 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
2597 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
2598 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
2599 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
2600 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
2601%endif
2602%endmacro
2603
2604
2605%macro FUNC_RESTORE 0
2606
2607%ifidn __OUTPUT_FORMAT__, win64
2608 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
2609 vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
2610 vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
2611 vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
2612 vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
2613 vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
2614 vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
2615 vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
2616 vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
2617 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2618%endif
2619
2620;; Required for Update/GMC_ENC
2621 mov rsp, r14
2622 pop r15
2623 pop r14
2624 pop r13
2625 pop r12
2626%endmacro
2627
2628
2629;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2630; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
2631; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
2632; Additional Authentication data (A_IN), Additional Data length (A_LEN).
2633; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
2634; Clobbers rax, r10-r13, and xmm0-xmm6
2635;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2636%macro GCM_INIT 5
2637%define %%GDATA_KEY %1
2638%define %%GDATA_CTX %2
2639%define %%IV %3
2640%define %%A_IN %4
2641%define %%A_LEN %5
2642%define %%AAD_HASH xmm14
2643
2644
2645 mov r10, %%A_LEN
2646 cmp r10, 0
2647 je %%_aad_is_zero
2648
2649        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
2650 jmp %%_after_aad
2651
2652%%_aad_is_zero:
2653 vpxor %%AAD_HASH, %%AAD_HASH
2654
2655%%_after_aad:
2656 mov r10, %%A_LEN
2657 vpxor xmm2, xmm3
2658
2659 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
2660 mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
2661 xor r10, r10
2662 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
2663 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
2664 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
2665 mov r10, %%IV
2666 vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
2667 vpinsrq xmm2, [r10], 0
2668 vpinsrd xmm2, [r10+8], 2
2669 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
2670
2671        vpshufb  xmm2, [rel SHUF_MASK]
2672
2673 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
2674%endmacro
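;; Note on the IV handling above: this is the 96-bit IV case of the GCM
;; spec, Y0 = IV || 0^31 || 1. ONEf supplies the trailing 0x00000001
;; padding and the two vpinsr instructions overlay the 12 IV bytes, so
;; OrigIV holds Y0 and CurCount holds its byte-reflected form.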
2675
2676%macro GCM_ENC_DEC_SMALL 12
2677%define %%GDATA_KEY %1
2678%define %%GDATA_CTX %2
2679%define %%CYPH_PLAIN_OUT %3
2680%define %%PLAIN_CYPH_IN %4
2681%define %%PLAIN_CYPH_LEN %5
2682%define %%ENC_DEC %6
2683%define %%DATA_OFFSET %7
2684%define %%LENGTH %8
2685%define %%NUM_BLOCKS %9
2686%define %%CTR %10
2687%define %%HASH %11
2688%define %%INSTANCE_TYPE %12
2689
2690        ;; NOTE: the check below is obsolete in the current implementation. The check is already done in GCM_ENC_DEC.
2691 ;; cmp %%NUM_BLOCKS, 0
2692 ;; je %%_small_initial_blocks_encrypted
2693 cmp %%NUM_BLOCKS, 8
2694 je %%_small_initial_num_blocks_is_8
2695 cmp %%NUM_BLOCKS, 7
2696 je %%_small_initial_num_blocks_is_7
2697 cmp %%NUM_BLOCKS, 6
2698 je %%_small_initial_num_blocks_is_6
2699 cmp %%NUM_BLOCKS, 5
2700 je %%_small_initial_num_blocks_is_5
2701 cmp %%NUM_BLOCKS, 4
2702 je %%_small_initial_num_blocks_is_4
2703 cmp %%NUM_BLOCKS, 3
2704 je %%_small_initial_num_blocks_is_3
2705 cmp %%NUM_BLOCKS, 2
2706 je %%_small_initial_num_blocks_is_2
2707
2708 jmp %%_small_initial_num_blocks_is_1
2709
2710
2711%%_small_initial_num_blocks_is_8:
2712 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2713 jmp %%_small_initial_blocks_encrypted
2714
2715%%_small_initial_num_blocks_is_7:
2716 ;; r13 - %%LENGTH
2717 ;; xmm12 - T1
2718 ;; xmm13 - T2
2719 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2720 ;; xmm15 - T4
2721 ;; xmm11 - T5
2722 ;; xmm9 - CTR
2723 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2724 ;; xmm2 - XMM2
2725 ;; xmm3 - XMM3
2726 ;; xmm4 - XMM4
2727 ;; xmm5 - XMM5
2728 ;; xmm6 - XMM6
2729 ;; xmm7 - XMM7
2730 ;; xmm8 - XMM8 - AAD HASH IN
2731 ;; xmm10 - T6
2732 ;; xmm0 - T_key
2733 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2734 jmp %%_small_initial_blocks_encrypted
2735
2736%%_small_initial_num_blocks_is_6:
2737 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2738 jmp %%_small_initial_blocks_encrypted
2739
2740%%_small_initial_num_blocks_is_5:
2741 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2742 jmp %%_small_initial_blocks_encrypted
2743
2744%%_small_initial_num_blocks_is_4:
2745 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2746 jmp %%_small_initial_blocks_encrypted
2747
2748%%_small_initial_num_blocks_is_3:
2749 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2750 jmp %%_small_initial_blocks_encrypted
2751
2752%%_small_initial_num_blocks_is_2:
2753 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2754 jmp %%_small_initial_blocks_encrypted
2755
2756%%_small_initial_num_blocks_is_1:
2757 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2758
2759 ;; Note: zero initial blocks not allowed.
2760
2761%%_small_initial_blocks_encrypted:
2762
2763%endmacro ; GCM_ENC_DEC_SMALL
2764
2765;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2766; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
2767; has been initialized by GCM_INIT
2768; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
2769; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
2770; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
2771; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
2772; Clobbers rax, r10-r15, and xmm0-xmm15
2773;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2774%macro GCM_ENC_DEC 7
2775%define %%GDATA_KEY %1
2776%define %%GDATA_CTX %2
2777%define %%CYPH_PLAIN_OUT %3
2778%define %%PLAIN_CYPH_IN %4
2779%define %%PLAIN_CYPH_LEN %5
2780%define %%ENC_DEC %6
2781%define %%INSTANCE_TYPE %7
2782%define %%DATA_OFFSET r11
2783
2784; Macro flow:
2785; calculate the number of 16byte blocks in the message
2786; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
2787; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
2788; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
2789
2790 cmp %%PLAIN_CYPH_LEN, 0
2791 je %%_enc_dec_done
2792
2793 xor %%DATA_OFFSET, %%DATA_OFFSET
2794 ;; Update length of data processed
2795%ifidn __OUTPUT_FORMAT__, win64
2796 mov rax, %%PLAIN_CYPH_LEN
2797 add [%%GDATA_CTX + InLen], rax
2798%else
2799 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
2800%endif
2801 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2802 vmovdqu xmm8, [%%GDATA_CTX + AadHash]
2803
2804%ifidn %%INSTANCE_TYPE, multi_call
2805        ;; NOTE: partial block processing only makes sense for multi_call here.
2806 ;; Used for the update flow - if there was a previous partial
2807 ;; block fill the remaining bytes here.
2808 PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
2809%endif
2810
2811 ;; lift CTR set from initial_blocks to here
2812%ifidn %%INSTANCE_TYPE, single_call
2813 vmovdqu xmm9, xmm2
2814%else
2815 vmovdqu xmm9, [%%GDATA_CTX + CurCount]
2816%endif
2817
2818        ;; Save the amount of data left to process in r13
2819 mov r13, %%PLAIN_CYPH_LEN
2820%ifidn %%INSTANCE_TYPE, multi_call
2821 ;; NOTE: %%DATA_OFFSET is zero in single_call case.
2822 ;; Consequently PLAIN_CYPH_LEN will never be zero after
2823 ;; %%DATA_OFFSET subtraction below.
2824 sub r13, %%DATA_OFFSET
2825
2826 ;; There may be no more data if it was consumed in the partial block.
2827 cmp r13, 0
2828 je %%_enc_dec_done
2829%endif ; %%INSTANCE_TYPE, multi_call
2830 mov r10, r13
2831
2832 ;; Determine how many blocks to process in INITIAL
2833 mov r12, r13
2834 shr r12, 4
2835 and r12, 7
2836
2837 ;; Process one additional block in INITIAL if there is a partial block
2838 and r10, 0xf
2839 blsmsk r10, r10 ; Set CF if zero
2840 cmc ; Flip CF
2841 adc r12, 0x0 ; Process an additional INITIAL block if CF set
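        ;; Illustrative example of the branchless round-up above: for
        ;; r13 = 145 (9 blocks plus 1 byte), r10 = 145 & 0xf = 1, blsmsk
        ;; clears CF (source non-zero), cmc sets it and adc gives the
        ;; partial block an extra INITIAL slot. For an exact multiple of
        ;; 16, r10 = 0 sets CF, cmc clears it and r12 is left unchanged.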
2842
2843        ;; Fewer than 128B will be handled by the small message code, which
2844        ;; can process up to 8 16B blocks (at most 7 full blocks plus a partial).
2845 cmp r13, 128
2846 jge %%_large_message_path
2847
2848 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
2849 jmp %%_ghash_done
2850
2851%%_large_message_path:
2852        and     r12, 0x7        ; Still, don't allow 8 INITIAL blocks since this
2853                                ; can be handled by the x8 partial loop.
2854
2855 cmp r12, 0
2856 je %%_initial_num_blocks_is_0
2857 cmp r12, 7
2858 je %%_initial_num_blocks_is_7
2859 cmp r12, 6
2860 je %%_initial_num_blocks_is_6
2861 cmp r12, 5
2862 je %%_initial_num_blocks_is_5
2863 cmp r12, 4
2864 je %%_initial_num_blocks_is_4
2865 cmp r12, 3
2866 je %%_initial_num_blocks_is_3
2867 cmp r12, 2
2868 je %%_initial_num_blocks_is_2
2869
2870 jmp %%_initial_num_blocks_is_1
2871
2872%%_initial_num_blocks_is_7:
2873 ;; r13 - %%LENGTH
2874 ;; xmm12 - T1
2875 ;; xmm13 - T2
2876 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
2877 ;; xmm15 - T4
2878 ;; xmm11 - T5
2879 ;; xmm9 - CTR
2880 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
2881 ;; xmm2 - XMM2
2882 ;; xmm3 - XMM3
2883 ;; xmm4 - XMM4
2884 ;; xmm5 - XMM5
2885 ;; xmm6 - XMM6
2886 ;; xmm7 - XMM7
2887 ;; xmm8 - XMM8 - AAD HASH IN
2888 ;; xmm10 - T6
2889 ;; xmm0 - T_key
2890 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2891 jmp %%_initial_blocks_encrypted
2892
2893%%_initial_num_blocks_is_6:
2894 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2895 jmp %%_initial_blocks_encrypted
2896
2897%%_initial_num_blocks_is_5:
2898 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2899 jmp %%_initial_blocks_encrypted
2900
2901%%_initial_num_blocks_is_4:
2902 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2903 jmp %%_initial_blocks_encrypted
2904
2905%%_initial_num_blocks_is_3:
2906 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2907 jmp %%_initial_blocks_encrypted
2908
2909%%_initial_num_blocks_is_2:
2910 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2911 jmp %%_initial_blocks_encrypted
2912
2913%%_initial_num_blocks_is_1:
2914 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2915 jmp %%_initial_blocks_encrypted
2916
2917%%_initial_num_blocks_is_0:
2918 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2919
2920
2921%%_initial_blocks_encrypted:
2922        ;; The entire message may have been processed in INITIAL; if so, only the hash remains
2923 cmp r13, 0
2924 je %%_encrypt_done
2925
2926 ;; Encrypt the final <16 byte (partial) block, then hash
2927 cmp r13, 16
2928 jl %%_encrypt_final_partial
2929
2930 ;; Process 7 full blocks plus a partial block
2931 cmp r13, 128
2932 jl %%_encrypt_by_8_partial
2933
2934
2935%%_encrypt_by_8_parallel:
2936 ;; in_order vs. out_order is an optimization to increment the counter without shuffling
2937        ;; it back into little endian. r15d keeps track of when we need to increment in order so
2938 ;; that the carry is handled correctly.
2939 vmovd r15d, xmm9
2940 and r15d, 255
2941 vpshufb xmm9, [rel SHUF_MASK]
2942
2943
2944%%_encrypt_by_8_new:
2945 cmp r15d, 255-8
2946 jg %%_encrypt_by_8
2947
2948
2949
2950 ;; xmm0 - T1
2951 ;; xmm10 - T2
2952 ;; xmm11 - T3
2953 ;; xmm12 - T4
2954 ;; xmm13 - T5
2955 ;; xmm14 - T6
2956 ;; xmm9 - CTR
2957 ;; xmm1 - XMM1
2958 ;; xmm2 - XMM2
2959 ;; xmm3 - XMM3
2960 ;; xmm4 - XMM4
2961 ;; xmm5 - XMM5
2962 ;; xmm6 - XMM6
2963 ;; xmm7 - XMM7
2964 ;; xmm8 - XMM8
2965 ;; xmm15 - T7
2966 add r15b, 8
2967 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
2968 add %%DATA_OFFSET, 128
2969 sub r13, 128
2970 cmp r13, 128
2971 jge %%_encrypt_by_8_new
2972
2973        vpshufb xmm9, [rel SHUF_MASK]
2974 jmp %%_encrypt_by_8_parallel_done
2975
2976%%_encrypt_by_8:
2977        vpshufb xmm9, [rel SHUF_MASK]
2978 add r15b, 8
2979 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
2980        vpshufb xmm9, [rel SHUF_MASK]
2981 add %%DATA_OFFSET, 128
2982 sub r13, 128
2983 cmp r13, 128
2984 jge %%_encrypt_by_8_new
2985        vpshufb xmm9, [rel SHUF_MASK]
2986
2987
2988%%_encrypt_by_8_parallel_done:
2989        ;; Test to see if we need a by-8 pass with a partial block. At this point
2990        ;; the bytes remaining should be either zero or between 113-127.
2991 cmp r13, 0
2992 je %%_encrypt_done
2993
2994%%_encrypt_by_8_partial:
2995 ;; Shuffle needed to align key for partial block xor. out_order
2996 ;; is a little faster because it avoids extra shuffles.
2997 ;; TBD: Might need to account for when we don't have room to increment the counter.
2998
2999
3000 ;; Process parallel buffers with a final partial block.
3001 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
3002
3003
3004 add %%DATA_OFFSET, 128-16
3005 sub r13, 128-16
3006
3007%%_encrypt_final_partial:
3008
3009        vpshufb xmm8, [rel SHUF_MASK]
3010 mov [%%GDATA_CTX + PBlockLen], r13
3011 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
3012
3013 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
3014 ;; GDATA, KEY, T1, T2
3015 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
3016
3017        vpshufb xmm8, [rel SHUF_MASK]
3018
3019
3020%%_encrypt_done:
3021
3022 ;; Mapping to macro parameters
3023 ;; IN:
3024 ;; xmm9 contains the counter
3025 ;; xmm1-xmm8 contain the xor'd ciphertext
3026 ;; OUT:
3027 ;; xmm14 contains the final hash
3028 ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3029%ifidn %%INSTANCE_TYPE, multi_call
3030 mov r13, [%%GDATA_CTX + PBlockLen]
3031 cmp r13, 0
3032 jz %%_hash_last_8
3033 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
3034 ;; XOR the partial word into the hash
3035 vpxor xmm14, xmm14, xmm8
3036 jmp %%_ghash_done
3037%endif
3038%%_hash_last_8:
3039 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3040
3041%%_ghash_done:
3042 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
3043 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
3044
3045%%_enc_dec_done:
3046
3047
3048%endmacro
3049
3050
3051;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3052; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
3053; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
3054; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
3055; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
3056;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3057%macro GCM_COMPLETE 6
3058%define %%GDATA_KEY %1
3059%define %%GDATA_CTX %2
3060%define %%AUTH_TAG %3
3061%define %%AUTH_TAG_LEN %4
3062%define %%ENC_DEC %5
3063%define %%INSTANCE_TYPE %6
3064%define %%PLAIN_CYPH_LEN rax
3065
3066 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
3067 ;; Start AES as early as possible
3068 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
3069 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
3070
3071%ifidn %%INSTANCE_TYPE, multi_call
3072        ;; In the multi_call case the AadHash must be re-read from the context.
3073        ;; When GCM runs as a single function call (init, update and finalize
3074        ;; fused) xmm14 still holds it, removing a write-to-read dependency on AadHash.
3075 vmovdqu xmm14, [%%GDATA_CTX + AadHash]
3076
3077 ;; Encrypt the final partial block. If we did this as a single call then
3078 ;; the partial block was handled in the main GCM_ENC_DEC macro.
3079 mov r12, [%%GDATA_CTX + PBlockLen]
3080 cmp r12, 0
3081
3082 je %%_partial_done
3083
3084 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
3085 vmovdqu [%%GDATA_CTX + AadHash], xmm14
3086
3087%%_partial_done:
3088
3089%endif
3090
3091 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
3092 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
3093
3094 shl r12, 3 ; convert into number of bits
3095 vmovd xmm15, r12d ; len(A) in xmm15
3096
3097        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
3098 vmovq xmm1, %%PLAIN_CYPH_LEN
3099 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
3100 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
3101
3102 vpxor xmm14, xmm15
3103 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
3104        vpshufb xmm14, [rel SHUF_MASK]          ; perform a 16Byte swap
3105
3106 vpxor xmm9, xmm9, xmm14
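        ;; At this point the tag follows the GCM spec: xmm14 holds
        ;; GHASH(H, A, C) after absorbing the 64-bit bit lengths
        ;; len(A)||len(C), xmm9 holds E(K, Y0), and their XOR is the
        ;; untruncated 16-byte authentication tag.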
3107
3108
3109%%_return_T:
3110 mov r10, %%AUTH_TAG ; r10 = authTag
3111 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
3112
3113 cmp r11, 16
3114 je %%_T_16
3115
3116 cmp r11, 12
3117 je %%_T_12
3118
3119 cmp r11, 8
3120 je %%_T_8
3121
3122 simd_store_avx r10, xmm9, r11, r12, rax
3123 jmp %%_return_T_done
3124%%_T_8:
3125        vmovq   rax, xmm9
3126 mov [r10], rax
3127 jmp %%_return_T_done
3128%%_T_12:
3129        vmovq   rax, xmm9
3130 mov [r10], rax
3131 vpsrldq xmm9, xmm9, 8
3132        vmovd   eax, xmm9
3133 mov [r10 + 8], eax
3134 jmp %%_return_T_done
3135%%_T_16:
3136 vmovdqu [r10], xmm9
3137
3138%%_return_T_done:
3139%endmacro ; GCM_COMPLETE
3140
3141
3142;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3143;void aes_gcm_precomp_128_avx_gen4 /
3144; aes_gcm_precomp_192_avx_gen4 /
3145; aes_gcm_precomp_256_avx_gen4
3146; (struct gcm_key_data *key_data)
3147;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3148MKGLOBAL(FN_NAME(precomp,_),function,)
3149FN_NAME(precomp,_):
3150 push r12
3151 push r13
3152 push r14
3153 push r15
3154
3155 mov r14, rsp
3156
3157
3158
3159 sub rsp, VARIABLE_OFFSET
3160 and rsp, ~63 ; align rsp to 64 bytes
3161
3162%ifidn __OUTPUT_FORMAT__, win64
3163 ; only xmm6 needs to be maintained
3164 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
3165%endif
3166
3167 vpxor xmm6, xmm6
3168 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
3169
3170 vpshufb xmm6, [rel SHUF_MASK]
3171 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
3172 vmovdqa xmm2, xmm6
3173 vpsllq xmm6, xmm6, 1
3174 vpsrlq xmm2, xmm2, 63
3175 vmovdqa xmm1, xmm2
3176 vpslldq xmm2, xmm2, 8
3177 vpsrldq xmm1, xmm1, 8
3178 vpor xmm6, xmm6, xmm2
3179 ;reduction
3180 vpshufd xmm2, xmm1, 00100100b
3181 vpcmpeqd xmm2, [rel TWOONE]
3182 vpand xmm2, xmm2, [rel POLY]
3183 vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
3184 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3185 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
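        ;; A sketch of the doubling just performed: the GHASH key is stored
        ;; as HashKey<<1 mod poly, a multiplication by x in GF(2^128) in the
        ;; bit-reflected representation. The bit shifted out at the top is
        ;; detected with the TWOONE compare and, when set, folded back in by
        ;; XORing the POLY constant, i.e. a branchless conditional reduction.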
3186
3187
3188 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
3189
3190%ifidn __OUTPUT_FORMAT__, win64
3191 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
3192%endif
3193 mov rsp, r14
3194
3195 pop r15
3196 pop r14
3197 pop r13
3198 pop r12
3199 ret
3200
3201
3202;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3203;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
3204; (const struct gcm_key_data *key_data,
3205; struct gcm_context_data *context_data,
3206; u8 *iv,
3207; const u8 *aad,
3208; u64 aad_len);
3209;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3210MKGLOBAL(FN_NAME(init,_),function,)
3211FN_NAME(init,_):
3212 push r12
3213 push r13
3214%ifidn __OUTPUT_FORMAT__, win64
3215 push r14
3216 push r15
3217 mov r14, rsp
3218 ; xmm6:xmm15 need to be maintained for Windows
3219 sub rsp, 1*16
3220 movdqu [rsp + 0*16], xmm6
3221%endif
3222
3223 GCM_INIT arg1, arg2, arg3, arg4, arg5
3224
3225%ifidn __OUTPUT_FORMAT__, win64
3226        movdqu  xmm6, [rsp + 0*16]
3227 mov rsp, r14
3228 pop r15
3229 pop r14
3230%endif
3231 pop r13
3232 pop r12
3233 ret
3234
3235
3236;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3237;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
3238;       aes_gcm_enc_256_update_avx_gen4
3239; (const struct gcm_key_data *key_data,
3240; struct gcm_context_data *context_data,
3241; u8 *out,
3242; const u8 *in,
3243; u64 plaintext_len);
3244;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3245MKGLOBAL(FN_NAME(enc,_update_),function,)
3246FN_NAME(enc,_update_):
3247
3248 FUNC_SAVE
3249
3250 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
3251
3252 FUNC_RESTORE
3253
3254 ret
3255
3256
3257;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3258;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
3259; aes_gcm_dec_256_update_avx_gen4
3260; (const struct gcm_key_data *key_data,
3261; struct gcm_context_data *context_data,
3262; u8 *out,
3263; const u8 *in,
3264; u64 plaintext_len);
3265;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3266MKGLOBAL(FN_NAME(dec,_update_),function,)
3267FN_NAME(dec,_update_):
3268
3269 FUNC_SAVE
3270
3271 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
3272
3273 FUNC_RESTORE
3274
3275 ret
3276
3277
3278;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3279;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
3280; aes_gcm_enc_256_finalize_avx_gen4
3281; (const struct gcm_key_data *key_data,
3282; struct gcm_context_data *context_data,
3283; u8 *auth_tag,
3284; u64 auth_tag_len);
3285;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3286MKGLOBAL(FN_NAME(enc,_finalize_),function,)
3287FN_NAME(enc,_finalize_):
3288
3289 push r12
3290
3291%ifidn __OUTPUT_FORMAT__, win64
3292 ; xmm6:xmm15 need to be maintained for Windows
3293 sub rsp, 5*16
3294 vmovdqu [rsp + 0*16], xmm6
3295 vmovdqu [rsp + 1*16], xmm9
3296 vmovdqu [rsp + 2*16], xmm11
3297 vmovdqu [rsp + 3*16], xmm14
3298 vmovdqu [rsp + 4*16], xmm15
3299%endif
3300 GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
3301
3302%ifidn __OUTPUT_FORMAT__, win64
3303 vmovdqu xmm15, [rsp + 4*16]
3304 vmovdqu xmm14, [rsp + 3*16]
3305 vmovdqu xmm11, [rsp + 2*16]
3306 vmovdqu xmm9, [rsp + 1*16]
3307 vmovdqu xmm6, [rsp + 0*16]
3308 add rsp, 5*16
3309%endif
3310
3311 pop r12
3312        ret
3313
3314
3315;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3316;void   aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 /
3317; aes_gcm_dec_256_finalize_avx_gen4
3318; (const struct gcm_key_data *key_data,
3319; struct gcm_context_data *context_data,
3320; u8 *auth_tag,
3321; u64 auth_tag_len);
3322;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3323MKGLOBAL(FN_NAME(dec,_finalize_),function,)
3324FN_NAME(dec,_finalize_):
3325
3326 push r12
3327
3328%ifidn __OUTPUT_FORMAT__, win64
3329 ; xmm6:xmm15 need to be maintained for Windows
3330 sub rsp, 5*16
3331 vmovdqu [rsp + 0*16], xmm6
3332 vmovdqu [rsp + 1*16], xmm9
3333 vmovdqu [rsp + 2*16], xmm11
3334 vmovdqu [rsp + 3*16], xmm14
3335 vmovdqu [rsp + 4*16], xmm15
3336%endif
3337 GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
3338
3339%ifidn __OUTPUT_FORMAT__, win64
3340 vmovdqu xmm15, [rsp + 4*16]
3341 vmovdqu xmm14, [rsp + 3*16]
3342 vmovdqu xmm11, [rsp + 2*16]
3343 vmovdqu xmm9, [rsp + 1*16]
3344 vmovdqu xmm6, [rsp + 0*16]
3345 add rsp, 5*16
3346%endif
3347
3348 pop r12
3349 ret
3350
3351
3352;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3353;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
3354; (const struct gcm_key_data *key_data,
3355; struct gcm_context_data *context_data,
3356; u8 *out,
3357; const u8 *in,
3358; u64 plaintext_len,
3359; u8 *iv,
3360; const u8 *aad,
3361; u64 aad_len,
3362; u8 *auth_tag,
3363; u64 auth_tag_len);
3364;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3365MKGLOBAL(FN_NAME(enc,_),function,)
3366FN_NAME(enc,_):
3367
3368 FUNC_SAVE
3369
3370 GCM_INIT arg1, arg2, arg6, arg7, arg8
3371
3372 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
3373
3374 GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
3375
3376 FUNC_RESTORE
3377
3378 ret
3379
3380;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3381;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
3382; (const struct gcm_key_data *key_data,
3383; struct gcm_context_data *context_data,
3384; u8 *out,
3385; const u8 *in,
3386; u64 plaintext_len,
3387; u8 *iv,
3388; const u8 *aad,
3389; u64 aad_len,
3390; u8 *auth_tag,
3391; u64 auth_tag_len);
3392;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3393MKGLOBAL(FN_NAME(dec,_),function,)
3394FN_NAME(dec,_):
3395
3396 FUNC_SAVE
3397
3398 GCM_INIT arg1, arg2, arg6, arg7, arg8
3399
3400 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
3401
3402 GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
3403
3404 FUNC_RESTORE
3405
3406 ret
3407
3408%ifdef LINUX
3409section .note.GNU-stack noalloc noexec nowrite progbits
3410%endif