1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 ;
31 ; Authors:
32 ; Erdinc Ozturk
33 ; Vinodh Gopal
34 ; James Guilford
35 ;
36 ;
37 ; References:
38 ; This code was derived and highly optimized from the code described in paper:
39 ; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
40 ;
41 ; For the shift-based reductions used in this code, we used the method described in paper:
42 ; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
43 ;
44 ;
45 ;
46 ;
47 ; Assumptions:
48 ;
49 ;
50 ;
51 ; iv:
52 ; 0 1 2 3
53 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
54 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
55 ; | Salt (From the SA) |
56 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
57 ; | Initialization Vector |
58 ; | (This is the sequence number from IPSec header) |
59 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
60 ; | 0x1 |
61 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ;
63 ;
64 ;
65 ; AAD:
66 ; AAD will be padded with 0 to the next 16byte multiple
67 ; for example, assume AAD is a u32 vector
68 ;
69 ; if AAD is 8 bytes:
70 ; AAD[3] = {A0, A1};
71 ; padded AAD in xmm register = {A1 A0 0 0}
72 ;
73 ; 0 1 2 3
74 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
75 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
76 ; | SPI (A1) |
77 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
78 ; | 32-bit Sequence Number (A0) |
79 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
80 ; | 0x0 |
81 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82 ;
83 ; AAD Format with 32-bit Sequence Number
84 ;
85 ; if AAD is 12 bytes:
86 ; AAD[3] = {A0, A1, A2};
87 ; padded AAD in xmm register = {A2 A1 A0 0}
88 ;
89 ; 0 1 2 3
90 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
91 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92 ; | SPI (A2) |
93 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
94 ; | 64-bit Extended Sequence Number {A1,A0} |
95 ; | |
96 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
97 ; | 0x0 |
98 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ;
100 ; AAD Format with 64-bit Extended Sequence Number
101 ;
102 ;
103 ; aadLen:
104 ; Per the definition of the spec, aadLen must be a multiple of 4 bytes.
105 ; The code additionally supports an aadLen of any length.
106 ;
107 ; TLen:
108 ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
109 ;
110 ; poly = x^128 + x^127 + x^126 + x^121 + 1
111 ; throughout the code, one tab and two tab indentations are used. one tab is for the GHASH part, two tabs are for the AES part.
112 ;
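;
; As an illustration of the iv layout above (a hedged C-style sketch, not part
; of this implementation; the j0/salt/iv names follow the init prototype
; comment further below):
;
;       uint8_t j0[16];
;       memcpy(j0, salt, 4);            /* 4-byte salt from the SA            */
;       memcpy(j0 + 4, iv, 8);          /* 8-byte IV (IPSec sequence number)  */
;       j0[12] = 0; j0[13] = 0;
;       j0[14] = 0; j0[15] = 1;         /* trailing 0x00000001                */
;
; The resulting 16 bytes are what aesni_gcm128_init_avx_gen2 expects through
; its iv pointer.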
113
114 %include "reg_sizes.asm"
115 %include "gcm_defines.asm"
116
117 default rel
118 ; 4 callee-saved registers are pushed onto the stack, so STACK_OFFSET = 8*4
119 %define STACK_OFFSET 8*4
120
121 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
122 %define TMP3 16*1 ; Temporary storage for AES State 3
123 %define TMP4 16*2 ; Temporary storage for AES State 4
124 %define TMP5 16*3 ; Temporary storage for AES State 5
125 %define TMP6 16*4 ; Temporary storage for AES State 6
126 %define TMP7 16*5 ; Temporary storage for AES State 7
127 %define TMP8 16*6 ; Temporary storage for AES State 8
128
129 %define LOCAL_STORAGE 16*7
130
131 %ifidn __OUTPUT_FORMAT__, win64
132 %define XMM_STORAGE 16*10
133 %else
134 %define XMM_STORAGE 0
135 %endif
136
137 %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
138
139 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
140 ; Utility Macros
141 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
142
143 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
144 ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
145 ; Input: A and B (128-bits each, bit-reflected)
146 ; Output: C = A*B*x mod poly, (i.e. >>1 )
147 ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
148 ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
149 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
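; A short worked note on the Karatsuba step below (added for clarity):
; split the 128-bit operands into 64-bit halves, GH = a1*x^64 + a0 and
; HK = b1*x^64 + b0. Then
;       GH*HK = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
; and the middle term is obtained as (a1+a0)*(b1+b0) + a1*b1 + a0*b0
; (addition is XOR in GF(2)), so only three VPCLMULQDQ are needed instead of
; four; the two reduction phases then fold the 256-bit product back mod poly.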
150 %macro GHASH_MUL 7
151 %define %%GH %1 ; 16 Bytes
152 %define %%HK %2 ; 16 Bytes
153 %define %%T1 %3
154 %define %%T2 %4
155 %define %%T3 %5
156 %define %%T4 %6
157 %define %%T5 %7
158 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
159 ;; Karatsuba
160 vpshufd %%T2, %%GH, 01001110b
161 vpshufd %%T3, %%HK, 01001110b
162 vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
163 vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
164
165 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
166 vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
167 vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
168 vpxor %%T2, %%T2, %%GH
169 vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
170
171 vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
172 vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
173 vpxor %%GH, %%GH, %%T3
174 vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
175
176 ;first phase of the reduction
177 vpslld %%T2, %%GH, 31 ; packed left shifting << 31
178 vpslld %%T3, %%GH, 30 ; packed left shifting << 30
179 vpslld %%T4, %%GH, 25 ; packed left shifting << 25
180
181 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
182 vpxor %%T2, %%T2, %%T4
183
184 vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
185
186 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
187 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
188 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
189
190 ;second phase of the reduction
191
192 vpsrld %%T2,%%GH,1 ; packed right shifting >> 1
193 vpsrld %%T3,%%GH,2 ; packed right shifting >> 2
194 vpsrld %%T4,%%GH,7 ; packed right shifting >> 7
195 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
196 vpxor %%T2, %%T2, %%T4
197
198 vpxor %%T2, %%T2, %%T5
199 vpxor %%GH, %%GH, %%T2
200 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
201
202
203 %endmacro
204
205
206 %macro PRECOMPUTE 8
207 %define %%GDATA %1
208 %define %%HK %2
209 %define %%T1 %3
210 %define %%T2 %4
211 %define %%T3 %5
212 %define %%T4 %6
213 %define %%T5 %7
214 %define %%T6 %8
215
216 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
217 ; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
218 vmovdqa %%T5, %%HK
219
220 vpshufd %%T1, %%T5, 01001110b
221 vpxor %%T1, %%T5
222 vmovdqu [%%GDATA + HashKey_k], %%T1
223
224 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
225 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
226 vpshufd %%T1, %%T5, 01001110b
227 vpxor %%T1, %%T5
228 vmovdqu [%%GDATA + HashKey_2_k], %%T1
229
230 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
231 vmovdqu [%%GDATA + HashKey_3], %%T5
232 vpshufd %%T1, %%T5, 01001110b
233 vpxor %%T1, %%T5
234 vmovdqu [%%GDATA + HashKey_3_k], %%T1
235
236 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
237 vmovdqu [%%GDATA + HashKey_4], %%T5
238 vpshufd %%T1, %%T5, 01001110b
239 vpxor %%T1, %%T5
240 vmovdqu [%%GDATA + HashKey_4_k], %%T1
241
242 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
243 vmovdqu [%%GDATA + HashKey_5], %%T5
244 vpshufd %%T1, %%T5, 01001110b
245 vpxor %%T1, %%T5
246 vmovdqu [%%GDATA + HashKey_5_k], %%T1
247
248 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
249 vmovdqu [%%GDATA + HashKey_6], %%T5
250 vpshufd %%T1, %%T5, 01001110b
251 vpxor %%T1, %%T5
252 vmovdqu [%%GDATA + HashKey_6_k], %%T1
253
254 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
255 vmovdqu [%%GDATA + HashKey_7], %%T5
256 vpshufd %%T1, %%T5, 01001110b
257 vpxor %%T1, %%T5
258 vmovdqu [%%GDATA + HashKey_7_k], %%T1
259
260 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
261 vmovdqu [%%GDATA + HashKey_8], %%T5
262 vpshufd %%T1, %%T5, 01001110b
263 vpxor %%T1, %%T5
264 vmovdqu [%%GDATA + HashKey_8_k], %%T1
265 %endmacro
266
267
268 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
269 ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
270 ; Returns 0 if data has length 0.
271 ; Input: The input data (INPUT), that data's length (LENGTH).
272 ; Output: The packed xmm register (OUTPUT).
273 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
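; Example of the packing below (illustrative): for LENGTH = 11, the first
; 8 bytes are inserted into the low quadword with vpinsrq; the remaining
; 3 bytes are read backwards from the end of the buffer, accumulated in a
; GPR and inserted into the high quadword, so the xmm register ends up with
; the 11 input bytes in order in its low lanes and zeroes above them.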
274 %macro READ_SMALL_DATA_INPUT 6
275 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register
276 %define %%INPUT %2
277 %define %%LENGTH %3
278 %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
279 %define %%COUNTER %5
280 %define %%TMP1 %6
281
282 vpxor %%OUTPUT, %%OUTPUT
283 mov %%COUNTER, %%LENGTH
284 mov %%END_READ_LOCATION, %%INPUT
285 add %%END_READ_LOCATION, %%LENGTH
286 xor %%TMP1, %%TMP1
287
288
289 cmp %%COUNTER, 8
290 jl %%_byte_loop_2
291 vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
292 je %%_done
293
294 sub %%COUNTER, 8
295
296 %%_byte_loop_1: ;Read in data 1 byte at a time while data is left
297 shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
298 dec %%END_READ_LOCATION
299 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
300 dec %%COUNTER
301 jg %%_byte_loop_1
302 vpinsrq %%OUTPUT, %%TMP1, 1
303 jmp %%_done
304
305 %%_byte_loop_2: ;Read in data 1 byte at a time while data is left
306 cmp %%COUNTER, 0
307 je %%_done
308 shl %%TMP1, 8 ;This loop handles when no bytes were already read in
309 dec %%END_READ_LOCATION
310 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
311 dec %%COUNTER
312 jg %%_byte_loop_2
313 vpinsrq %%OUTPUT, %%TMP1, 0
314 %%_done:
315
316 %endmacro ; READ_SMALL_DATA_INPUT
317
318
319 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
320 ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
321 ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
322 ; Output: The hash of the data (AAD_HASH).
323 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
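; Example (illustrative): for A_LEN = 20, the first 16 bytes are loaded,
; byte-reflected with SHUF_MASK, XORed into AAD_HASH and multiplied by
; HASH_KEY via GHASH_MUL; the trailing 4 bytes are zero-padded through
; READ_SMALL_DATA_INPUT and folded in the same way.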
324 %macro CALC_AAD_HASH 14
325 %define %%A_IN %1
326 %define %%A_LEN %2
327 %define %%AAD_HASH %3
328 %define %%HASH_KEY %4
329 %define %%XTMP1 %5 ; xmm temp reg 5
330 %define %%XTMP2 %6
331 %define %%XTMP3 %7
332 %define %%XTMP4 %8
333 %define %%XTMP5 %9 ; xmm temp reg 5
334 %define %%T1 %10 ; temp reg 1
335 %define %%T2 %11
336 %define %%T3 %12
337 %define %%T4 %13
338 %define %%T5 %14 ; temp reg 5
339
340
341 mov %%T1, %%A_IN ; T1 = AAD
342 mov %%T2, %%A_LEN ; T2 = aadLen
343 vpxor %%AAD_HASH, %%AAD_HASH
344
345 cmp %%T2, 16
346 jl %%_get_small_AAD_block
347
348 %%_get_AAD_loop16:
349
350 vmovdqu %%XTMP1, [%%T1]
351 ;byte-reflect the AAD data
352 vpshufb %%XTMP1, [SHUF_MASK]
353 vpxor %%AAD_HASH, %%XTMP1
354 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
355
356 sub %%T2, 16
357 je %%_CALC_AAD_done
358
359 add %%T1, 16
360 cmp %%T2, 16
361 jge %%_get_AAD_loop16
362
363 %%_get_small_AAD_block:
364 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
365 ;byte-reflect the AAD data
366 vpshufb %%XTMP1, [SHUF_MASK]
367 vpxor %%AAD_HASH, %%XTMP1
368 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
369
370 %%_CALC_AAD_done:
371
372 %endmacro ; CALC_AAD_HASH
373
374
375
376 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
377 ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
378 ; Requires the input data be at least 1 byte long.
379 ; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
380 ; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
381 ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
382 ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
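; Example (illustrative): if a previous update call left PBlockLen = 5, the
; saved E(K, Yn) in PBlockEncKey is reloaded and XORed against the new input;
; 11 further bytes complete the block, which is then folded into the GHASH
; state and PBlockLen is reset to 0. If fewer than 11 bytes are supplied,
; only PBlockLen is advanced and the GHASH of that block is deferred.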
384 %macro PARTIAL_BLOCK 7
385 %define %%GDATA %1
386 %define %%CYPH_PLAIN_OUT %2
387 %define %%PLAIN_CYPH_IN %3
388 %define %%PLAIN_CYPH_LEN %4
389 %define %%DATA_OFFSET %5
390 %define %%AAD_HASH %6
391 %define %%ENC_DEC %7
392 mov r13, [%%GDATA + PBlockLen]
393 cmp r13, 0
394 je %%_partial_block_done ;Leave Macro if no partial blocks
395
396 cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
397 jl %%_fewer_than_16_bytes
398 VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
399 jmp %%_data_read
400
401 %%_fewer_than_16_bytes:
402 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
403 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
404
405 %%_data_read: ;Finished reading in data
406
407
408 vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
409 vmovdqu xmm13, [%%GDATA + HashKey]
410
411 lea r12, [SHIFT_MASK]
412
413 cmp r13, rax
414 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
415 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
416 vpshufb xmm9, xmm2 ;shift right r13 bytes
417
418 %ifidn %%ENC_DEC, DEC
419 vmovdqa xmm3, xmm1
420 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
421
422 mov r15, %%PLAIN_CYPH_LEN
423 add r15, r13
424 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
425 jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift the mask accordingly
426 sub r12, r15
427 %%_no_extra_mask_1:
428
429 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
430 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
431
432 vpand xmm3, xmm1
433 vpshufb xmm3, [SHUF_MASK]
434 vpshufb xmm3, xmm2
435 vpxor %%AAD_HASH, xmm3
436
437
438 cmp r15,0
439 jl %%_partial_incomplete_1
440
441 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
442 xor rax,rax
443 mov [%%GDATA+PBlockLen], rax
444 jmp %%_dec_done
445 %%_partial_incomplete_1:
446 add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
447 %%_dec_done:
448 vmovdqu [%%GDATA + AadHash], %%AAD_HASH
449
450 %else
451 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
452
453 mov r15, %%PLAIN_CYPH_LEN
454 add r15, r13
455 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
456 jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift the mask accordingly
457 sub r12, r15
458 %%_no_extra_mask_2:
459
460 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
461 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
462
463 vpshufb xmm9, [SHUF_MASK]
464 vpshufb xmm9, xmm2
465 vpxor %%AAD_HASH, xmm9
466
467 cmp r15,0
468 jl %%_partial_incomplete_2
469
470 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
471 xor rax,rax
472 mov [%%GDATA+PBlockLen], rax
473 jmp %%_encode_done
474 %%_partial_incomplete_2:
475 add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
476 %%_encode_done:
477 vmovdqu [%%GDATA + AadHash], %%AAD_HASH
478
479 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
480 vpshufb xmm9, xmm2
481 %endif
482
483
484 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
485 ; output encrypted Bytes
486 cmp r15,0
487 jl %%_partial_fill
488 mov r12, r13
489 mov r13, 16
490 sub r13, r12 ; Set r13 to be the number of bytes to write out
491 jmp %%_count_set
492 %%_partial_fill:
493 mov r13, %%PLAIN_CYPH_LEN
494 %%_count_set:
495 vmovq rax, xmm9
496 cmp r13, 8
497 jle %%_less_than_8_bytes_left
498
499 mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
500 add %%DATA_OFFSET, 8
501 vpsrldq xmm9, xmm9, 8
502 vmovq rax, xmm9
503 sub r13, 8
504 %%_less_than_8_bytes_left:
505 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
506 add %%DATA_OFFSET, 1
507 shr rax, 8
508 sub r13, 1
509 jne %%_less_than_8_bytes_left
510 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
511 %%_partial_block_done:
512 %endmacro ; PARTIAL_BLOCK
513
514
515 ; if a = number of total plaintext bytes
516 ; b = floor(a/16)
517 ; %%num_initial_blocks = b mod 8;
518 ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
519 ; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
520 ; Updated AAD_HASH is returned in %%T3
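; Example (illustrative): for a = 200 plaintext bytes, b = floor(200/16) = 12,
; so %%num_initial_blocks = 12 mod 8 = 4; these 4 blocks are encrypted and
; GHASHed here, the remaining 8 full blocks go through the by-8 loop, and the
; final 8 bytes are handled as a partial block later.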
521
522 %macro INITIAL_BLOCKS 23
523 %define %%GDATA %1
524 %define %%CYPH_PLAIN_OUT %2
525 %define %%PLAIN_CYPH_IN %3
526 %define %%LENGTH %4
527 %define %%DATA_OFFSET %5
528 %define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
529 %define %%T1 %7
530 %define %%HASH_KEY %8
531 %define %%T3 %9
532 %define %%T4 %10
533 %define %%T5 %11
534 %define %%CTR %12
535 %define %%XMM1 %13
536 %define %%XMM2 %14
537 %define %%XMM3 %15
538 %define %%XMM4 %16
539 %define %%XMM5 %17
540 %define %%XMM6 %18
541 %define %%XMM7 %19
542 %define %%XMM8 %20
543 %define %%T6 %21
544 %define %%T_key %22
545 %define %%ENC_DEC %23
546
547 %assign i (8-%%num_initial_blocks)
548 movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
549 ; start AES for %%num_initial_blocks blocks
550 vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
551
552
553 %assign i (9-%%num_initial_blocks)
554 %rep %%num_initial_blocks
555 vpaddd %%CTR, [ONE] ; INCR Y0
556 vmovdqa reg(i), %%CTR
557 vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
558 %assign i (i+1)
559 %endrep
560
561 vmovdqu %%T_key, [%%GDATA+16*0]
562 %assign i (9-%%num_initial_blocks)
563 %rep %%num_initial_blocks
564 vpxor reg(i),%%T_key
565 %assign i (i+1)
566 %endrep
567
568 %assign j 1
569 %rep 9
570 vmovdqu %%T_key, [%%GDATA+16*j]
571 %assign i (9-%%num_initial_blocks)
572 %rep %%num_initial_blocks
573 vaesenc reg(i),%%T_key
574 %assign i (i+1)
575 %endrep
576
577 %assign j (j+1)
578 %endrep
579
580
581 vmovdqu %%T_key, [%%GDATA+16*10]
582 %assign i (9-%%num_initial_blocks)
583 %rep %%num_initial_blocks
584 vaesenclast reg(i),%%T_key
585 %assign i (i+1)
586 %endrep
587
588 %assign i (9-%%num_initial_blocks)
589 %rep %%num_initial_blocks
590 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
591 vpxor reg(i), %%T1
592 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
593 add %%DATA_OFFSET, 16
594 %ifidn %%ENC_DEC, DEC
595 vmovdqa reg(i), %%T1
596 %endif
597 vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
598 %assign i (i+1)
599 %endrep
600
601
602 %assign i (8-%%num_initial_blocks)
603 %assign j (9-%%num_initial_blocks)
604
605 %rep %%num_initial_blocks
606 vpxor reg(j), reg(i)
607 GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
608 %assign i (i+1)
609 %assign j (j+1)
610 %endrep
611 ; %%XMM8 has the current Hash Value
612 vmovdqa %%T3, %%XMM8
613
614 cmp %%LENGTH, 128
615 jl %%_initial_blocks_done ; no need for precomputed constants
616
617 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
618 ; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
619 vpaddd %%CTR, [ONE] ; INCR Y0
620 vmovdqa %%XMM1, %%CTR
621 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
622
623 vpaddd %%CTR, [ONE] ; INCR Y0
624 vmovdqa %%XMM2, %%CTR
625 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
626
627 vpaddd %%CTR, [ONE] ; INCR Y0
628 vmovdqa %%XMM3, %%CTR
629 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
630
631 vpaddd %%CTR, [ONE] ; INCR Y0
632 vmovdqa %%XMM4, %%CTR
633 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
634
635 vpaddd %%CTR, [ONE] ; INCR Y0
636 vmovdqa %%XMM5, %%CTR
637 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
638
639 vpaddd %%CTR, [ONE] ; INCR Y0
640 vmovdqa %%XMM6, %%CTR
641 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
642
643 vpaddd %%CTR, [ONE] ; INCR Y0
644 vmovdqa %%XMM7, %%CTR
645 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
646
647 vpaddd %%CTR, [ONE] ; INCR Y0
648 vmovdqa %%XMM8, %%CTR
649 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
650
651 vmovdqu %%T_key, [%%GDATA+16*0]
652 vpxor %%XMM1, %%T_key
653 vpxor %%XMM2, %%T_key
654 vpxor %%XMM3, %%T_key
655 vpxor %%XMM4, %%T_key
656 vpxor %%XMM5, %%T_key
657 vpxor %%XMM6, %%T_key
658 vpxor %%XMM7, %%T_key
659 vpxor %%XMM8, %%T_key
660
661
662 %assign i 1
663 %rep 9 ; do 9 rounds
664 vmovdqu %%T_key, [%%GDATA+16*i]
665 vaesenc %%XMM1, %%T_key
666 vaesenc %%XMM2, %%T_key
667 vaesenc %%XMM3, %%T_key
668 vaesenc %%XMM4, %%T_key
669 vaesenc %%XMM5, %%T_key
670 vaesenc %%XMM6, %%T_key
671 vaesenc %%XMM7, %%T_key
672 vaesenc %%XMM8, %%T_key
673 %assign i (i+1)
674 %endrep
675
676
677 vmovdqu %%T_key, [%%GDATA+16*i]
678 vaesenclast %%XMM1, %%T_key
679 vaesenclast %%XMM2, %%T_key
680 vaesenclast %%XMM3, %%T_key
681 vaesenclast %%XMM4, %%T_key
682 vaesenclast %%XMM5, %%T_key
683 vaesenclast %%XMM6, %%T_key
684 vaesenclast %%XMM7, %%T_key
685 vaesenclast %%XMM8, %%T_key
686
687 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
688 vpxor %%XMM1, %%T1
689 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
690 %ifidn %%ENC_DEC, DEC
691 vmovdqa %%XMM1, %%T1
692 %endif
693
694 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
695 vpxor %%XMM2, %%T1
696 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
697 %ifidn %%ENC_DEC, DEC
698 vmovdqa %%XMM2, %%T1
699 %endif
700
701 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
702 vpxor %%XMM3, %%T1
703 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
704 %ifidn %%ENC_DEC, DEC
705 vmovdqa %%XMM3, %%T1
706 %endif
707
708 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
709 vpxor %%XMM4, %%T1
710 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
711 %ifidn %%ENC_DEC, DEC
712 vmovdqa %%XMM4, %%T1
713 %endif
714
715 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
716 vpxor %%XMM5, %%T1
717 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
718 %ifidn %%ENC_DEC, DEC
719 vmovdqa %%XMM5, %%T1
720 %endif
721
722 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
723 vpxor %%XMM6, %%T1
724 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
725 %ifidn %%ENC_DEC, DEC
726 vmovdqa %%XMM6, %%T1
727 %endif
728
729 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
730 vpxor %%XMM7, %%T1
731 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
732 %ifidn %%ENC_DEC, DEC
733 vmovdqa %%XMM7, %%T1
734 %endif
735
736 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
737 vpxor %%XMM8, %%T1
738 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
739 %ifidn %%ENC_DEC, DEC
740 vmovdqa %%XMM8, %%T1
741 %endif
742
743 add %%DATA_OFFSET, 128
744
745 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
746 vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
747 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
748 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
749 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
750 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
751 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
752 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
753 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
754
755 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
756
757 %%_initial_blocks_done:
758
759
760 %endmacro
761
762
763 ; encrypt 8 blocks at a time
764 ; ghash the 8 previously encrypted ciphertext blocks
765 ; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
766 ; r11 is the data offset value
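; Note on the interleaving below (added for clarity): while the 8 fresh counter
; blocks step through the AES rounds, the 8 ciphertext blocks saved from the
; previous iteration are multiplied by HashKey_8 .. HashKey (Karatsuba halves
; accumulated in %%T4/%%T7/%%T6), so a single GF(2^128) reduction is paid per
; 8 blocks instead of one per block.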
767 %macro GHASH_8_ENCRYPT_8_PARALLEL 22
768 %define %%GDATA %1
769 %define %%CYPH_PLAIN_OUT %2
770 %define %%PLAIN_CYPH_IN %3
771 %define %%DATA_OFFSET %4
772 %define %%T1 %5
773 %define %%T2 %6
774 %define %%T3 %7
775 %define %%T4 %8
776 %define %%T5 %9
777 %define %%T6 %10
778 %define %%CTR %11
779 %define %%XMM1 %12
780 %define %%XMM2 %13
781 %define %%XMM3 %14
782 %define %%XMM4 %15
783 %define %%XMM5 %16
784 %define %%XMM6 %17
785 %define %%XMM7 %18
786 %define %%XMM8 %19
787 %define %%T7 %20
788 %define %%loop_idx %21
789 %define %%ENC_DEC %22
790
791 vmovdqa %%T2, %%XMM1
792 vmovdqu [rsp + TMP2], %%XMM2
793 vmovdqu [rsp + TMP3], %%XMM3
794 vmovdqu [rsp + TMP4], %%XMM4
795 vmovdqu [rsp + TMP5], %%XMM5
796 vmovdqu [rsp + TMP6], %%XMM6
797 vmovdqu [rsp + TMP7], %%XMM7
798 vmovdqu [rsp + TMP8], %%XMM8
799
800 %ifidn %%loop_idx, in_order
801 vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
802 vpaddd %%XMM2, %%XMM1, [ONE]
803 vpaddd %%XMM3, %%XMM2, [ONE]
804 vpaddd %%XMM4, %%XMM3, [ONE]
805 vpaddd %%XMM5, %%XMM4, [ONE]
806 vpaddd %%XMM6, %%XMM5, [ONE]
807 vpaddd %%XMM7, %%XMM6, [ONE]
808 vpaddd %%XMM8, %%XMM7, [ONE]
809 vmovdqa %%CTR, %%XMM8
810
811 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
812 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
813 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
814 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
815 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
816 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
817 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
818 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
819 %else
820 vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
821 vpaddd %%XMM2, %%XMM1, [ONEf]
822 vpaddd %%XMM3, %%XMM2, [ONEf]
823 vpaddd %%XMM4, %%XMM3, [ONEf]
824 vpaddd %%XMM5, %%XMM4, [ONEf]
825 vpaddd %%XMM6, %%XMM5, [ONEf]
826 vpaddd %%XMM7, %%XMM6, [ONEf]
827 vpaddd %%XMM8, %%XMM7, [ONEf]
828 vmovdqa %%CTR, %%XMM8
829 %endif
830
831
832
833 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
834
835 vmovdqu %%T1, [%%GDATA + 16*0]
836 vpxor %%XMM1, %%T1
837 vpxor %%XMM2, %%T1
838 vpxor %%XMM3, %%T1
839 vpxor %%XMM4, %%T1
840 vpxor %%XMM5, %%T1
841 vpxor %%XMM6, %%T1
842 vpxor %%XMM7, %%T1
843 vpxor %%XMM8, %%T1
844
845 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
846
847
848
849
850
851 vmovdqu %%T1, [%%GDATA + 16*1]
852 vaesenc %%XMM1, %%T1
853 vaesenc %%XMM2, %%T1
854 vaesenc %%XMM3, %%T1
855 vaesenc %%XMM4, %%T1
856 vaesenc %%XMM5, %%T1
857 vaesenc %%XMM6, %%T1
858 vaesenc %%XMM7, %%T1
859 vaesenc %%XMM8, %%T1
860
861
862 vmovdqu %%T1, [%%GDATA + 16*2]
863 vaesenc %%XMM1, %%T1
864 vaesenc %%XMM2, %%T1
865 vaesenc %%XMM3, %%T1
866 vaesenc %%XMM4, %%T1
867 vaesenc %%XMM5, %%T1
868 vaesenc %%XMM6, %%T1
869 vaesenc %%XMM7, %%T1
870 vaesenc %%XMM8, %%T1
871
872 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
873
874 vmovdqu %%T5, [%%GDATA + HashKey_8]
875 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
876 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
877
878 vpshufd %%T6, %%T2, 01001110b
879 vpxor %%T6, %%T2
880
881 vmovdqu %%T5, [%%GDATA + HashKey_8_k]
882 vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
883
884
885 vmovdqu %%T1, [%%GDATA + 16*3]
886 vaesenc %%XMM1, %%T1
887 vaesenc %%XMM2, %%T1
888 vaesenc %%XMM3, %%T1
889 vaesenc %%XMM4, %%T1
890 vaesenc %%XMM5, %%T1
891 vaesenc %%XMM6, %%T1
892 vaesenc %%XMM7, %%T1
893 vaesenc %%XMM8, %%T1
894
895 vmovdqu %%T1, [rsp + TMP2]
896 vmovdqu %%T5, [%%GDATA + HashKey_7]
897 vpclmulqdq %%T3, %%T1, %%T5, 0x11
898 vpxor %%T4, %%T4, %%T3
899 vpclmulqdq %%T3, %%T1, %%T5, 0x00
900 vpxor %%T7, %%T7, %%T3
901
902 vpshufd %%T3, %%T1, 01001110b
903 vpxor %%T3, %%T1
904 vmovdqu %%T5, [%%GDATA + HashKey_7_k]
905 vpclmulqdq %%T3, %%T3, %%T5, 0x10
906 vpxor %%T6, %%T6, %%T3
907
908 vmovdqu %%T1, [%%GDATA + 16*4]
909 vaesenc %%XMM1, %%T1
910 vaesenc %%XMM2, %%T1
911 vaesenc %%XMM3, %%T1
912 vaesenc %%XMM4, %%T1
913 vaesenc %%XMM5, %%T1
914 vaesenc %%XMM6, %%T1
915 vaesenc %%XMM7, %%T1
916 vaesenc %%XMM8, %%T1
917
918 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
919 vmovdqu %%T1, [rsp + TMP3]
920 vmovdqu %%T5, [%%GDATA + HashKey_6]
921 vpclmulqdq %%T3, %%T1, %%T5, 0x11
922 vpxor %%T4, %%T4, %%T3
923 vpclmulqdq %%T3, %%T1, %%T5, 0x00
924 vpxor %%T7, %%T7, %%T3
925
926 vpshufd %%T3, %%T1, 01001110b
927 vpxor %%T3, %%T1
928 vmovdqu %%T5, [%%GDATA + HashKey_6_k]
929 vpclmulqdq %%T3, %%T3, %%T5, 0x10
930 vpxor %%T6, %%T6, %%T3
931
932 vmovdqu %%T1, [%%GDATA + 16*5]
933 vaesenc %%XMM1, %%T1
934 vaesenc %%XMM2, %%T1
935 vaesenc %%XMM3, %%T1
936 vaesenc %%XMM4, %%T1
937 vaesenc %%XMM5, %%T1
938 vaesenc %%XMM6, %%T1
939 vaesenc %%XMM7, %%T1
940 vaesenc %%XMM8, %%T1
941
942
943 vmovdqu %%T1, [rsp + TMP4]
944 vmovdqu %%T5, [%%GDATA + HashKey_5]
945 vpclmulqdq %%T3, %%T1, %%T5, 0x11
946 vpxor %%T4, %%T4, %%T3
947 vpclmulqdq %%T3, %%T1, %%T5, 0x00
948 vpxor %%T7, %%T7, %%T3
949
950 vpshufd %%T3, %%T1, 01001110b
951 vpxor %%T3, %%T1
952 vmovdqu %%T5, [%%GDATA + HashKey_5_k]
953 vpclmulqdq %%T3, %%T3, %%T5, 0x10
954 vpxor %%T6, %%T6, %%T3
955
956 vmovdqu %%T1, [%%GDATA + 16*6]
957 vaesenc %%XMM1, %%T1
958 vaesenc %%XMM2, %%T1
959 vaesenc %%XMM3, %%T1
960 vaesenc %%XMM4, %%T1
961 vaesenc %%XMM5, %%T1
962 vaesenc %%XMM6, %%T1
963 vaesenc %%XMM7, %%T1
964 vaesenc %%XMM8, %%T1
965
966 vmovdqu %%T1, [rsp + TMP5]
967 vmovdqu %%T5, [%%GDATA + HashKey_4]
968 vpclmulqdq %%T3, %%T1, %%T5, 0x11
969 vpxor %%T4, %%T4, %%T3
970 vpclmulqdq %%T3, %%T1, %%T5, 0x00
971 vpxor %%T7, %%T7, %%T3
972
973 vpshufd %%T3, %%T1, 01001110b
974 vpxor %%T3, %%T1
975 vmovdqu %%T5, [%%GDATA + HashKey_4_k]
976 vpclmulqdq %%T3, %%T3, %%T5, 0x10
977 vpxor %%T6, %%T6, %%T3
978
979
980 vmovdqu %%T1, [%%GDATA + 16*7]
981 vaesenc %%XMM1, %%T1
982 vaesenc %%XMM2, %%T1
983 vaesenc %%XMM3, %%T1
984 vaesenc %%XMM4, %%T1
985 vaesenc %%XMM5, %%T1
986 vaesenc %%XMM6, %%T1
987 vaesenc %%XMM7, %%T1
988 vaesenc %%XMM8, %%T1
989
990 vmovdqu %%T1, [rsp + TMP6]
991 vmovdqu %%T5, [%%GDATA + HashKey_3]
992 vpclmulqdq %%T3, %%T1, %%T5, 0x11
993 vpxor %%T4, %%T4, %%T3
994 vpclmulqdq %%T3, %%T1, %%T5, 0x00
995 vpxor %%T7, %%T7, %%T3
996
997 vpshufd %%T3, %%T1, 01001110b
998 vpxor %%T3, %%T1
999 vmovdqu %%T5, [%%GDATA + HashKey_3_k]
1000 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1001 vpxor %%T6, %%T6, %%T3
1002
1003 vmovdqu %%T1, [%%GDATA + 16*8]
1004 vaesenc %%XMM1, %%T1
1005 vaesenc %%XMM2, %%T1
1006 vaesenc %%XMM3, %%T1
1007 vaesenc %%XMM4, %%T1
1008 vaesenc %%XMM5, %%T1
1009 vaesenc %%XMM6, %%T1
1010 vaesenc %%XMM7, %%T1
1011 vaesenc %%XMM8, %%T1
1012
1013 vmovdqu %%T1, [rsp + TMP7]
1014 vmovdqu %%T5, [%%GDATA + HashKey_2]
1015 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1016 vpxor %%T4, %%T4, %%T3
1017 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1018 vpxor %%T7, %%T7, %%T3
1019
1020 vpshufd %%T3, %%T1, 01001110b
1021 vpxor %%T3, %%T1
1022 vmovdqu %%T5, [%%GDATA + HashKey_2_k]
1023 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1024 vpxor %%T6, %%T6, %%T3
1025 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1026
1027 vmovdqu %%T5, [%%GDATA + 16*9]
1028 vaesenc %%XMM1, %%T5
1029 vaesenc %%XMM2, %%T5
1030 vaesenc %%XMM3, %%T5
1031 vaesenc %%XMM4, %%T5
1032 vaesenc %%XMM5, %%T5
1033 vaesenc %%XMM6, %%T5
1034 vaesenc %%XMM7, %%T5
1035 vaesenc %%XMM8, %%T5
1036
1037 vmovdqu %%T1, [rsp + TMP8]
1038 vmovdqu %%T5, [%%GDATA + HashKey]
1039 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1040 vpxor %%T4, %%T4, %%T3
1041 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1042 vpxor %%T7, %%T7, %%T3
1043
1044 vpshufd %%T3, %%T1, 01001110b
1045 vpxor %%T3, %%T1
1046 vmovdqu %%T5, [%%GDATA + HashKey_k]
1047 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1048 vpxor %%T6, %%T6, %%T3
1049
1050 vpxor %%T6, %%T4
1051 vpxor %%T6, %%T7
1052
1053
1054 vmovdqu %%T5, [%%GDATA + 16*10]
1055
1056 %assign i 0
1057 %assign j 1
1058 %rep 8
1059 %ifidn %%ENC_DEC, ENC
1060
1061 %ifdef NT_LD
1062 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1063 vpxor %%T2, %%T2, %%T5
1064 %else
1065 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1066 %endif
1067
1068 vaesenclast reg(j), reg(j), %%T2
1069
1070 %else
1071
1072 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1073 vpxor %%T2, %%T2, %%T5
1074 vaesenclast %%T3, reg(j), %%T2
1075 vpxor reg(j), %%T2, %%T5
1076 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1077
1078 %endif
1079
1080 %assign i (i+1)
1081 %assign j (j+1)
1082 %endrep
1083
1084 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
1085 vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
1086 vpxor %%T7, %%T3
1087 vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
1088
1089
1090 ;first phase of the reduction
1091
1092 vpslld %%T2, %%T7, 31 ; packed left shifting << 31
1093 vpslld %%T3, %%T7, 30 ; packed left shifting << 30
1094 vpslld %%T4, %%T7, 25 ; packed left shifting << 25
1095
1096 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
1097 vpxor %%T2, %%T2, %%T4
1098
1099 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
1100
1101 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
1102 vpxor %%T7, %%T2 ; first phase of the reduction complete
1103 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1104 %ifidn %%ENC_DEC, ENC
1105 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
1106 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
1107 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
1108 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
1109 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
1110 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
1111 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
1112 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
1113 %endif
1114
1115 ;second phase of the reduction
1116
1117 vpsrld %%T2,%%T7,1 ; packed right shifting >> 1
1118 vpsrld %%T3,%%T7,2 ; packed right shifting >> 2
1119 vpsrld %%T4,%%T7,7 ; packed right shifting >> 7
1120 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
1121 vpxor %%T2, %%T2,%%T4
1122
1123 vpxor %%T2, %%T2, %%T1
1124 vpxor %%T7, %%T7, %%T2
1125 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
1126
1127
1128
1129 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
1130 vpshufb %%XMM2, [SHUF_MASK]
1131 vpshufb %%XMM3, [SHUF_MASK]
1132 vpshufb %%XMM4, [SHUF_MASK]
1133 vpshufb %%XMM5, [SHUF_MASK]
1134 vpshufb %%XMM6, [SHUF_MASK]
1135 vpshufb %%XMM7, [SHUF_MASK]
1136 vpshufb %%XMM8, [SHUF_MASK]
1137
1138
1139 vpxor %%XMM1, %%T6
1140
1141 %endmacro
1142
1143
1144 ; GHASH the last 8 ciphertext blocks.
1145 %macro GHASH_LAST_8 16
1146 %define %%GDATA %1
1147 %define %%T1 %2
1148 %define %%T2 %3
1149 %define %%T3 %4
1150 %define %%T4 %5
1151 %define %%T5 %6
1152 %define %%T6 %7
1153 %define %%T7 %8
1154 %define %%XMM1 %9
1155 %define %%XMM2 %10
1156 %define %%XMM3 %11
1157 %define %%XMM4 %12
1158 %define %%XMM5 %13
1159 %define %%XMM6 %14
1160 %define %%XMM7 %15
1161 %define %%XMM8 %16
1162 ;; Karatsuba Method
1163
1164
1165 vpshufd %%T2, %%XMM1, 01001110b
1166 vpxor %%T2, %%XMM1
1167 vmovdqu %%T5, [%%GDATA + HashKey_8]
1168 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
1169 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
1170
1171 vmovdqu %%T3, [%%GDATA + HashKey_8_k]
1172 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
1173
1174
1175 ;;;;;;;;;;;;;;;;;;;;;;
1176
1177
1178 vpshufd %%T2, %%XMM2, 01001110b
1179 vpxor %%T2, %%XMM2
1180 vmovdqu %%T5, [%%GDATA + HashKey_7]
1181 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
1182 vpxor %%T6, %%T6, %%T4
1183
1184 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
1185 vpxor %%T7, %%T7, %%T4
1186
1187 vmovdqu %%T3, [%%GDATA + HashKey_7_k]
1188 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1189 vpxor %%XMM1, %%XMM1, %%T2
1190
1191 ;;;;;;;;;;;;;;;;;;;;;;
1192
1193
1194 vpshufd %%T2, %%XMM3, 01001110b
1195 vpxor %%T2, %%XMM3
1196 vmovdqu %%T5, [%%GDATA + HashKey_6]
1197 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
1198 vpxor %%T6, %%T6, %%T4
1199
1200 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
1201 vpxor %%T7, %%T7, %%T4
1202
1203 vmovdqu %%T3, [%%GDATA + HashKey_6_k]
1204 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1205 vpxor %%XMM1, %%XMM1, %%T2
1206
1207 ;;;;;;;;;;;;;;;;;;;;;;
1208
1209
1210 vpshufd %%T2, %%XMM4, 01001110b
1211 vpxor %%T2, %%XMM4
1212 vmovdqu %%T5, [%%GDATA + HashKey_5]
1213 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
1214 vpxor %%T6, %%T6, %%T4
1215
1216 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
1217 vpxor %%T7, %%T7, %%T4
1218
1219 vmovdqu %%T3, [%%GDATA + HashKey_5_k]
1220 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1221 vpxor %%XMM1, %%XMM1, %%T2
1222
1223 ;;;;;;;;;;;;;;;;;;;;;;
1224
1225 vpshufd %%T2, %%XMM5, 01001110b
1226 vpxor %%T2, %%XMM5
1227 vmovdqu %%T5, [%%GDATA + HashKey_4]
1228 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
1229 vpxor %%T6, %%T6, %%T4
1230
1231 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
1232 vpxor %%T7, %%T7, %%T4
1233
1234 vmovdqu %%T3, [%%GDATA + HashKey_4_k]
1235 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1236 vpxor %%XMM1, %%XMM1, %%T2
1237
1238 ;;;;;;;;;;;;;;;;;;;;;;
1239
1240 vpshufd %%T2, %%XMM6, 01001110b
1241 vpxor %%T2, %%XMM6
1242 vmovdqu %%T5, [%%GDATA + HashKey_3]
1243
1244 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
1245 vpxor %%T6, %%T6, %%T4
1246
1247 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
1248 vpxor %%T7, %%T7, %%T4
1249
1250 vmovdqu %%T3, [%%GDATA + HashKey_3_k]
1251 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1252 vpxor %%XMM1, %%XMM1, %%T2
1253
1254 ;;;;;;;;;;;;;;;;;;;;;;
1255
1256 vpshufd %%T2, %%XMM7, 01001110b
1257 vpxor %%T2, %%XMM7
1258 vmovdqu %%T5, [%%GDATA + HashKey_2]
1259 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
1260 vpxor %%T6, %%T6, %%T4
1261
1262 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
1263 vpxor %%T7, %%T7, %%T4
1264
1265 vmovdqu %%T3, [%%GDATA + HashKey_2_k]
1266 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1267 vpxor %%XMM1, %%XMM1, %%T2
1268
1269 ;;;;;;;;;;;;;;;;;;;;;;
1270
1271 vpshufd %%T2, %%XMM8, 01001110b
1272 vpxor %%T2, %%XMM8
1273 vmovdqu %%T5, [%%GDATA + HashKey]
1274 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
1275 vpxor %%T6, %%T6, %%T4
1276
1277 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
1278 vpxor %%T7, %%T7, %%T4
1279
1280 vmovdqu %%T3, [%%GDATA + HashKey_k]
1281 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1282
1283 vpxor %%XMM1, %%XMM1, %%T2
1284 vpxor %%XMM1, %%XMM1, %%T6
1285 vpxor %%T2, %%XMM1, %%T7
1286
1287
1288
1289
1290 vpslldq %%T4, %%T2, 8
1291 vpsrldq %%T2, %%T2, 8
1292
1293 vpxor %%T7, %%T4
1294 vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
1295
1296 ;first phase of the reduction
1297
1298 vpslld %%T2, %%T7, 31 ; packed left shifting << 31
1299 vpslld %%T3, %%T7, 30 ; packed left shifting << 30
1300 vpslld %%T4, %%T7, 25 ; packed left shifting << 25
1301
1302 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
1303 vpxor %%T2, %%T2, %%T4
1304
1305 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
1306
1307 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
1308 vpxor %%T7, %%T2 ; first phase of the reduction complete
1309 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1310
1311 ;second phase of the reduction
1312
1313 vpsrld %%T2,%%T7,1 ; packed right shifting >> 1
1314 vpsrld %%T3,%%T7,2 ; packed right shifting >> 2
1315 vpsrld %%T4,%%T7,7 ; packed right shifting >> 7
1316 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
1317 vpxor %%T2, %%T2,%%T4
1318
1319 vpxor %%T2, %%T2, %%T1
1320 vpxor %%T7, %%T7, %%T2
1321 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
1322
1323
1324 %endmacro
1325
1326
1327 ; Encryption of a single block
1328 %macro ENCRYPT_SINGLE_BLOCK 2
1329 %define %%GDATA %1
1330 %define %%XMM0 %2
1331
1332 vpxor %%XMM0, [%%GDATA+16*0]
1333 %assign i 1
1334 %rep 9
1335 vaesenc %%XMM0, [%%GDATA+16*i]
1336 %assign i (i+1)
1337 %endrep
1338 vaesenclast %%XMM0, [%%GDATA+16*10]
1339 %endmacro
1340
1341
1342 ;; Start of Stack Setup
1343
1344 %macro FUNC_SAVE 0
1345 ;; Required for Update/GMC_ENC
1346 ;the number of pushes times 8 must equal STACK_OFFSET
1347 push r12
1348 push r13
1349 push r14
1350 push r15
1351 mov r14, rsp
1352
1353 sub rsp, VARIABLE_OFFSET
1354 and rsp, ~63
1355
1356 %ifidn __OUTPUT_FORMAT__, win64
1357 ; xmm6:xmm15 need to be maintained for Windows
1358 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
1359 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
1360 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
1361 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
1362 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
1363 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
1364 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
1365 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
1366 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
1367 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
1368 %endif
1369 %endmacro
1370
1371
1372 %macro FUNC_RESTORE 0
1373
1374 %ifidn __OUTPUT_FORMAT__, win64
1375 vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
1376 vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
1377 vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
1378 vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
1379 vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
1380 vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
1381 vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
1382 vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
1383 vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
1384 vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
1385 %endif
1386
1387 ;; Required for Update/GMC_ENC
1388 mov rsp, r14
1389 pop r15
1390 pop r14
1391 pop r13
1392 pop r12
1393 %endmacro
1394
1395
1396 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1397 ; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
1398 ; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
1399 ; Data length (A_LEN)
1400 ; Output: Updated GDATA with the hash of A_IN (AadHash) and the other parts of GDATA initialized.
1401 ; Clobbers rax, r10-r13, and xmm0-xmm6
1402 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1403 %macro GCM_INIT 4
1404 %define %%GDATA %1
1405 %define %%IV %2
1406 %define %%A_IN %3
1407 %define %%A_LEN %4
1408 %define %%AAD_HASH xmm0
1409 %define %%SUBHASH xmm1
1410
1411
1412 vmovdqu %%SUBHASH, [%%GDATA + HashKey]
1413
1414 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
1415 vpxor xmm2, xmm3
1416 mov r10, %%A_LEN
1417
1418 vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
1419 mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
1420 xor r10, r10
1421 mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
1422 mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
1423 vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
1424 mov r10, %%IV
1425 vmovdqu xmm2, [r10]
1426 vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
1427
1428 vpshufb xmm2, [SHUF_MASK]
1429
1430 vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
1431 %endmacro
1432
1433
1434 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1435 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
1436 ; initialized by GCM_INIT
1437 ; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
1438 ; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
1439 ; and whether encoding or decoding (ENC_DEC)
1440 ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
1441 ; Clobbers rax, r10-r15, and xmm0-xmm15
1442 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1443 %macro GCM_ENC_DEC 5
1444 %define %%GDATA %1
1445 %define %%CYPH_PLAIN_OUT %2
1446 %define %%PLAIN_CYPH_IN %3
1447 %define %%PLAIN_CYPH_LEN %4
1448 %define %%ENC_DEC %5
1449 %define %%DATA_OFFSET r11
1450
1451 ; Macro flow:
1452 ; calculate the number of 16byte blocks in the message
1453 ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
1454 ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
1455 ; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
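; Example (illustrative, assuming no pending partial block): for
; PLAIN_CYPH_LEN = 100, r13 = 96 gives 6 full blocks, 6 mod 8 = 6 selects
; %%_initial_num_blocks_is_6, no by-8 iterations remain, and the trailing
; 4 bytes are encrypted in %%_zero_cipher_left with PBlockLen set to 4.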
1456 cmp %%PLAIN_CYPH_LEN, 0
1457 je %%_multiple_of_16_bytes
1458
1459 xor %%DATA_OFFSET, %%DATA_OFFSET
1460 add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
1461 vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
1462 vmovdqu xmm8, [%%GDATA + AadHash]
1463
1464
1465 PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
1466
1467
1468 mov r13, %%PLAIN_CYPH_LEN
1469 sub r13, %%DATA_OFFSET
1470 mov r10, r13 ; save the amount of data left to process in r10
1471 and r13, -16 ; r13 = r13 - (r13 mod 16)
1472
1473 mov r12, r13
1474 shr r12, 4
1475 and r12, 7
1476
1477 jz %%_initial_num_blocks_is_0
1478
1479 cmp r12, 7
1480 je %%_initial_num_blocks_is_7
1481 cmp r12, 6
1482 je %%_initial_num_blocks_is_6
1483 cmp r12, 5
1484 je %%_initial_num_blocks_is_5
1485 cmp r12, 4
1486 je %%_initial_num_blocks_is_4
1487 cmp r12, 3
1488 je %%_initial_num_blocks_is_3
1489 cmp r12, 2
1490 je %%_initial_num_blocks_is_2
1491
1492 jmp %%_initial_num_blocks_is_1
1493
1494 %%_initial_num_blocks_is_7:
1495 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1496 sub r13, 16*7
1497 jmp %%_initial_blocks_encrypted
1498
1499 %%_initial_num_blocks_is_6:
1500 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1501 sub r13, 16*6
1502 jmp %%_initial_blocks_encrypted
1503
1504 %%_initial_num_blocks_is_5:
1505 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1506 sub r13, 16*5
1507 jmp %%_initial_blocks_encrypted
1508
1509 %%_initial_num_blocks_is_4:
1510 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1511 sub r13, 16*4
1512 jmp %%_initial_blocks_encrypted
1513
1514
1515 %%_initial_num_blocks_is_3:
1516 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1517 sub r13, 16*3
1518 jmp %%_initial_blocks_encrypted
1519 %%_initial_num_blocks_is_2:
1520 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1521 sub r13, 16*2
1522 jmp %%_initial_blocks_encrypted
1523
1524 %%_initial_num_blocks_is_1:
1525 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1526 sub r13, 16
1527 jmp %%_initial_blocks_encrypted
1528
1529 %%_initial_num_blocks_is_0:
1530 INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1531
1532
1533 %%_initial_blocks_encrypted:
1534 cmp r13, 0
1535 je %%_zero_cipher_left
1536
1537 sub r13, 128
1538 je %%_eight_cipher_left
1539
1540
1541
1542
1543 vmovd r15d, xmm9
1544 and r15d, 255
1545 vpshufb xmm9, [SHUF_MASK]
1546
1547
1548 %%_encrypt_by_8_new:
1549 cmp r15d, 255-8
1550 jg %%_encrypt_by_8
1551
1552
1553
1554 add r15b, 8
1555 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
1556 add %%DATA_OFFSET, 128
1557 sub r13, 128
1558 jne %%_encrypt_by_8_new
1559
1560 vpshufb xmm9, [SHUF_MASK]
1561 jmp %%_eight_cipher_left
1562
1563 %%_encrypt_by_8:
1564 vpshufb xmm9, [SHUF_MASK]
1565 add r15b, 8
1566 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
1567 vpshufb xmm9, [SHUF_MASK]
1568 add %%DATA_OFFSET, 128
1569 sub r13, 128
1570 jne %%_encrypt_by_8_new
1571
1572 vpshufb xmm9, [SHUF_MASK]
1573
1574
1575
1576
1577 %%_eight_cipher_left:
1578 GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
1579
1580
1581 %%_zero_cipher_left:
1582 vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
1583 vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
1584
1585 mov r13, r10
1586 and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
1587
1588 je %%_multiple_of_16_bytes
1589
1590 mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_block_length = r13
1591 ; handle the last <16 Byte block separately
1592
1593 vpaddd xmm9, [ONE] ; INCR CNT to get Yn
1594 vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
1595 vpshufb xmm9, [SHUF_MASK]
1596 ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
1597 vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
1598
1599 cmp %%PLAIN_CYPH_LEN, 16
1600 jge %%_large_enough_update
1601
1602 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1603 READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
1604 lea r12, [SHIFT_MASK + 16]
1605 sub r12, r13
1606 jmp %%_data_read
1607
1608 %%_large_enough_update:
1609 sub %%DATA_OFFSET, 16
1610 add %%DATA_OFFSET, r13
1611
1612 vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
1613
1614 sub %%DATA_OFFSET, r13
1615 add %%DATA_OFFSET, 16
1616
1617
1618 lea r12, [SHIFT_MASK + 16]
1619 sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
1620
1621 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
1622 vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
1623 %%_data_read:
1624 %ifidn %%ENC_DEC, DEC
1625 vmovdqa xmm2, xmm1
1626 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1627 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1628 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1629 vpand xmm2, xmm1
1630 vpshufb xmm2, [SHUF_MASK]
1631 vpxor xmm14, xmm2
1632 vmovdqu [%%GDATA + AadHash], xmm14
1633
1634 %else
1635 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1636 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1637 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1638 vpshufb xmm9, [SHUF_MASK]
1639 vpxor xmm14, xmm9
1640 vmovdqu [%%GDATA + AadHash], xmm14
1641
1642 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
1643 %endif
1644
1645 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1646 ; output r13 Bytes
1647 vmovq rax, xmm9
1648 cmp r13, 8
1649 jle %%_less_than_8_bytes_left
1650
1651 mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
1652 add %%DATA_OFFSET, 8
1653 vpsrldq xmm9, xmm9, 8
1654 vmovq rax, xmm9
1655 sub r13, 8
1656
1657 %%_less_than_8_bytes_left:
1658 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
1659 add %%DATA_OFFSET, 1
1660 shr rax, 8
1661 sub r13, 1
1662 jne %%_less_than_8_bytes_left
1663 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1664
1665 %%_multiple_of_16_bytes:
1666
1667
1668
1669 %endmacro
1670
1671
1672 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1673 ; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
1674 ; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
1675 ; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
1676 ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
1677 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1678 %macro GCM_COMPLETE 4
1679 %define %%GDATA %1
1680 %define %%AUTH_TAG %2
1681 %define %%AUTH_TAG_LEN %3
1682 %define %%ENC_DEC %4
1683 %define %%PLAIN_CYPH_LEN rax
1684
1685 mov r12, [%%GDATA + PBlockLen]
1686 vmovdqu xmm14, [%%GDATA+AadHash]
1687 vmovdqu xmm13, [%%GDATA+HashKey]
1688
1689 cmp r12, 0
1690
1691 je %%_partial_done
1692
1693 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
1694 vmovdqu [%%GDATA+AadHash], xmm14
1695
1696 %%_partial_done:
1697
1698 mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
1699 mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
1700
1701 shl r12, 3 ; convert into number of bits
1702 vmovd xmm15, r12d ; len(A) in xmm15
1703
1704 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128)
1705 vmovq xmm1, %%PLAIN_CYPH_LEN
1706 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
1707 vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
1708
1709 vpxor xmm14, xmm15
1710 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
1711 vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
1712
1713 vmovdqu xmm9, [%%GDATA+OrigIV] ; xmm9 = Y0
1714
1715 ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
1716
1717 vpxor xmm9, xmm14
1718
1719
1720 %%_return_T:
1721 mov r10, %%AUTH_TAG ; r10 = authTag
1722 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
1723
1724 cmp r11, 16
1725 je %%_T_16
1726
1727 cmp r11, 12
1728 je %%_T_12
1729
1730 %%_T_8:
1731 vmovq rax, xmm9
1732 mov [r10], rax
1733 jmp %%_return_T_done
1734 %%_T_12:
1735 vmovq rax, xmm9
1736 mov [r10], rax
1737 vpsrldq xmm9, xmm9, 8
1738 vmovd eax, xmm9
1739 mov [r10 + 8], eax
1740 jmp %%_return_T_done
1741
1742 %%_T_16:
1743 vmovdqu [r10], xmm9
1744
1745 %%_return_T_done:
1746 %endmacro ; GCM_COMPLETE
1747
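;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Typical call sequence (an illustrative sketch, not part of this file's API
; definition; it assumes the AES round keys have already been expanded into
; the gcm_data struct, which is done outside this file):
;
;       gcm_data gdata;                                 /* round keys set up */
;       aesni_gcm128_precomp_avx_gen2(&gdata);          /* HashKey powers    */
;       aesni_gcm128_init_avx_gen2(&gdata, iv, aad, aad_len);
;       aesni_gcm128_enc_update_avx_gen2(&gdata, out, in, plaintext_len);
;       /* ... more *_enc_update_* calls, then the enc_finalize routine      */
;       /* below to produce the authentication tag.                          */
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;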
1748
1749 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1750 ;void aesni_gcm128_precomp_avx_gen2
1751 ; (gcm_data *my_ctx_data);
1752 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1753 global aesni_gcm128_precomp_avx_gen2
1754 aesni_gcm128_precomp_avx_gen2:
1755 push r12
1756 push r13
1757 push r14
1758 push r15
1759
1760 mov r14, rsp
1761
1762
1763
1764 sub rsp, VARIABLE_OFFSET
1765 and rsp, ~63 ; align rsp to 64 bytes
1766
1767 %ifidn __OUTPUT_FORMAT__, win64
1768 ; only xmm6 needs to be maintained
1769 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
1770 %endif
1771
1772 vpxor xmm6, xmm6
1773 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
1774
1775 vpshufb xmm6, [SHUF_MASK]
1776 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
1777 vmovdqa xmm2, xmm6 ; keep a copy of H for the carried-out bits
1778 vpsllq xmm6, 1 ; shift each qword of H left by 1
1779 vpsrlq xmm2, 63 ; xmm2 = the bit shifted out of each qword
1780 vmovdqa xmm1, xmm2
1781 vpslldq xmm2, xmm2, 8 ; carry the low-qword bit into the high qword
1782 vpsrldq xmm1, xmm1, 8 ; xmm1 = the bit shifted out of the top of H (its MSB)
1783 vpor xmm6, xmm2 ; xmm6 = H << 1 (mod 2^128)
1784 ;reduction: xor in POLY iff the MSB of H was set
1785 vpshufd xmm2, xmm1, 00100100b ; position that bit for the compare below
1786 vpcmpeqd xmm2, [TWOONE] ; build a mask from it
1787 vpand xmm2, [POLY] ; xmm2 = POLY if the MSB of H was set, else 0
1788 vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
1789 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1790 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
1791
1792
1793 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
1794
1795 %ifidn __OUTPUT_FORMAT__, win64
1796 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
1797 %endif
1798 mov rsp, r14
1799
1800 pop r15
1801 pop r14
1802 pop r13
1803 pop r12
1804 ret
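;
; Background for the precomputation above (a recap of the shift-based method referenced in
; the file header, not new behaviour): the GHASH key is H = E(K, 0^128), the AES encryption
; of the all-zero block. What is stored in HashKey is H shifted left by one bit and reduced:
;       HashKey = (H << 1) XOR (POLY if the most significant bit of H was set)
; i.e. H multiplied by x in the (bit-reflected) GHASH field with reduction polynomial
;       g(x) = x^128 + x^7 + x^2 + x + 1
; which lets GHASH_MUL use the shift-based reduction without reflecting operands on every
; multiplication. PRECOMPUTE then derives further key-dependent constants from this value.
;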
1805
1806 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1807 ;void aesni_gcm128_init_avx_gen2(
1808 ; gcm_data *my_ctx_data,
1809 ; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
1810 ; const u8 *aad, /* Additional Authentication Data (AAD)*/
1811 ; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
1812 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1813 global aesni_gcm128_init_avx_gen2
1814 aesni_gcm128_init_avx_gen2:
1815
1816 push r12
1817 push r13
1818
1819 %ifidn __OUTPUT_FORMAT__, win64
1820 ; xmm6:xmm15 are callee-saved on Windows; only xmm6 is clobbered here, so only it is saved
1821 sub rsp, 1*16
1822 vmovdqu [rsp + 0*16],xmm6
1823 %endif
1824
1825 GCM_INIT arg1, arg2, arg3, arg4
1826
1827 %ifidn __OUTPUT_FORMAT__, win64
1828 vmovdqu xmm6 , [rsp + 0*16]
1829 add rsp, 1*16
1830 %endif
1831 pop r13
1832 pop r12
1833 ret
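;
; For reference only: building the 16-byte pre-counter block passed as 'iv' above, sketched
; in C. The helper name is illustrative and not part of this file.
;
;   #include <stdint.h>
;   #include <string.h>
;
;   /* j0 = 4-byte salt || 8-byte IV || 0x00000001 (last word big-endian), 16 bytes */
;   static void build_j0(uint8_t j0[16], const uint8_t salt[4], const uint8_t iv[8])
;   {
;           memcpy(j0, salt, 4);
;           memcpy(j0 + 4, iv, 8);
;           j0[12] = 0x00; j0[13] = 0x00; j0[14] = 0x00; j0[15] = 0x01;
;   }
;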
1834
1835
1836 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1837 ;void aesni_gcm128_enc_update_avx_gen2(
1838 ; gcm_data *my_ctx_data,
1839 ; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1840 ; const u8 *in, /* Plaintext input */
1841 ; u64 plaintext_len); /* Length of data in Bytes for encryption. Must be a multiple of 16 bytes. */
1842 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1843 global aesni_gcm128_enc_update_avx_gen2
1844 aesni_gcm128_enc_update_avx_gen2:
1845
1846 FUNC_SAVE
1847
1848 GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
1849
1850 FUNC_RESTORE
1851
1852 ret
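;
; For reference only: a hedged sketch of streaming encryption with the entry points declared
; in this file. The gcm_data definition and the j0/aad/tag buffers come from the caller and
; the library header; chunk sizes follow the documented multiple-of-16-bytes requirement.
;
;   gcm_data ctx;                                           /* from the library header      */
;   aesni_gcm128_precomp_avx_gen2(&ctx);                    /* after the AES key schedule   */
;                                                           /* has been loaded into ctx     */
;   aesni_gcm128_init_avx_gen2(&ctx, j0, aad, aad_len);
;   aesni_gcm128_enc_update_avx_gen2(&ctx, out,         in,         16 * n1);
;   aesni_gcm128_enc_update_avx_gen2(&ctx, out + 16*n1, in + 16*n1, 16 * n2);
;   aesni_gcm128_enc_finalize_avx_gen2(&ctx, tag, 16);
;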
1853
1854
1855 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1856 ;void aesni_gcm128_dec_update_avx_gen2(
1857 ; gcm_data *my_ctx_data,
1858 ; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1859 ; const u8 *in, /* Ciphertext input */
1860 ; u64 plaintext_len); /* Length of data in Bytes for decryption. Must be a multiple of 16 bytes. */
1861 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1862 global aesni_gcm128_dec_update_avx_gen2
1863 aesni_gcm128_dec_update_avx_gen2:
1864
1865 FUNC_SAVE
1866
1867 GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
1868
1869 FUNC_RESTORE
1870
1871 ret
1872
1873
1874 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1875 ;void aesni_gcm128_enc_finalize_avx_gen2(
1876 ; gcm_data *my_ctx_data,
1877 ; u8 *auth_tag, /* Authenticated Tag output. */
1878 ; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
1879 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1880 global aesni_gcm128_enc_finalize_avx_gen2
1881 aesni_gcm128_enc_finalize_avx_gen2:
1882
1883 push r12
1884
1885 %ifidn __OUTPUT_FORMAT__, win64
1886 ; xmm6:xmm15 are callee-saved on Windows; save the ones GCM_COMPLETE clobbers
1887 sub rsp, 5*16
1888 vmovdqu [rsp + 0*16],xmm6
1889 vmovdqu [rsp + 1*16],xmm9
1890 vmovdqu [rsp + 2*16],xmm11
1891 vmovdqu [rsp + 3*16],xmm14
1892 vmovdqu [rsp + 4*16],xmm15
1893 %endif
1894 GCM_COMPLETE arg1, arg2, arg3, ENC
1895
1896 %ifidn __OUTPUT_FORMAT__, win64
1897 vmovdqu xmm15 , [rsp + 4*16]
1898 vmovdqu xmm14 , [rsp + 3*16]
1899 vmovdqu xmm11 , [rsp + 2*16]
1900 vmovdqu xmm9 , [rsp + 1*16]
1901 vmovdqu xmm6 , [rsp + 0*16]
1902 add rsp, 5*16
1903 %endif
1904
1905 pop r12
1906 ret
1907
1908
1909 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1910 ;void aesni_gcm128_dec_finalize_avx_gen2(
1911 ; gcm_data *my_ctx_data,
1912 ; u8 *auth_tag, /* Authenticated Tag output. */
1913 ; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
1914 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1915 global aesni_gcm128_dec_finalize_avx_gen2
1916 aesni_gcm128_dec_finalize_avx_gen2:
1917
1918 push r12
1919
1920 %ifidn __OUTPUT_FORMAT__, win64
1921 ; xmm6:xmm15 are callee-saved on Windows; save the ones GCM_COMPLETE clobbers
1922 sub rsp, 5*16
1923 vmovdqu [rsp + 0*16],xmm6
1924 vmovdqu [rsp + 1*16],xmm9
1925 vmovdqu [rsp + 2*16],xmm11
1926 vmovdqu [rsp + 3*16],xmm14
1927 vmovdqu [rsp + 4*16],xmm15
1928 %endif
1929 GCM_COMPLETE arg1, arg2, arg3, DEC
1930
1931 %ifidn __OUTPUT_FORMAT__, win64
1932 vmovdqu xmm15 , [rsp + 4*16]
1933 vmovdqu xmm14 , [rsp + 3*16]
1934 vmovdqu xmm11 , [rsp + 2*16]
1935 vmovdqu xmm9 , [rsp + 1*16]
1936 vmovdqu xmm6 , [rsp + 0*16]
1937 add rsp, 5*16
1938 %endif
1939
1940 pop r12
1941 ret
1942
1943
1944 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1945 ;void aesni_gcm128_enc_avx_gen2(
1946 ; gcm_data *my_ctx_data,
1947 ; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1948 ; const u8 *in, /* Plaintext input */
1949 ; u64 plaintext_len, /* Length of data in Bytes for encryption. */
1950 ; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
1951 ; const u8 *aad, /* Additional Authentication Data (AAD)*/
1952 ; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
1953 ; u8 *auth_tag, /* Authenticated Tag output. */
1954 ; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
1955 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1956 global aesni_gcm128_enc_avx_gen2
1957 aesni_gcm128_enc_avx_gen2:
1958
1959 FUNC_SAVE
1960
1961 GCM_INIT arg1, arg5, arg6, arg7
1962
1963 GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
1964
1965 GCM_COMPLETE arg1, arg8, arg9, ENC
1966
1967 FUNC_RESTORE
1968
1969 ret
1970
1971 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1972 ;void aesni_gcm128_dec_avx_gen2(
1973 ; gcm_data *my_ctx_data,
1974 ; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1975 ; const u8 *in, /* Ciphertext input */
1976 ; u64 plaintext_len, /* Length of data in Bytes for decryption. */
1977 ; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
1978 ; const u8 *aad, /* Additional Authentication Data (AAD)*/
1979 ; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
1980 ; u8 *auth_tag, /* Authenticated Tag output. */
1981 ; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
1982 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1983 global aesni_gcm128_dec_avx_gen2
1984 aesni_gcm128_dec_avx_gen2:
1985
1986 FUNC_SAVE
1987
1988 GCM_INIT arg1, arg5, arg6, arg7
1989
1990 GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
1991
1992 GCM_COMPLETE arg1, arg8, arg9, DEC
1993
1994 FUNC_RESTORE
1995
1996 ret
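;
; For reference only: a hedged sketch of a one-shot decrypt with caller-side tag verification.
; The dec entry points output the computed tag; checking it against the received tag is left
; to the caller. ct_compare stands for any constant-time comparison and is not part of this
; file.
;
;   uint8_t tag[16];
;   aesni_gcm128_dec_avx_gen2(&ctx, pt, ct, ct_len, j0, aad, aad_len, tag, sizeof(tag));
;   if (ct_compare(tag, received_tag, sizeof(tag)) != 0) {
;           /* authentication failed: the plaintext in pt must not be used */
;   }
;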