;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
;
; References:
;       This code was derived and highly optimized from the code described in the paper:
;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
;
;       For the shift-based reductions used in this code, we used the method described in the paper:
;               Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
;
;
;
;
; Assumptions:
;
;
;
; iv:
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                       Salt  (From the SA)                     |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                     Initialization Vector                     |
;       |        (This is the sequence number from IPSec header)        |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x1                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;
;
; AAD:
;       AAD will be padded with 0 to the next 16-byte multiple
;       for example, assume AAD is a u32 vector
;
;       if AAD is 8 bytes:
;       AAD[2] = {A0, A1};
;       padded AAD in xmm register = {A1 A0 0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A1)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                  32-bit Sequence Number (A0)                  |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 32-bit Sequence Number
;
;       if AAD is 12 bytes:
;       AAD[3] = {A0, A1, A2};
;       padded AAD in xmm register = {A2 A1 A0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A2)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |            64-bit Extended Sequence Number {A1,A0}            |
;       |                                                               |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 64-bit Extended Sequence Number
;
;
; aadLen:
;       From the definition of the spec, aadLen must be a multiple of 4 bytes.
;       The code additionally supports any aadLen length.
;
; TLen:
;       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
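;       Note: all GHASH operands in this file are bit-reflected, so the
;       standard GCM polynomial x^128 + x^7 + x^2 + x + 1 appears above
;       in its reflected form.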
; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
;

%include "os.asm"
%include "reg_sizes.asm"
%include "gcm_defines.asm"
%include "memcpy.asm"

%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_sse.asm!"
%endif
%endif
%endif

%ifdef NO_AESNI
%define SSE sse_no_aesni
%else
%define SSE sse
%endif

%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ SSE
%define NROUNDS 9
%endif

%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ SSE
%define NROUNDS 11
%endif

%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ SSE
%define NROUNDS 13
%endif
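
; For example, with GCM128_MODE defined, FN_NAME(enc,_) expands to
; aes_gcm_enc_128_sse and FN_NAME(enc,_update_) to aes_gcm_enc_128_update_sse
; (the *_sse_no_aesni variants are produced when NO_AESNI is defined).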

default rel
; 4 registers are pushed onto the stack in FUNC_SAVE; STACK_OFFSET accounts for them
%define STACK_OFFSET    8*4

%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define TMP3    16*1    ; Temporary storage for AES State 3
%define TMP4    16*2    ; Temporary storage for AES State 4
%define TMP5    16*3    ; Temporary storage for AES State 5
%define TMP6    16*4    ; Temporary storage for AES State 6
%define TMP7    16*5    ; Temporary storage for AES State 7
%define TMP8    16*6    ; Temporary storage for AES State 8

%define LOCAL_STORAGE   16*7

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE     16*10
%else
        %define XMM_STORAGE     0
%endif

%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
; GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
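; A worked view of the Karatsuba split used below: with A = a1:a0 and
; B = b1:b0 (64-bit halves), three pclmulqdq suffice because
;       A*B = a1*b1*x^128 + (a1*b1 + a0*b0 + (a1+a0)*(b1+b0))*x^64 + a0*b0
; where '+' and '*' are carry-less (GF(2)) operations.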
%macro GHASH_MUL 7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; Karatsuba Method
        movdqa  %%T1, %%GH
        pshufd  %%T2, %%GH, 01001110b
        pshufd  %%T3, %%HK, 01001110b
        pxor    %%T2, %%GH              ; %%T2 = (a1+a0)
        pxor    %%T3, %%HK              ; %%T3 = (b1+b0)

        pclmulqdq       %%T1, %%HK, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%GH, %%HK, 0x00        ; %%GH = a0*b0
        pclmulqdq       %%T2, %%T3, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T2, %%GH
        pxor    %%T2, %%T1              ; %%T2 = a0*b1+a1*b0

        movdqa  %%T3, %%T2
        pslldq  %%T3, 8                 ; shift-L %%T3 2 DWs
        psrldq  %%T2, 8                 ; shift-R %%T2 2 DWs
        pxor    %%GH, %%T3
        pxor    %%T1, %%T2              ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK


        ;first phase of the reduction
        movdqa  %%T2, %%GH
        movdqa  %%T3, %%GH
        movdqa  %%T4, %%GH              ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                ; packed left shift << 31
        pslld   %%T3, 30                ; packed left shift << 30
        pslld   %%T4, 25                ; packed left shift << 25
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T5, %%T2
        psrldq  %%T5, 4                 ; shift-R %%T5 1 DW

        pslldq  %%T2, 12                ; shift-L %%T2 3 DWs
        pxor    %%GH, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2, %%GH              ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3, %%GH
        movdqa  %%T4, %%GH

        psrld   %%T2, 1                 ; packed right shift >> 1
        psrld   %%T3, 2                 ; packed right shift >> 2
        psrld   %%T4, 7                 ; packed right shift >> 7
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T4

        pxor    %%T2, %%T5
        pxor    %%GH, %%T2
        pxor    %%GH, %%T1              ; the result is in %%GH


%endmacro


%macro PRECOMPUTE 8
%define %%GDATA %1
%define %%HK    %2
%define %%T1    %3
%define %%T2    %4
%define %%T3    %5
%define %%T4    %6
%define %%T5    %7
%define %%T6    %8


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
        movdqa  %%T4, %%HK
        pshufd  %%T1, %%HK, 01001110b
        pxor    %%T1, %%HK
        movdqu  [%%GDATA + HashKey_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^2<<1 mod poly
        movdqu  [%%GDATA + HashKey_2], %%T4                    ; [HashKey_2] = HashKey^2<<1 mod poly
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_2_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^3<<1 mod poly
        movdqu  [%%GDATA + HashKey_3], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_3_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^4<<1 mod poly
        movdqu  [%%GDATA + HashKey_4], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_4_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^5<<1 mod poly
        movdqu  [%%GDATA + HashKey_5], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_5_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^6<<1 mod poly
        movdqu  [%%GDATA + HashKey_6], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_6_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^7<<1 mod poly
        movdqu  [%%GDATA + HashKey_7], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_7_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ; %%T4 = HashKey^8<<1 mod poly
        movdqu  [%%GDATA + HashKey_8], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_8_k], %%T1


%endmacro
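
; After PRECOMPUTE the key structure holds HashKey^i<<1 mod poly in HashKey_i
; (i = 1..8) together with the folded HashKey_i_k values, so eight ciphertext
; blocks C1..C8 (with the running hash folded into C1) can be hashed in one
; pass as C1*H^8 + C2*H^7 + ... + C8*H, which is what CALC_AAD_HASH,
; GHASH_8_ENCRYPT_8_PARALLEL and GHASH_LAST_8 rely on.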


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
; Returns 0 if data has length 0.
; Input: The input data (INPUT), that data's length (LENGTH).
; Output: The packed xmm register (OUTPUT).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
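; For example, with LENGTH = 11 the low 8 bytes are loaded with a single
; pinsrq and the remaining 3 bytes are gathered back-to-front by the byte
; loop into the upper quadword; all unread byte positions stay zero.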
%macro READ_SMALL_DATA_INPUT    6
%define %%OUTPUT                %1 ; %%OUTPUT is an xmm register
%define %%INPUT                 %2
%define %%LENGTH                %3
%define %%END_READ_LOCATION     %4 ; All this and the lower inputs are temp registers
%define %%COUNTER               %5
%define %%TMP1                  %6

        pxor    %%OUTPUT, %%OUTPUT
        mov     %%COUNTER, %%LENGTH
        mov     %%END_READ_LOCATION, %%INPUT
        add     %%END_READ_LOCATION, %%LENGTH
        xor     %%TMP1, %%TMP1


        cmp     %%COUNTER, 8
        jl      %%_byte_loop_2
        pinsrq  %%OUTPUT, [%%INPUT], 0          ; Read in 8 bytes if they exist
        je      %%_done

        sub     %%COUNTER, 8

%%_byte_loop_1:                                 ; Read in data 1 byte at a time while data is left
        shl     %%TMP1, 8                       ; This loop handles when 8 bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_1
        pinsrq  %%OUTPUT, %%TMP1, 1
        jmp     %%_done

%%_byte_loop_2:                                 ; Read in data 1 byte at a time while data is left
        cmp     %%COUNTER, 0
        je      %%_done
        shl     %%TMP1, 8                       ; This loop handles when no bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_2
        pinsrq  %%OUTPUT, %%TMP1, 0
%%_done:

%endmacro ; READ_SMALL_DATA_INPUT


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
; Output: The hash of the data (AAD_HASH).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
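; Note: unlike GHASH_MUL above, this macro reduces with pclmulqdq against the
; POLY2 constant rather than the shift-based method, and it folds up to eight
; AAD blocks per iteration using the precomputed HashKey_1..HashKey_8 powers.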
%macro  CALC_AAD_HASH   15
%define %%A_IN          %1
%define %%A_LEN         %2
%define %%AAD_HASH      %3
%define %%GDATA_KEY     %4
%define %%XTMP0         %5      ; xmm temp reg 0
%define %%XTMP1         %6      ; xmm temp reg 1
%define %%XTMP2         %7
%define %%XTMP3         %8
%define %%XTMP4         %9
%define %%XTMP5         %10     ; xmm temp reg 5
%define %%T1            %11     ; temp reg 1
%define %%T2            %12
%define %%T3            %13
%define %%T4            %14
%define %%T5            %15     ; temp reg 5


        mov     %%T1, %%A_IN            ; T1 = AAD
        mov     %%T2, %%A_LEN           ; T2 = aadLen
        pxor    %%AAD_HASH, %%AAD_HASH

%%_get_AAD_loop128:
        cmp     %%T2, 128
        jl      %%_exit_AAD_loop128

        movdqu  %%XTMP0, [%%T1 + 16*0]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        pxor    %%XTMP0, %%AAD_HASH

        movdqu  %%XTMP5, [%%GDATA_KEY + HashKey_8]
        movdqa  %%XTMP1, %%XTMP0
        movdqa  %%XTMP2, %%XTMP0
        movdqa  %%XTMP3, %%XTMP0
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP1, %%XTMP5, 0x11  ; %%T1 = a1*b1
        pclmulqdq       %%XTMP2, %%XTMP5, 0x00  ; %%T2 = a0*b0
        pclmulqdq       %%XTMP3, %%XTMP5, 0x01  ; %%T3 = a1*b0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10  ; %%T4 = a0*b1
        pxor    %%XTMP3, %%XTMP4                ; %%T3 = a1*b0 + a0*b1

%assign i 1
%assign j 7
%rep 7
        movdqu  %%XTMP0, [%%T1 + 16*i]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        movdqu  %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x11  ; %%T1 = T1 + a1*b1
        pxor    %%XTMP1, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x00  ; %%T2 = T2 + a0*b0
        pxor    %%XTMP2, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x01  ; %%T3 = T3 + a1*b0 + a0*b1
        pxor    %%XTMP3, %%XTMP4
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
        pxor    %%XTMP3, %%XTMP4
%assign i (i + 1)
%assign j (j - 1)
%endrep

        movdqa  %%XTMP4, %%XTMP3
        pslldq  %%XTMP4, 8                      ; shift-L 2 DWs
        psrldq  %%XTMP3, 8                      ; shift-R 2 DWs
        pxor    %%XTMP2, %%XTMP4
        pxor    %%XTMP1, %%XTMP3                ; accumulate the results in %%XTMP1(M):%%XTMP2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        movdqa  %%XTMP5, [rel POLY2]
        movdqa  %%XTMP0, %%XTMP5
        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
        pslldq  %%XTMP0, 8                      ; shift-L %%XTMP0 2 DWs
        pxor    %%XTMP2, %%XTMP0                ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        movdqa  %%XTMP3, %%XTMP5
        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
        psrldq  %%XTMP3, 4                      ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        movdqa  %%XTMP4, %%XTMP5
        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
        pslldq  %%XTMP4, 4                      ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        pxor    %%XTMP4, %%XTMP3                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  %%AAD_HASH, %%XTMP1
        pxor    %%AAD_HASH, %%XTMP4             ; the result is in %%AAD_HASH

        sub     %%T2, 128
        je      %%_CALC_AAD_done

        add     %%T1, 128
        jmp     %%_get_AAD_loop128

%%_exit_AAD_loop128:
        cmp     %%T2, 16
        jl      %%_get_small_AAD_block

        ;; calculate hash_key position to start with
        mov     %%T3, %%T2
        and     %%T3, -16       ; 1 to 7 blocks possible here
        neg     %%T3
        add     %%T3, HashKey_1 + 16
        lea     %%T3, [%%GDATA_KEY + %%T3]

        movdqu  %%XTMP0, [%%T1]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        pxor    %%XTMP0, %%AAD_HASH

        movdqu  %%XTMP5, [%%T3]
        movdqa  %%XTMP1, %%XTMP0
        movdqa  %%XTMP2, %%XTMP0
        movdqa  %%XTMP3, %%XTMP0
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP1, %%XTMP5, 0x11  ; %%T1 = a1*b1
        pclmulqdq       %%XTMP2, %%XTMP5, 0x00  ; %%T2 = a0*b0
        pclmulqdq       %%XTMP3, %%XTMP5, 0x01  ; %%T3 = a1*b0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10  ; %%T4 = a0*b1
        pxor    %%XTMP3, %%XTMP4                ; %%T3 = a1*b0 + a0*b1

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16        ; move to next data block
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce

%%_AAD_blocks:
        movdqu  %%XTMP0, [%%T1]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        movdqu  %%XTMP5, [%%T3]
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x11  ; %%T1 = T1 + a1*b1
        pxor    %%XTMP1, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x00  ; %%T2 = T2 + a0*b0
        pxor    %%XTMP2, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x01  ; %%T3 = T3 + a1*b0 + a0*b1
        pxor    %%XTMP3, %%XTMP4
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
        pxor    %%XTMP3, %%XTMP4

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce
        jmp     %%_AAD_blocks

%%_AAD_reduce:
        movdqa  %%XTMP4, %%XTMP3
        pslldq  %%XTMP4, 8                      ; shift-L 2 DWs
        psrldq  %%XTMP3, 8                      ; shift-R 2 DWs
        pxor    %%XTMP2, %%XTMP4
        pxor    %%XTMP1, %%XTMP3                ; accumulate the results in %%XTMP1(M):%%XTMP2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        movdqa  %%XTMP5, [rel POLY2]
        movdqa  %%XTMP0, %%XTMP5
        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
        pslldq  %%XTMP0, 8                      ; shift-L %%XTMP0 2 DWs
        pxor    %%XTMP2, %%XTMP0                ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        movdqa  %%XTMP3, %%XTMP5
        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
        psrldq  %%XTMP3, 4                      ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        movdqa  %%XTMP4, %%XTMP5
        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
        pslldq  %%XTMP4, 4                      ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        pxor    %%XTMP4, %%XTMP3                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  %%AAD_HASH, %%XTMP1
        pxor    %%AAD_HASH, %%XTMP4             ; the result is in %%AAD_HASH

        or      %%T2, %%T2
        je      %%_CALC_AAD_done

%%_get_small_AAD_block:
        movdqu  %%XTMP0, [%%GDATA_KEY + HashKey]
        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
        ;byte-reflect the AAD data
        pshufb  %%XTMP1, [rel SHUF_MASK]
        pxor    %%AAD_HASH, %%XTMP1
        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag for partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
; and whether encoding or decoding (ENC_DEC).
; Output: The ciphertext of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX.
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
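; Example of the shuffle-mask trick used below: if PBlockLen = 4, the mask
; loaded from SHIFT_MASK+4 byte-rotates the saved E(K,Yn) so that its unused
; keystream bytes line up with the new input, and the mask at ALL_F-SHIFT_MASK
; then clears the 4 byte positions already consumed by the previous call.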
%macro PARTIAL_BLOCK    8
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%DATA_OFFSET           %6
%define %%AAD_HASH              %7
%define %%ENC_DEC               %8
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        je      %%_partial_block_done           ; Leave macro if no partial blocks

        cmp     %%PLAIN_CYPH_LEN, 16            ; Read in input data without over-reading
        jl      %%_fewer_than_16_bytes
        XLDR    xmm1, [%%PLAIN_CYPH_IN]         ; If at least 16 bytes of data, just fill the xmm register
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
        mov     r13, [%%GDATA_CTX + PBlockLen]

%%_data_read:                                   ; Finished reading in data


        movdqu  xmm9, [%%GDATA_CTX + PBlockEncKey]      ; xmm9 = ctx_data.partial_block_enc_key
        movdqu  xmm13, [%%GDATA_KEY + HashKey]

        lea     r12, [SHIFT_MASK]

        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
        movdqu  xmm2, [r12]                     ; get the appropriate shuffle mask
        pshufb  xmm9, xmm2                      ; shift right r13 bytes

%ifidn  %%ENC_DEC, DEC
        movdqa  xmm3, xmm1
        pxor    xmm9, xmm1                      ; Ciphertext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16                         ; Set r15 to be the amount of data left in PLAIN_CYPH_IN after filling the block
        jge     %%_no_extra_mask_1              ; Determine if the partial block is not being filled and shift mask accordingly
        sub     r12, r15
%%_no_extra_mask_1:

        movdqu  xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        pand    xmm3, xmm1
        pshufb  xmm3, [SHUF_MASK]
        pshufb  xmm3, xmm2
        pxor    %%AAD_HASH, xmm3


        cmp     r15, 0
        jl      %%_partial_incomplete_1

        GHASH_MUL       %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6      ; GHASH computation for the last <16 Byte block
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_dec_done
%%_partial_incomplete_1:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_dec_done:
        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH

%else
        pxor    xmm9, xmm1                      ; Plaintext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16                         ; Set r15 to be the amount of data left in PLAIN_CYPH_IN after filling the block
        jge     %%_no_extra_mask_2              ; Determine if the partial block is not being filled and shift mask accordingly
        sub     r12, r15
%%_no_extra_mask_2:

        movdqu  xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        pshufb  xmm9, [SHUF_MASK]
        pshufb  xmm9, xmm2
        pxor    %%AAD_HASH, xmm9

        cmp     r15, 0
        jl      %%_partial_incomplete_2

        GHASH_MUL       %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6      ; GHASH computation for the last <16 Byte block
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_encode_done
%%_partial_incomplete_2:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_encode_done:
        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH

        pshufb  xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
        pshufb  xmm9, xmm2
%endif


        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; output encrypted Bytes
        cmp     r15, 0
        jl      %%_partial_fill
        mov     r12, r13
        mov     r13, 16
        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
        jmp     %%_count_set
%%_partial_fill:
        mov     r13, %%PLAIN_CYPH_LEN
%%_count_set:
        movq    rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        psrldq  xmm9, 8
        movq    rax, xmm9
        sub     r13, 8
%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%%_partial_block_done:
%endmacro ; PARTIAL_BLOCK


; if a = number of total plaintext bytes
; b = floor(a/16)
; %%num_initial_blocks = b mod 8;
; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified
; Updated AAD_HASH is returned in %%T3
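; e.g. a = 100 plaintext bytes -> b = 6 full blocks -> %%num_initial_blocks = 6;
; those 6 blocks are handled here and the rest of the message falls through to
; the by-8 loop and/or the partial-block path in GCM_ENC_DEC.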

%macro INITIAL_BLOCKS   24
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%LENGTH                %5
%define %%DATA_OFFSET           %6
%define %%num_initial_blocks    %7      ; can be 0, 1, 2, 3, 4, 5, 6 or 7
%define %%T1            %8
%define %%HASH_KEY      %9
%define %%T3            %10
%define %%T4            %11
%define %%T5            %12
%define %%CTR           %13
%define %%XMM1          %14
%define %%XMM2          %15
%define %%XMM3          %16
%define %%XMM4          %17
%define %%XMM5          %18
%define %%XMM6          %19
%define %%XMM7          %20
%define %%XMM8          %21
%define %%T6            %22
%define %%T_key         %23
%define %%ENC_DEC       %24

%assign i (8-%%num_initial_blocks)
                movdqu  reg(i), %%XMM8  ; move AAD_HASH to temp reg

                ; start AES for %%num_initial_blocks blocks
                movdqu  %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0


%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  reg(i), %%CTR
                pshufb  reg(i), [SHUF_MASK]     ; perform a 16Byte swap
%assign i (i+1)
%endrep

                movdqu  %%T_key, [%%GDATA_KEY+16*0]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                pxor    reg(i), %%T_key
%assign i (i+1)
%endrep

%assign j 1
%rep NROUNDS                            ; encrypt N blocks with NROUNDS key rounds (9 for GCM128, 11 for GCM192, 13 for GCM256)
                movdqu  %%T_key, [%%GDATA_KEY+16*j]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                aesenc  reg(i), %%T_key
%assign i (i+1)
%endrep

%assign j (j+1)
%endrep


                movdqu  %%T_key, [%%GDATA_KEY+16*j]     ; encrypt with last key round (the 10th/12th/14th for GCM128/192/256)
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                aesenclast      reg(i), %%T_key
%assign i (i+1)
%endrep

%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
                pxor    reg(i), %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)      ; write back ciphertext for %%num_initial_blocks blocks
                add     %%DATA_OFFSET, 16
                %ifidn  %%ENC_DEC, DEC
                movdqa  reg(i), %%T1
                %endif
                pshufb  reg(i), [SHUF_MASK]     ; prepare ciphertext for GHASH computations
%assign i (i+1)
%endrep


%assign i (8-%%num_initial_blocks)
%assign j (9-%%num_initial_blocks)

%rep %%num_initial_blocks
        pxor    reg(j), reg(i)
        GHASH_MUL       reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6       ; apply GHASH on %%num_initial_blocks blocks
%assign i (i+1)
%assign j (j+1)
%endrep
        ; %%XMM8 has the current Hash Value
        movdqa  %%T3, %%XMM8

        cmp     %%LENGTH, 128
        jl      %%_initial_blocks_done          ; no need for precomputed constants

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; prepare 8 counter blocks and encrypt them up front for the first by-8 pass
                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM1, %%CTR
                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM2, %%CTR
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM3, %%CTR
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM4, %%CTR
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM5, %%CTR
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM6, %%CTR
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM7, %%CTR
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM8, %%CTR
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

                movdqu  %%T_key, [%%GDATA_KEY+16*0]
                pxor    %%XMM1, %%T_key
                pxor    %%XMM2, %%T_key
                pxor    %%XMM3, %%T_key
                pxor    %%XMM4, %%T_key
                pxor    %%XMM5, %%T_key
                pxor    %%XMM6, %%T_key
                pxor    %%XMM7, %%T_key
                pxor    %%XMM8, %%T_key


%assign i 1
%rep NROUNDS                            ; do all but the last key round (9/11/13 rounds for GCM128/192/256)
                movdqu  %%T_key, [%%GDATA_KEY+16*i]
                aesenc  %%XMM1, %%T_key
                aesenc  %%XMM2, %%T_key
                aesenc  %%XMM3, %%T_key
                aesenc  %%XMM4, %%T_key
                aesenc  %%XMM5, %%T_key
                aesenc  %%XMM6, %%T_key
                aesenc  %%XMM7, %%T_key
                aesenc  %%XMM8, %%T_key
%assign i (i+1)
%endrep


                movdqu  %%T_key, [%%GDATA_KEY+16*i]     ; do final key round
                aesenclast      %%XMM1, %%T_key
                aesenclast      %%XMM2, %%T_key
                aesenclast      %%XMM3, %%T_key
                aesenclast      %%XMM4, %%T_key
                aesenclast      %%XMM5, %%T_key
                aesenclast      %%XMM6, %%T_key
                aesenclast      %%XMM7, %%T_key
                aesenclast      %%XMM8, %%T_key

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
                pxor    %%XMM1, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM1, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
                pxor    %%XMM2, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM2, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
                pxor    %%XMM3, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM3, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
                pxor    %%XMM4, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM4, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
                pxor    %%XMM5, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM5, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
                pxor    %%XMM6, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM6, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
                pxor    %%XMM7, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM7, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
                pxor    %%XMM8, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM8, %%T1
                %endif

                add     %%DATA_OFFSET, 128

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pxor    %%XMM1, %%T3            ; combine GHASHed value with the corresponding ciphertext
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_initial_blocks_done:


%endmacro



; encrypt 8 blocks at a time
; ghash the 8 previously encrypted ciphertext blocks
; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
; %%DATA_OFFSET is the data offset value
%macro  GHASH_8_ENCRYPT_8_PARALLEL 22
%define %%GDATA                 %1
%define %%CYPH_PLAIN_OUT        %2
%define %%PLAIN_CYPH_IN         %3
%define %%DATA_OFFSET           %4
%define %%T1    %5
%define %%T2    %6
%define %%T3    %7
%define %%T4    %8
%define %%T5    %9
%define %%T6    %10
%define %%CTR   %11
%define %%XMM1  %12
%define %%XMM2  %13
%define %%XMM3  %14
%define %%XMM4  %15
%define %%XMM5  %16
%define %%XMM6  %17
%define %%XMM7  %18
%define %%XMM8  %19
%define %%T7    %20
%define %%loop_idx      %21
%define %%ENC_DEC       %22

        movdqa  %%T7, %%XMM1
        movdqu  [rsp + TMP2], %%XMM2
        movdqu  [rsp + TMP3], %%XMM3
        movdqu  [rsp + TMP4], %%XMM4
        movdqu  [rsp + TMP5], %%XMM5
        movdqu  [rsp + TMP6], %%XMM6
        movdqu  [rsp + TMP7], %%XMM7
        movdqu  [rsp + TMP8], %%XMM8

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method

        movdqa  %%T4, %%T7
        pshufd  %%T6, %%T7, 01001110b
        pxor    %%T6, %%T7
        %ifidn %%loop_idx, in_order
                paddd   %%CTR, [ONE]            ; INCR CNT
        %else
                paddd   %%CTR, [ONEf]           ; INCR CNT
        %endif
        movdqu  %%T5, [%%GDATA + HashKey_8]
        pclmulqdq       %%T4, %%T5, 0x11        ; %%T4 = a1*b1
        pclmulqdq       %%T7, %%T5, 0x00        ; %%T7 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_8_k]
        pclmulqdq       %%T6, %%T5, 0x00        ; %%T6 = (a1+a0)*(b1+b0)
        movdqa  %%XMM1, %%CTR

        %ifidn %%loop_idx, in_order
                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM2, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM3, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM4, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM5, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM6, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM7, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM8, %%CTR

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
        %else
                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM2, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM3, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM4, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM5, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM6, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM7, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM8, %%CTR
        %endif
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

                movdqu  %%T1, [%%GDATA + 16*0]
                pxor    %%XMM1, %%T1
                pxor    %%XMM2, %%T1
                pxor    %%XMM3, %%T1
                pxor    %%XMM4, %%T1
                pxor    %%XMM5, %%T1
                pxor    %%XMM6, %%T1
                pxor    %%XMM7, %%T1
                pxor    %%XMM8, %%T1

        ;; %%T1 (a stored ciphertext block) and %%T5 (a hash key power) hold the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method
        movdqu  %%T1, [rsp + TMP2]
        movdqa  %%T3, %%T1

        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_7]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_7_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

                movdqu  %%T1, [%%GDATA + 16*1]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1


                movdqu  %%T1, [%%GDATA + 16*2]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; Karatsuba Method
        movdqu  %%T1, [rsp + TMP3]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_6]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_6_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

                movdqu  %%T1, [%%GDATA + 16*3]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP4]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_5]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_5_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

                movdqu  %%T1, [%%GDATA + 16*4]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T1, [%%GDATA + 16*5]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP5]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_4]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_4_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2


                movdqu  %%T1, [%%GDATA + 16*6]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1
        movdqu  %%T1, [rsp + TMP6]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_3]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_3_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

                movdqu  %%T1, [%%GDATA + 16*7]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP7]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_2]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_2_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

                movdqu  %%T1, [%%GDATA + 16*8]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1


        ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method
        movdqu  %%T1, [rsp + TMP8]
        movdqa  %%T3, %%T1

        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T7, %%T3
        pxor    %%T4, %%T1

                movdqu  %%T1, [%%GDATA + 16*9]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1


%ifdef GCM128_MODE
                movdqu  %%T5, [%%GDATA + 16*10]
%endif
%ifdef GCM192_MODE
                movdqu  %%T1, [%%GDATA + 16*10]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T1, [%%GDATA + 16*11]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T5, [%%GDATA + 16*12] ; finish last key round
%endif
%ifdef GCM256_MODE
                movdqu  %%T1, [%%GDATA + 16*10]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T1, [%%GDATA + 16*11]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T1, [%%GDATA + 16*12]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T1, [%%GDATA + 16*13]
                aesenc  %%XMM1, %%T1
                aesenc  %%XMM2, %%T1
                aesenc  %%XMM3, %%T1
                aesenc  %%XMM4, %%T1
                aesenc  %%XMM5, %%T1
                aesenc  %%XMM6, %%T1
                aesenc  %%XMM7, %%T1
                aesenc  %%XMM8, %%T1

                movdqu  %%T5, [%%GDATA + 16*14] ; finish last key round
%endif

%assign i 0
%assign j 1
%rep 8
                XLDR    %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]

%ifidn %%ENC_DEC, DEC
                movdqa  %%T3, %%T1
%endif

                pxor    %%T1, %%T5
                aesenclast      reg(j), %%T1    ; XMM1:XMM8
                XSTR    [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j)   ; Write to the Output buffer

%ifidn %%ENC_DEC, DEC
                movdqa  reg(j), %%T3
%endif
%assign i (i+1)
%assign j (j+1)
%endrep



        pxor    %%T2, %%T6
        pxor    %%T2, %%T4
        pxor    %%T2, %%T7


        movdqa  %%T3, %%T2
        pslldq  %%T3, 8                 ; shift-L %%T3 2 DWs
        psrldq  %%T2, 8                 ; shift-R %%T2 2 DWs
        pxor    %%T7, %%T3
        pxor    %%T4, %%T2              ; accumulate the results in %%T4:%%T7



        ;first phase of the reduction
        movdqa  %%T2, %%T7
        movdqa  %%T3, %%T7
        movdqa  %%T1, %%T7              ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently

        pslld   %%T2, 31                ; packed left shift << 31
        pslld   %%T3, 30                ; packed left shift << 30
        pslld   %%T1, 25                ; packed left shift << 25
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T1

        movdqa  %%T5, %%T2
        psrldq  %%T5, 4                 ; shift-R %%T5 1 DW

        pslldq  %%T2, 12                ; shift-L %%T2 3 DWs
        pxor    %%T7, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

        ;second phase of the reduction
        movdqa  %%T2, %%T7              ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
        movdqa  %%T3, %%T7
        movdqa  %%T1, %%T7

        psrld   %%T2, 1                 ; packed right shift >> 1
        psrld   %%T3, 2                 ; packed right shift >> 2
        psrld   %%T1, 7                 ; packed right shift >> 7
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T1

        pxor    %%T2, %%T5
        pxor    %%T7, %%T2
        pxor    %%T7, %%T4              ; the result is in %%T7


        pxor    %%XMM1, %%T7

%endmacro


; GHASH the last 8 ciphertext blocks.
%macro  GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16

        ; Karatsuba Method
        movdqa  %%T6, %%XMM1
        pshufd  %%T2, %%XMM1, 01001110b
        pxor    %%T2, %%XMM1
        movdqu  %%T5, [%%GDATA + HashKey_8]
        pclmulqdq       %%T6, %%T5, 0x11        ; %%T6 = a1*b1

        pclmulqdq       %%XMM1, %%T5, 0x00      ; %%XMM1 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_8_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        movdqa  %%T7, %%XMM1
        movdqa  %%XMM1, %%T2                    ; result in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM2
        pshufd  %%T2, %%XMM2, 01001110b
        pxor    %%T2, %%XMM2
        movdqu  %%T5, [%%GDATA + HashKey_7]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM2, %%T5, 0x00      ; %%XMM2 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_7_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM2
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM3
        pshufd  %%T2, %%XMM3, 01001110b
        pxor    %%T2, %%XMM3
        movdqu  %%T5, [%%GDATA + HashKey_6]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM3, %%T5, 0x00      ; %%XMM3 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_6_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM3
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM4
        pshufd  %%T2, %%XMM4, 01001110b
        pxor    %%T2, %%XMM4
        movdqu  %%T5, [%%GDATA + HashKey_5]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM4, %%T5, 0x00      ; %%XMM4 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_5_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM4
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM5
        pshufd  %%T2, %%XMM5, 01001110b
        pxor    %%T2, %%XMM5
        movdqu  %%T5, [%%GDATA + HashKey_4]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM5, %%T5, 0x00      ; %%XMM5 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_4_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM5
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM6
        pshufd  %%T2, %%XMM6, 01001110b
        pxor    %%T2, %%XMM6
        movdqu  %%T5, [%%GDATA + HashKey_3]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM6, %%T5, 0x00      ; %%XMM6 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_3_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM6
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM7
        pshufd  %%T2, %%XMM7, 01001110b
        pxor    %%T2, %%XMM7
        movdqu  %%T5, [%%GDATA + HashKey_2]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM7, %%T5, 0x00      ; %%XMM7 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_2_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM7
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM8
        pshufd  %%T2, %%XMM8, 01001110b
        pxor    %%T2, %%XMM8
        movdqu  %%T5, [%%GDATA + HashKey]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM8, %%T5, 0x00      ; %%XMM8 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM8
        pxor    %%T2, %%XMM1
        pxor    %%T2, %%T6
        pxor    %%T2, %%T7              ; middle section of the temp results combined as in Karatsuba algorithm


        movdqa  %%T4, %%T2
        pslldq  %%T4, 8                 ; shift-L %%T4 2 DWs
        psrldq  %%T2, 8                 ; shift-R %%T2 2 DWs
        pxor    %%T7, %%T4
        pxor    %%T6, %%T2              ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications


        ;first phase of the reduction
        movdqa  %%T2, %%T7
        movdqa  %%T3, %%T7
        movdqa  %%T4, %%T7              ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                ; packed left shift << 31
        pslld   %%T3, 30                ; packed left shift << 30
        pslld   %%T4, 25                ; packed left shift << 25
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T1, %%T2
        psrldq  %%T1, 4                 ; shift-R %%T1 1 DW

        pslldq  %%T2, 12                ; shift-L %%T2 3 DWs
        pxor    %%T7, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2, %%T7              ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3, %%T7
        movdqa  %%T4, %%T7

        psrld   %%T2, 1                 ; packed right shift >> 1
        psrld   %%T3, 2                 ; packed right shift >> 2
        psrld   %%T4, 7                 ; packed right shift >> 7
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T4

        pxor    %%T2, %%T1
        pxor    %%T7, %%T2
        pxor    %%T6, %%T7              ; the result is in %%T6

%endmacro

; Encryption of a single block
%macro ENCRYPT_SINGLE_BLOCK 3
%define %%GDATA %1
%define %%ST    %2
%define %%T1    %3
        movdqu  %%T1, [%%GDATA+16*0]
        pxor    %%ST, %%T1
%assign i 1
%rep NROUNDS
        movdqu  %%T1, [%%GDATA+16*i]
        aesenc  %%ST, %%T1
%assign i (i+1)
%endrep
        movdqu  %%T1, [%%GDATA+16*i]
        aesenclast      %%ST, %%T1
%endmacro


;; Start of Stack Setup

%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ;the number of pushes must equal STACK_OFFSET
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        movdqu  [rsp + LOCAL_STORAGE + 0*16], xmm6
        movdqu  [rsp + LOCAL_STORAGE + 1*16], xmm7
        movdqu  [rsp + LOCAL_STORAGE + 2*16], xmm8
        movdqu  [rsp + LOCAL_STORAGE + 3*16], xmm9
        movdqu  [rsp + LOCAL_STORAGE + 4*16], xmm10
        movdqu  [rsp + LOCAL_STORAGE + 5*16], xmm11
        movdqu  [rsp + LOCAL_STORAGE + 6*16], xmm12
        movdqu  [rsp + LOCAL_STORAGE + 7*16], xmm13
        movdqu  [rsp + LOCAL_STORAGE + 8*16], xmm14
        movdqu  [rsp + LOCAL_STORAGE + 9*16], xmm15
%endif
%endmacro


%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + LOCAL_STORAGE + 9*16]
        movdqu  xmm14, [rsp + LOCAL_STORAGE + 8*16]
        movdqu  xmm13, [rsp + LOCAL_STORAGE + 7*16]
        movdqu  xmm12, [rsp + LOCAL_STORAGE + 6*16]
        movdqu  xmm11, [rsp + LOCAL_STORAGE + 5*16]
        movdqu  xmm10, [rsp + LOCAL_STORAGE + 4*16]
        movdqu  xmm9, [rsp + LOCAL_STORAGE + 3*16]
        movdqu  xmm8, [rsp + LOCAL_STORAGE + 2*16]
        movdqu  xmm7, [rsp + LOCAL_STORAGE + 1*16]
        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif

        ;; Required for Update/GCM_ENC
        mov     rsp, r14
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
; Clobbers rax, r10-r13 and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm0

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        pxor    xmm2, xmm3
        mov     r10, %%A_LEN

        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad_hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0
        mov     r10, %%IV
        movdqa  xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        pinsrq  xmm2, [r10], 0
        pinsrd  xmm2, [r10+8], 2
        movdqu  [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        pshufb  xmm2, [SHUF_MASK]

        movdqu  [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro
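
; Typical call sequence (a sketch; the public wrappers built from these macros
; via FN_NAME live later in this file): GCM_INIT once per message, GCM_ENC_DEC
; for each chunk of plaintext/ciphertext, then a finalize step to emit the tag.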


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
; struct has been initialized by GCM_INIT.
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
; Output: The ciphertext of the given plaintext (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC     6
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%ENC_DEC               %6
%define %%DATA_OFFSET           r11

; Macro flow:
; calculate the number of 16-byte blocks in the message
; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
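; Example: PLAIN_CYPH_LEN = 200 -> 12 full blocks (192 bytes); 12 mod 8 = 4, so
; 4 blocks go through INITIAL_BLOCKS, 8 through one pass of
; GHASH_8_ENCRYPT_8_PARALLEL, and the trailing 8 bytes through the partial path.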

        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_multiple_of_16_bytes

        xor     %%DATA_OFFSET, %%DATA_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        mov     r12, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + InLen], r12              ; update length of data processed
%else
        add     [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; update length of data processed
%endif
        movdqu  xmm13, [%%GDATA_KEY + HashKey]          ; xmm13 = HashKey
        movdqu  xmm8, [%%GDATA_CTX + AadHash]

        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC

        mov     r13, %%PLAIN_CYPH_LEN                   ; save the number of bytes of plaintext/ciphertext
        sub     r13, %%DATA_OFFSET
        mov     r10, r13                                ; save the amount of data left to process in r10
        and     r13, -16                                ; r13 = r13 - (r13 mod 16)

        mov     r12, r13
        shr     r12, 4
        and     r12, 7
        jz      %%_initial_num_blocks_is_0

        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*7
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*6
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*5
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*4
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*3
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*2
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC

%%_initial_blocks_encrypted:
        cmp     r13, 0
        je      %%_zero_cipher_left

        sub     r13, 128
        je      %%_eight_cipher_left

        ; keep the low byte of the (byte-reflected) counter in r15d so the
        ; loop below can tell when an 8-block increment would carry out of
        ; the low byte
        movd    r15d, xmm9
        and     r15d, 255
        pshufb  xmm9, [SHUF_MASK]

%%_encrypt_by_8_new:
        cmp     r15d, 255-8
        jg      %%_encrypt_by_8

        ; no carry out of the low counter byte: increment in byte-reflected
        ; (out_order) form and skip the extra shuffles
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]
        jmp     %%_eight_cipher_left

%%_encrypt_by_8:
        ; the low counter byte would overflow: shuffle back to big-endian
        ; (in_order) so the increments propagate the carry correctly
        pshufb  xmm9, [SHUF_MASK]
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
        pshufb  xmm9, [SHUF_MASK]
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]

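; Illustrative only, not assembled: what the 255-8 test above is doing, as a
; C sketch (names hypothetical).
;
;     /* ctr_lo is the low byte of the counter; 8 blocks are consumed per   */
;     /* iteration. If ctr_lo + 8 stays within 0..255 no carry can leave    */
;     /* the low byte, so the cheap out_order increment is safe; otherwise  */
;     /* take the in_order path, which propagates the carry.                */
;     if (ctr_lo <= 255 - 8) { /* out_order: bump the low byte only     */ }
;     else                   { /* in_order: full big-endian increment   */ }
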
%%_eight_cipher_left:
        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

%%_zero_cipher_left:
        movdqu  [%%GDATA_CTX + AadHash], xmm14
        movdqu  [%%GDATA_CTX + CurCount], xmm9

        mov     r13, r10
        and     r13, 15                                 ; r13 = (%%PLAIN_CYPH_LEN mod 16)

        je      %%_multiple_of_16_bytes

        mov     [%%GDATA_CTX + PBlockLen], r13          ; ctx_data.partial_block_length = r13
        ; handle the last <16-byte block separately

        paddd   xmm9, [ONE]                             ; INCR CNT to get Yn
        movdqu  [%%GDATA_CTX + CurCount], xmm9          ; ctx_data.current_counter = xmm9
        pshufb  xmm9, [SHUF_MASK]
        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2    ; E(K, Yn)
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm9      ; ctx_data.partial_block_enc_key = xmm9

        cmp     %%PLAIN_CYPH_LEN, 16
        jge     %%_large_enough_update

        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13
        jmp     %%_data_read

%%_large_enough_update:
        sub     %%DATA_OFFSET, 16
        add     %%DATA_OFFSET, r13

        movdqu  xmm1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] ; receive the last <16-byte block

        sub     %%DATA_OFFSET, r13
        add     %%DATA_OFFSET, 16

        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13                                ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
        movdqu  xmm2, [r12]                             ; get the appropriate shuffle mask
        pshufb  xmm1, xmm2                              ; shift right 16-r13 bytes
%%_data_read:
%ifidn %%ENC_DEC, DEC
        movdqa  xmm2, xmm1
        pxor    xmm9, xmm1                              ; ciphertext XOR E(K, Yn) = plaintext
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pand    xmm2, xmm1                              ; for DEC, GHASH the (masked) ciphertext
        pshufb  xmm2, [SHUF_MASK]
        pxor    xmm14, xmm2
        movdqu  [%%GDATA_CTX + AadHash], xmm14

%else
        pxor    xmm9, xmm1                              ; plaintext XOR E(K, Yn) = ciphertext
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pshufb  xmm9, [SHUF_MASK]
        pxor    xmm14, xmm9
        movdqu  [%%GDATA_CTX + AadHash], xmm14

        pshufb  xmm9, [SHUF_MASK]                       ; shuffle xmm9 back to output as ciphertext
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; output r13 bytes
        movq    rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        psrldq  xmm9, 8
        movq    rax, xmm9
        sub     r13, 8

%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_multiple_of_16_bytes:

%endmacro
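
; Illustrative only, not assembled: the net effect of the SHIFT_MASK/ALL_F
; trick used above for the final sub-16-byte block, as a C sketch (names
; hypothetical). The asm achieves this branchlessly with pshufb/pand on a
; full 16-byte load.
;
;     /* tail_len = len mod 16; ek_yn = E(K, Yn) */
;     uint8_t block[16] = {0};
;     memcpy(block, in + off, tail_len);         /* only the tail bytes    */
;     for (size_t i = 0; i < tail_len; i++)
;         out[off + i] = block[i] ^ ek_yn[i];    /* tail XOR keystream     */
;     /* only the masked tail_len bytes enter the running GHASH state      */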


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_ENC_DEC finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
; whether encoding or decoding (ENC_DEC).
; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_COMPLETE 5
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%AUTH_TAG              %3
%define %%AUTH_TAG_LEN          %4
%define %%ENC_DEC               %5
%define %%PLAIN_CYPH_LEN        rax

        mov     r12, [%%GDATA_CTX + PBlockLen]          ; r12 = partial block length (number of bytes)
        movdqu  xmm14, [%%GDATA_CTX + AadHash]
        movdqu  xmm13, [%%GDATA_KEY + HashKey]

        cmp     r12, 0

        je      %%_partial_done

        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; GHASH computation for the last <16-byte block
        movdqu  [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

        mov     r12, [%%GDATA_CTX + AadLen]             ; r12 = aadLen (number of bytes)
        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]

        shl     r12, 3                                  ; convert into number of bits
        movd    xmm15, r12d                             ; len(A) in xmm15

        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
        movq    xmm1, %%PLAIN_CYPH_LEN
        pslldq  xmm15, 8                                ; xmm15 = len(A) || 0x0000000000000000
        pxor    xmm15, xmm1                             ; xmm15 = len(A) || len(C)

        pxor    xmm14, xmm15
        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
        pshufb  xmm14, [SHUF_MASK]                      ; perform a 16-byte swap

        movdqu  xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2    ; E(K, Y0)

        pxor    xmm9, xmm14

%%_return_T:
        mov     r10, %%AUTH_TAG                         ; r10 = authTag
        mov     r11, %%AUTH_TAG_LEN                     ; r11 = auth_tag_len

        cmp     r11, 16
        je      %%_T_16

        cmp     r11, 12
        je      %%_T_12

        cmp     r11, 8
        je      %%_T_8

        simd_store_sse r10, xmm9, r11, r12, rax
        jmp     %%_return_T_done
%%_T_8:
        movq    rax, xmm9
        mov     [r10], rax
        jmp     %%_return_T_done
%%_T_12:
        movq    rax, xmm9
        mov     [r10], rax
        psrldq  xmm9, 8
        movd    eax, xmm9
        mov     [r10 + 8], eax
        jmp     %%_return_T_done
%%_T_16:
        movdqu  [r10], xmm9

%%_return_T_done:
%endmacro ; GCM_COMPLETE
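
; Illustrative only, not assembled: the tag computation GCM_COMPLETE performs,
; as a C sketch. store_be64, ghash_mul and aes_encrypt_block are hypothetical
; stand-ins for the length stores, GHASH_MUL and ENCRYPT_SINGLE_BLOCK.
;
;     uint8_t S[16];                          /* running GHASH state       */
;     uint8_t len_block[16];                  /* len(A) || len(C), in bits */
;     store_be64(len_block,     aad_len * 8);
;     store_be64(len_block + 8, msg_len * 8);
;     for (int i = 0; i < 16; i++) S[i] ^= len_block[i];
;     ghash_mul(S, H);                        /* final GHASH multiply      */
;     uint8_t eky0[16];
;     aes_encrypt_block(key, Y0, eky0);       /* E(K, Y0)                  */
;     for (int i = 0; i < 16; i++) S[i] ^= eky0[i];
;     memcpy(auth_tag, S, auth_tag_len);      /* truncate to 16/12/8 bytes */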


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
;        (struct gcm_key_data *key_data);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(precomp,_),function,)
FN_NAME(precomp,_):

        push    r12
        push    r13
        push    r14
        push    r15

        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                                ; align rsp to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; only xmm6 needs to be maintained
        movdqu  [rsp + LOCAL_STORAGE + 0*16], xmm6
%endif

        pxor    xmm6, xmm6
        ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2           ; xmm6 = HashKey = E(K, 0^128)

        pshufb  xmm6, [SHUF_MASK]
        ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;;;;;;;;;;;;;
        movdqa  xmm2, xmm6
        psllq   xmm6, 1
        psrlq   xmm2, 63
        movdqa  xmm1, xmm2
        pslldq  xmm2, 8
        psrldq  xmm1, 8
        por     xmm6, xmm2
        ; reduction
        pshufd  xmm2, xmm1, 00100100b
        pcmpeqd xmm2, [TWOONE]
        pand    xmm2, [POLY]
        pxor    xmm6, xmm2                              ; xmm6 holds the HashKey<<1 mod poly
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqu  [arg1 + HashKey], xmm6                  ; store HashKey<<1 mod poly

        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif
        mov     rsp, r14

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
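
; Illustrative only, not assembled: the semantics of the HashKey<<1 mod poly
; computation above, sketched in C over two 64-bit halves (names hypothetical;
; the constants assume POLY encodes the usual GHASH reduction polynomial, with
; TWOONE/POLY supplying the conditional XOR).
;
;     /* (hi:lo) = H; shift left by one as a 128-bit value, then reduce:   */
;     /* if the bit shifted out of the top was set, XOR in the polynomial. */
;     uint64_t carry = hi >> 63;
;     hi = (hi << 1) | (lo >> 63);
;     lo = (lo << 1);
;     if (carry) {
;         lo ^= 0x0000000000000001ULL;       /* POLY, low half            */
;         hi ^= 0xC200000000000000ULL;       /* POLY, high half           */
;     }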


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(init,_),function,)
FN_NAME(init,_):
        push    r12
        push    r13
%ifidn __OUTPUT_FORMAT__, win64
        push    r14
        push    r15
        mov     r14, rsp
        ; of xmm6:xmm15, only xmm6 is clobbered by GCM_INIT,
        ; so it is the only one saved for Windows
        sub     rsp, 1*16
        movdqu  [rsp + 0*16], xmm6
%endif

        GCM_INIT arg1, arg2, arg3, arg4, arg5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6, [rsp + 0*16]
        mov     rsp, r14
        pop     r15
        pop     r14
%endif
        pop     r13
        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_update_),function,)
FN_NAME(enc,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_update_),function,)
FN_NAME(dec,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_finalize_),function,)
FN_NAME(enc,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows;
        ; save the ones GCM_COMPLETE clobbers
        sub     rsp, 5*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm11
        movdqu  [rsp + 3*16], xmm14
        movdqu  [rsp + 4*16], xmm15
%endif
        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 4*16]
        movdqu  xmm14, [rsp + 3*16]
        movdqu  xmm11, [rsp + 2*16]
        movdqu  xmm9, [rsp + 1*16]
        movdqu  xmm6, [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_finalize_),function,)
FN_NAME(dec,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows;
        ; save the ones GCM_COMPLETE clobbers
        sub     rsp, 5*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm11
        movdqu  [rsp + 3*16], xmm14
        movdqu  [rsp + 4*16], xmm15
%endif
        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 4*16]
        movdqu  xmm14, [rsp + 3*16]
        movdqu  xmm11, [rsp + 2*16]
        movdqu  xmm9, [rsp + 1*16]
        movdqu  xmm6, [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_),function,)
FN_NAME(enc,_):

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC

        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC

        FUNC_RESTORE

        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_),function,)
FN_NAME(dec,_):

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC

        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC

        FUNC_RESTORE

        ret
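
; Illustrative only, not assembled: how the streaming entry points above are
; meant to be called from C, equivalent to a single aes_gcm_enc_128_sse()
; call. Variable names are hypothetical; key_data must already hold the
; expanded keys and the aes_gcm_precomp_128_sse() hash-key tables.
;
;     struct gcm_context_data ctx;
;     aes_gcm_init_128_sse(&key_data, &ctx, iv, aad, aad_len);
;     aes_gcm_enc_128_update_sse(&key_data, &ctx, out, in, part1_len);
;     aes_gcm_enc_128_update_sse(&key_data, &ctx, out + part1_len,
;                                in + part1_len, part2_len);
;     aes_gcm_enc_128_finalize_sse(&key_data, &ctx, tag, 16);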

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif