;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
;
; References:
;       This code was derived and highly optimized from the code described in the paper:
;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
;
;       For the shift-based reductions used in this code, we used the method described in the paper:
;               Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
;
;
;
;
; Assumptions:
;
;
;
; iv:
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                        Salt  (From the SA)                    |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                     Initialization Vector                     |
;       |        (This is the sequence number from the IPsec header)   |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x1                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;
;
; AAD:
;       AAD is padded with 0 to the next 16-byte multiple
;       for example, assume AAD is a u32 vector
;
;       if AAD is 8 bytes:
;       AAD[2] = {A0, A1};
;       padded AAD in xmm register = {A1 A0 0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A1)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                   32-bit Sequence Number (A0)                 |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 32-bit Sequence Number
;
;       if AAD is 12 bytes:
;       AAD[3] = {A0, A1, A2};
;       padded AAD in xmm register = {A2 A1 A0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A2)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |               64-bit Extended Sequence Number {A1,A0}         |
;       |                                                               |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 64-bit Extended Sequence Number
;
;
; aadLen:
;       The spec defines aadLen as a multiple of 4 bytes; this code
;       additionally supports any aadLen.
;
; TLen:
;       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentations are used: one tab
; for the GHASH part, two tabs for the AES part.
;

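;
; Note on the polynomial above: GCM is defined over GF(2^128) with the
; irreducible polynomial g(x) = x^128 + x^7 + x^2 + x + 1. Because GHASH
; processes bits in reflected order, this code works with the bit-reflected
; form x^128 + x^127 + x^126 + x^121 + 1; the shift amounts used by the
; reductions below (left by 31/30/25 = 32-1/32-2/32-7, then right by 1/2/7)
; come from the x, x^2 and x^7 terms of g(x), per the Gueron/Kounavis paper
; cited above.
;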
%include "include/os.asm"
%include "include/reg_sizes.asm"
%include "include/clear_regs.asm"
%include "include/gcm_defines.asm"
%include "include/gcm_keys_sse_avx.asm"
%include "include/memcpy.asm"

%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_sse.asm!"
%endif
%endif
%endif

%ifdef NO_AESNI
%define SSE sse_no_aesni
%else
%define SSE sse
%endif

%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ SSE
%define NROUNDS 9
%endif

%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ SSE
%define NROUNDS 11
%endif

%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ SSE
%define NROUNDS 13
%endif

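; With the definitions above, FN_NAME builds the exported symbol names used
; further down in this file; for example, under GCM128_MODE, FN_NAME(enc,_)
; expands to aes_gcm_enc_128_sse (or aes_gcm_enc_128_sse_no_aesni when
; NO_AESNI is defined).
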
default rel
; FUNC_SAVE below pushes 4 registers onto the stack; locals are addressed
; past them, so this offset must match the number of pushes (8 bytes each)
%define STACK_OFFSET    8*4

%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define TMP3    16*1    ; Temporary storage for AES State 3
%define TMP4    16*2    ; Temporary storage for AES State 4
%define TMP5    16*3    ; Temporary storage for AES State 5
%define TMP6    16*4    ; Temporary storage for AES State 6
%define TMP7    16*5    ; Temporary storage for AES State 7
%define TMP8    16*6    ; Temporary storage for AES State 8

%define LOCAL_STORAGE   16*7

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE     16*10
%else
        %define XMM_STORAGE     0
%endif

%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly (i.e. >>1)
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_MUL  7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; Karatsuba Method
        movdqa  %%T1, %%GH
        pshufd  %%T2, %%GH, 01001110b
        pshufd  %%T3, %%HK, 01001110b
        pxor    %%T2, %%GH              ; %%T2 = (a1+a0)
        pxor    %%T3, %%HK              ; %%T3 = (b1+b0)

        pclmulqdq       %%T1, %%HK, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%GH, %%HK, 0x00        ; %%GH = a0*b0
        pclmulqdq       %%T2, %%T3, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T2, %%GH
        pxor    %%T2, %%T1              ; %%T2 = a0*b1+a1*b0

        movdqa  %%T3, %%T2
        pslldq  %%T3, 8                 ; shift-L %%T3 2 DWs
        psrldq  %%T2, 8                 ; shift-R %%T2 2 DWs
        pxor    %%GH, %%T3
        pxor    %%T1, %%T2              ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK


        ;first phase of the reduction
        movdqa  %%T2, %%GH
        movdqa  %%T3, %%GH
        movdqa  %%T4, %%GH              ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                ; packed left shifting << 31
        pslld   %%T3, 30                ; packed left shifting << 30
        pslld   %%T4, 25                ; packed left shifting << 25
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T5, %%T2
        psrldq  %%T5, 4                 ; shift-R %%T5 1 DW

        pslldq  %%T2, 12                ; shift-L %%T2 3 DWs
        pxor    %%GH, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2,%%GH               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3,%%GH
        movdqa  %%T4,%%GH

        psrld   %%T2,1                  ; packed right shifting >> 1
        psrld   %%T3,2                  ; packed right shifting >> 2
        psrld   %%T4,7                  ; packed right shifting >> 7
        pxor    %%T2,%%T3               ; xor the shifted versions
        pxor    %%T2,%%T4

        pxor    %%T2, %%T5
        pxor    %%GH, %%T2
        pxor    %%GH, %%T1              ; the result is in %%GH


%endmacro
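
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Worked equation for the Karatsuba step above (illustrative note; write
; A = a1*x^64 + a0 and B = b1*x^64 + b0 over GF(2)):
;
;       A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
;           = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
;
; so the middle term costs one PCLMULQDQ of (a1+a0) by (b1+b0) plus two
; XORs instead of two PCLMULQDQs; this is also why PRECOMPUTE below stores
; HashKey_i_k, the XOR of the two 64-bit halves of each HashKey_i.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;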


%macro  PRECOMPUTE 8
%define %%GDATA %1
%define %%HK    %2
%define %%T1    %3
%define %%T2    %4
%define %%T3    %5
%define %%T4    %6
%define %%T5    %7
%define %%T6    %8


        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
        movdqa  %%T4, %%HK
        pshufd  %%T1, %%HK, 01001110b
        pxor    %%T1, %%HK
        movdqu  [%%GDATA + HashKey_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^2<<1 mod poly
        movdqu  [%%GDATA + HashKey_2], %%T4                    ;  [HashKey_2] = HashKey^2<<1 mod poly
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_2_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^3<<1 mod poly
        movdqu  [%%GDATA + HashKey_3], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_3_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^4<<1 mod poly
        movdqu  [%%GDATA + HashKey_4], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_4_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^5<<1 mod poly
        movdqu  [%%GDATA + HashKey_5], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_5_k], %%T1


        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^6<<1 mod poly
        movdqu  [%%GDATA + HashKey_6], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_6_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^7<<1 mod poly
        movdqu  [%%GDATA + HashKey_7], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_7_k], %%T1

        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6     ;  %%T4 = HashKey^8<<1 mod poly
        movdqu  [%%GDATA + HashKey_8], %%T4
        pshufd  %%T1, %%T4, 01001110b
        pxor    %%T1, %%T4
        movdqu  [%%GDATA + HashKey_8_k], %%T1


%endmacro

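; Why powers of the hash key are stored (illustrative identity): for blocks
; X1..X8 and running hash Y, eight GHASH iterations fold into one
; multi-block multiply,
;
;       ((((Y+X1)*H + X2)*H + ... ) + X8)*H
;           = (Y+X1)*H^8 + X2*H^7 + ... + X7*H^2 + X8*H
;
; which is the schedule used by CALC_AAD_HASH and GHASH_8_ENCRYPT_8_PARALLEL
; below: block i is multiplied by HashKey_(9-i) and all eight partial
; products are summed before a single reduction.
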

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
; Returns 0 if the data has length 0.
; Input: The input data (INPUT), that data's length (LENGTH).
; Output: The packed xmm register (OUTPUT).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro READ_SMALL_DATA_INPUT    6
%define %%OUTPUT                %1 ; %%OUTPUT is an xmm register
%define %%INPUT                 %2
%define %%LENGTH                %3
%define %%END_READ_LOCATION     %4 ; All this and the lower inputs are temp registers
%define %%COUNTER               %5
%define %%TMP1                  %6

        pxor    %%OUTPUT, %%OUTPUT
        mov     %%COUNTER, %%LENGTH
        mov     %%END_READ_LOCATION, %%INPUT
        add     %%END_READ_LOCATION, %%LENGTH
        xor     %%TMP1, %%TMP1


        cmp     %%COUNTER, 8
        jl      %%_byte_loop_2
        pinsrq  %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
        je      %%_done

        sub     %%COUNTER, 8

%%_byte_loop_1:                                 ;Read in data 1 byte at a time while data is left
        shl     %%TMP1, 8                       ;This loop handles when 8 bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_1
        pinsrq  %%OUTPUT, %%TMP1, 1
        jmp     %%_done

%%_byte_loop_2:                                 ;Read in data 1 byte at a time while data is left
        cmp     %%COUNTER, 0
        je      %%_done
        shl     %%TMP1, 8                       ;This loop handles when no bytes were already read in
        dec     %%END_READ_LOCATION
        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
        dec     %%COUNTER
        jg      %%_byte_loop_2
        pinsrq  %%OUTPUT, %%TMP1, 0
%%_done:

%endmacro ; READ_SMALL_DATA_INPUT

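; Illustrative trace (assumed example, not exercised code): for LENGTH = 13
; the pinsrq above reads bytes 0..7 into the low qword, then %%_byte_loop_1
; walks backwards from %%END_READ_LOCATION picking up bytes 12..8 into
; %%TMP1 (shifting left 8 bits per byte), and the final pinsrq drops them
; into the high qword in little-endian order; the three untouched top bytes
; of %%OUTPUT remain zero from the initial pxor.
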

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
; Output: The hash of the data (AAD_HASH).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  CALC_AAD_HASH   15
%define %%A_IN          %1
%define %%A_LEN         %2
%define %%AAD_HASH      %3
%define %%GDATA_KEY     %4
%define %%XTMP0         %5      ; xmm temp reg 0
%define %%XTMP1         %6      ; xmm temp reg 1
%define %%XTMP2         %7
%define %%XTMP3         %8
%define %%XTMP4         %9
%define %%XTMP5         %10     ; xmm temp reg 5
%define %%T1            %11     ; temp reg 1
%define %%T2            %12
%define %%T3            %13
%define %%T4            %14
%define %%T5            %15     ; temp reg 5


        mov     %%T1, %%A_IN            ; T1 = AAD
        mov     %%T2, %%A_LEN           ; T2 = aadLen
        pxor    %%AAD_HASH, %%AAD_HASH

%%_get_AAD_loop128:
        cmp     %%T2, 128
        jl      %%_exit_AAD_loop128

        movdqu  %%XTMP0, [%%T1 + 16*0]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        pxor    %%XTMP0, %%AAD_HASH

        movdqu  %%XTMP5, [%%GDATA_KEY + HashKey_8]
        movdqa  %%XTMP1, %%XTMP0
        movdqa  %%XTMP2, %%XTMP0
        movdqa  %%XTMP3, %%XTMP0
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP1, %%XTMP5, 0x11  ; %%T1 = a1*b1
        pclmulqdq       %%XTMP2, %%XTMP5, 0x00  ; %%T2 = a0*b0
        pclmulqdq       %%XTMP3, %%XTMP5, 0x01  ; %%T3 = a1*b0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10  ; %%T4 = a0*b1
        pxor    %%XTMP3, %%XTMP4                ; %%T3 = a1*b0 + a0*b1

%assign i 1
%assign j 7
%rep 7
        movdqu  %%XTMP0, [%%T1 + 16*i]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        movdqu  %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x11  ; %%T1 = T1 + a1*b1
        pxor    %%XTMP1, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x00  ; %%T2 = T2 + a0*b0
        pxor    %%XTMP2, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x01  ; %%T3 = T3 + a1*b0 + a0*b1
        pxor    %%XTMP3, %%XTMP4
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
        pxor    %%XTMP3, %%XTMP4
%assign i (i + 1)
%assign j (j - 1)
%endrep

        movdqa  %%XTMP4, %%XTMP3
        pslldq  %%XTMP4, 8                      ; shift-L 2 DWs
        psrldq  %%XTMP3, 8                      ; shift-R 2 DWs
        pxor    %%XTMP2, %%XTMP4
        pxor    %%XTMP1, %%XTMP3                ; accumulate the results in %%T1(M):%%T2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        movdqa  %%XTMP5, [rel POLY2]
        movdqa  %%XTMP0, %%XTMP5
        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
        pslldq  %%XTMP0, 8                      ; shift-L xmm2 2 DWs
        pxor    %%XTMP2, %%XTMP0                ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        movdqa  %%XTMP3, %%XTMP5
        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
        psrldq  %%XTMP3, 4                      ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        movdqa  %%XTMP4, %%XTMP5
        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
        pslldq  %%XTMP4, 4                      ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        pxor    %%XTMP4, %%XTMP3                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  %%AAD_HASH, %%XTMP1
        pxor    %%AAD_HASH, %%XTMP4             ; the result is in %%AAD_HASH

        sub     %%T2, 128
        je      %%_CALC_AAD_done

        add     %%T1, 128
        jmp     %%_get_AAD_loop128

%%_exit_AAD_loop128:
        cmp     %%T2, 16
        jl      %%_get_small_AAD_block

        ;; calculate hash_key position to start with
        mov     %%T3, %%T2
        and     %%T3, -16       ; 1 to 7 blocks possible here
        neg     %%T3
        add     %%T3, HashKey_1 + 16
        lea     %%T3, [%%GDATA_KEY + %%T3]

        movdqu  %%XTMP0, [%%T1]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        pxor    %%XTMP0, %%AAD_HASH

        movdqu  %%XTMP5, [%%T3]
        movdqa  %%XTMP1, %%XTMP0
        movdqa  %%XTMP2, %%XTMP0
        movdqa  %%XTMP3, %%XTMP0
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP1, %%XTMP5, 0x11  ; %%T1 = a1*b1
        pclmulqdq       %%XTMP2, %%XTMP5, 0x00  ; %%T2 = a0*b0
        pclmulqdq       %%XTMP3, %%XTMP5, 0x01  ; %%T3 = a1*b0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10  ; %%T4 = a0*b1
        pxor    %%XTMP3, %%XTMP4                ; %%T3 = a1*b0 + a0*b1

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16        ; move to next data block
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce

%%_AAD_blocks:
        movdqu  %%XTMP0, [%%T1]
        pshufb  %%XTMP0, [rel SHUF_MASK]

        movdqu  %%XTMP5, [%%T3]
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x11  ; %%T1 = T1 + a1*b1
        pxor    %%XTMP1, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x00  ; %%T2 = T2 + a0*b0
        pxor    %%XTMP2, %%XTMP4

        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x01  ; %%T3 = T3 + a1*b0 + a0*b1
        pxor    %%XTMP3, %%XTMP4
        movdqa  %%XTMP4, %%XTMP0
        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
        pxor    %%XTMP3, %%XTMP4

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce
        jmp     %%_AAD_blocks

%%_AAD_reduce:
        movdqa  %%XTMP4, %%XTMP3
        pslldq  %%XTMP4, 8                      ; shift-L 2 DWs
        psrldq  %%XTMP3, 8                      ; shift-R 2 DWs
        pxor    %%XTMP2, %%XTMP4
        pxor    %%XTMP1, %%XTMP3                ; accumulate the results in %%T1(M):%%T2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        movdqa  %%XTMP5, [rel POLY2]
        movdqa  %%XTMP0, %%XTMP5
        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
        pslldq  %%XTMP0, 8                      ; shift-L xmm2 2 DWs
        pxor    %%XTMP2, %%XTMP0                ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        movdqa  %%XTMP3, %%XTMP5
        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
        psrldq  %%XTMP3, 4                      ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        movdqa  %%XTMP4, %%XTMP5
        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
        pslldq  %%XTMP4, 4                      ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        pxor    %%XTMP4, %%XTMP3                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  %%AAD_HASH, %%XTMP1
        pxor    %%AAD_HASH, %%XTMP4             ; the result is in %%AAD_HASH

        or      %%T2, %%T2
        je      %%_CALC_AAD_done

%%_get_small_AAD_block:
        movdqu  %%XTMP0, [%%GDATA_KEY + HashKey]
        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
        ;byte-reflect the AAD data
        pshufb  %%XTMP1, [rel SHUF_MASK]
        pxor    %%AAD_HASH, %%XTMP1
        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH

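; AAD is consumed in three tiers above: (1) 128-byte strides folded with
; HashKey_8..HashKey_1 and reduced once, (2) a tail of 1 to 7 whole 16-byte
; blocks started at the matching key power (e.g. 3 whole blocks left ->
; start at HashKey_3) so the last whole block is multiplied by HashKey_1,
; and (3) a 1..15 byte remainder, zero-padded by READ_SMALL_DATA_INPUT and
; folded in with one GHASH_MUL. Illustrative sizing: aadLen = 180 =
; 128 + 48 + 4 takes one stride, a 3-block tail and a 4-byte remainder.
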
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
; and whether encoding or decoding (ENC_DEC).
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK    8
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%DATA_OFFSET           %6
%define %%AAD_HASH              %7
%define %%ENC_DEC               %8
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        je      %%_partial_block_done           ;Leave Macro if no partial blocks

        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over-reading
        jl      %%_fewer_than_16_bytes
        XLDR    xmm1, [%%PLAIN_CYPH_IN]         ;If more than 16 bytes of data, just fill the xmm register
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
        mov     r13, [%%GDATA_CTX + PBlockLen]

%%_data_read:                                   ;Finished reading in data


        movdqu  xmm9, [%%GDATA_CTX + PBlockEncKey]      ;xmm9 = ctx_data.partial_block_enc_key
        movdqu  xmm13, [%%GDATA_KEY + HashKey]

        lea     r12, [SHIFT_MASK]

        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
        movdqu  xmm2, [r12]                     ; get the appropriate shuffle mask
        pshufb  xmm9, xmm2                      ;shift right r13 bytes

%ifidn  %%ENC_DEC, DEC
        movdqa  xmm3, xmm1
        pxor    xmm9, xmm1                      ; Cyphertext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
        sub     r12, r15
%%_no_extra_mask_1:

        movdqu  xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        pand    xmm3, xmm1
        pshufb  xmm3, [SHUF_MASK]
        pshufb  xmm3, xmm2
        pxor    %%AAD_HASH, xmm3


        cmp     r15,0
        jl      %%_partial_incomplete_1

        GHASH_MUL       %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6      ;GHASH computation for the last <16 Byte block
        xor     rax,rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_dec_done
%%_partial_incomplete_1:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_dec_done:
        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH

%else
        pxor    xmm9, xmm1                      ; Plaintext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
        sub     r12, r15
%%_no_extra_mask_2:

        movdqu  xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9

        pshufb  xmm9, [SHUF_MASK]
        pshufb  xmm9, xmm2
        pxor    %%AAD_HASH, xmm9

        cmp     r15,0
        jl      %%_partial_incomplete_2

        GHASH_MUL       %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6      ;GHASH computation for the last <16 Byte block
        xor     rax,rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_encode_done
%%_partial_incomplete_2:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_encode_done:
        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH

        pshufb  xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
        pshufb  xmm9, xmm2
%endif


        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; output encrypted Bytes
        cmp     r15,0
        jl      %%_partial_fill
        mov     r12, r13
        mov     r13, 16
        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
        jmp     %%_count_set
%%_partial_fill:
        mov     r13, %%PLAIN_CYPH_LEN
%%_count_set:
        movq    rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        psrldq  xmm9, 8
        movq    rax, xmm9
        sub     r13, 8
%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%%_partial_block_done:
%endmacro ; PARTIAL_BLOCK

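; Illustrative trace: if a previous update call left PBlockLen = 13, the
; saved E(K, Yn) in PBlockEncKey still holds 3 unused keystream bytes. For a
; new call with PLAIN_CYPH_LEN >= 3, r15 = 13 + PLAIN_CYPH_LEN - 16 is
; non-negative, so the block is completed, GHASHed as a full block and
; PBlockLen is reset to 0; otherwise the new bytes are merged in, PBlockLen
; grows by PLAIN_CYPH_LEN, and the GHASH multiply is deferred until the
; block fills or the final tag is computed.
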

; if a = number of total plaintext bytes
; b = floor(a/16)
; %%num_initial_blocks = b mod 8;
; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified
; Updated AAD_HASH is returned in %%T3

%macro INITIAL_BLOCKS 24
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%LENGTH                %5
%define %%DATA_OFFSET           %6
%define %%num_initial_blocks    %7      ; can be 0, 1, 2, 3, 4, 5, 6 or 7
%define %%T1                    %8
%define %%HASH_KEY              %9
%define %%T3                    %10
%define %%T4                    %11
%define %%T5                    %12
%define %%CTR                   %13
%define %%XMM1                  %14
%define %%XMM2                  %15
%define %%XMM3                  %16
%define %%XMM4                  %17
%define %%XMM5                  %18
%define %%XMM6                  %19
%define %%XMM7                  %20
%define %%XMM8                  %21
%define %%T6                    %22
%define %%T_key                 %23
%define %%ENC_DEC               %24

%assign i (8-%%num_initial_blocks)
        movdqu  reg(i), %%XMM8          ; move AAD_HASH to temp reg

        ; start AES for %%num_initial_blocks blocks
        movdqu  %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0


%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  reg(i), %%CTR
                pshufb  reg(i), [SHUF_MASK]     ; perform a 16Byte swap
%assign i (i+1)
%endrep

                movdqu  %%T_key, [%%GDATA_KEY+16*0]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                pxor    reg(i),%%T_key
%assign i (i+1)
%endrep

%assign j 1
%rep NROUNDS                            ; encrypt N blocks with NROUNDS key rounds (9/11/13 for 128/192/256-bit keys)
                movdqu  %%T_key, [%%GDATA_KEY+16*j]
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                aesenc  reg(i),%%T_key
%assign i (i+1)
%endrep

%assign j (j+1)
%endrep


                movdqu  %%T_key, [%%GDATA_KEY+16*j]     ; encrypt with the last key round (round 10/12/14 for 128/192/256-bit keys)
%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                aesenclast      reg(i),%%T_key
%assign i (i+1)
%endrep

%assign i (9-%%num_initial_blocks)
%rep %%num_initial_blocks
                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
                pxor    reg(i), %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)      ; write back ciphertext for %%num_initial_blocks blocks
                add     %%DATA_OFFSET, 16
                %ifidn  %%ENC_DEC, DEC
                movdqa  reg(i), %%T1
                %endif
                pshufb  reg(i), [SHUF_MASK]     ; prepare ciphertext for GHASH computations
%assign i (i+1)
%endrep


%assign i (8-%%num_initial_blocks)
%assign j (9-%%num_initial_blocks)

%rep %%num_initial_blocks
        pxor    reg(j), reg(i)
        GHASH_MUL       reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6       ; apply GHASH on %%num_initial_blocks blocks
%assign i (i+1)
%assign j (j+1)
%endrep
        ; %%XMM8 has the current Hash Value
        movdqa  %%T3, %%XMM8

        cmp     %%LENGTH, 128
        jl      %%_initial_blocks_done

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and store cipher/plain text.
; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
; - combine current GHASH value into block 0 (XMM1)

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM1, %%CTR
                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM2, %%CTR
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM3, %%CTR
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM4, %%CTR
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM5, %%CTR
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM6, %%CTR
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM7, %%CTR
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap

                paddd   %%CTR, [ONE]            ; INCR Y0
                movdqa  %%XMM8, %%CTR
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

                movdqu  %%T_key, [%%GDATA_KEY+16*0]
                pxor    %%XMM1, %%T_key
                pxor    %%XMM2, %%T_key
                pxor    %%XMM3, %%T_key
                pxor    %%XMM4, %%T_key
                pxor    %%XMM5, %%T_key
                pxor    %%XMM6, %%T_key
                pxor    %%XMM7, %%T_key
                pxor    %%XMM8, %%T_key


%assign i 1
%rep NROUNDS                            ; do the early NROUNDS rounds (9/11/13 for 128/192/256-bit keys)
                movdqu  %%T_key, [%%GDATA_KEY+16*i]
                aesenc  %%XMM1, %%T_key
                aesenc  %%XMM2, %%T_key
                aesenc  %%XMM3, %%T_key
                aesenc  %%XMM4, %%T_key
                aesenc  %%XMM5, %%T_key
                aesenc  %%XMM6, %%T_key
                aesenc  %%XMM7, %%T_key
                aesenc  %%XMM8, %%T_key
%assign i (i+1)
%endrep


                movdqu  %%T_key, [%%GDATA_KEY+16*i]     ; do final key round
                aesenclast      %%XMM1, %%T_key
                aesenclast      %%XMM2, %%T_key
                aesenclast      %%XMM3, %%T_key
                aesenclast      %%XMM4, %%T_key
                aesenclast      %%XMM5, %%T_key
                aesenclast      %%XMM6, %%T_key
                aesenclast      %%XMM7, %%T_key
                aesenclast      %%XMM8, %%T_key

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
                pxor    %%XMM1, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM1, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
                pxor    %%XMM2, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM2, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
                pxor    %%XMM3, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM3, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
                pxor    %%XMM4, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM4, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
                pxor    %%XMM5, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM5, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
                pxor    %%XMM6, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM6, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
                pxor    %%XMM7, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM7, %%T1
                %endif

                XLDR    %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
                pxor    %%XMM8, %%T1
                XSTR    [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
                %ifidn  %%ENC_DEC, DEC
                movdqa  %%XMM8, %%T1
                %endif

                add     %%DATA_OFFSET, 128

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pxor    %%XMM1, %%T3            ; combine GHASHed value with the corresponding ciphertext
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_initial_blocks_done:


%endmacro

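; INITIAL_BLOCKS exists so that the caller's main loop always sees a
; multiple of 8 blocks: the (whole blocks) mod 8 leading blocks are handled
; serially here (each with its own GHASH_MUL) and, when at least 128 bytes
; remain, the first batch of 8 counter blocks is primed for
; GHASH_8_ENCRYPT_8_PARALLEL. Illustrative sizing: 200 input bytes = 12
; whole blocks; 12 mod 8 = 4 are processed here, 8 feed one parallel pass
; and the trailing 8 bytes take the partial-block path.
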


; encrypt 8 blocks at a time
; ghash the 8 previously encrypted ciphertext blocks
; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
; %%DATA_OFFSET is the data offset value
%macro  GHASH_8_ENCRYPT_8_PARALLEL 22
%define %%GDATA                 %1
%define %%CYPH_PLAIN_OUT        %2
%define %%PLAIN_CYPH_IN         %3
%define %%DATA_OFFSET           %4
%define %%T1    %5
%define %%T2    %6
%define %%T3    %7
%define %%T4    %8
%define %%T5    %9
%define %%T6    %10
%define %%CTR   %11
%define %%XMM1  %12
%define %%XMM2  %13
%define %%XMM3  %14
%define %%XMM4  %15
%define %%XMM5  %16
%define %%XMM6  %17
%define %%XMM7  %18
%define %%XMM8  %19
%define %%T7    %20
%define %%loop_idx      %21
%define %%ENC_DEC       %22

        movdqa  %%T7, %%XMM1
        movdqu  [rsp + TMP2], %%XMM2
        movdqu  [rsp + TMP3], %%XMM3
        movdqu  [rsp + TMP4], %%XMM4
        movdqu  [rsp + TMP5], %%XMM5
        movdqu  [rsp + TMP6], %%XMM6
        movdqu  [rsp + TMP7], %%XMM7
        movdqu  [rsp + TMP8], %%XMM8

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method

        movdqa  %%T4, %%T7
        pshufd  %%T6, %%T7, 01001110b
        pxor    %%T6, %%T7
        %ifidn %%loop_idx, in_order
                paddd   %%CTR, [ONE]            ; INCR CNT
        %else
                paddd   %%CTR, [ONEf]           ; INCR CNT
        %endif
        movdqu  %%T5, [%%GDATA + HashKey_8]
        pclmulqdq       %%T4, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T7, %%T5, 0x00        ; %%T7 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_8_k]
        pclmulqdq       %%T6, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        movdqa  %%XMM1, %%CTR

        %ifidn %%loop_idx, in_order
                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM2, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM3, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM4, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM5, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM6, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM7, %%CTR

                paddd   %%CTR, [ONE]            ; INCR CNT
                movdqa  %%XMM8, %%CTR

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
        %else
                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM2, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM3, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM4, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM5, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM6, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM7, %%CTR

                paddd   %%CTR, [ONEf]           ; INCR CNT
                movdqa  %%XMM8, %%CTR
        %endif
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        movdqu  %%T1, [%%GDATA + 16*0]
        pxor    %%XMM1, %%T1
        pxor    %%XMM2, %%T1
        pxor    %%XMM3, %%T1
        pxor    %%XMM4, %%T1
        pxor    %%XMM5, %%T1
        pxor    %%XMM6, %%T1
        pxor    %%XMM7, %%T1
        pxor    %%XMM8, %%T1

        ;; %%XMM2 (saved in TMP2) and %%T5 hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method
        movdqu  %%T1, [rsp + TMP2]
        movdqa  %%T3, %%T1

        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_7]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_7_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

        movdqu  %%T1, [%%GDATA + 16*1]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1


        movdqu  %%T1, [%%GDATA + 16*2]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; Karatsuba Method
        movdqu  %%T1, [rsp + TMP3]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_6]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_6_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

        movdqu  %%T1, [%%GDATA + 16*3]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP4]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_5]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_5_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

        movdqu  %%T1, [%%GDATA + 16*4]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [%%GDATA + 16*5]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP5]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_4]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_4_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2


        movdqu  %%T1, [%%GDATA + 16*6]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1
        movdqu  %%T1, [rsp + TMP6]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_3]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_3_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

        movdqu  %%T1, [%%GDATA + 16*7]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [rsp + TMP7]
        movdqa  %%T3, %%T1
        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey_2]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_2_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T4, %%T1                      ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
        pxor    %%T7, %%T3
        pxor    %%T6, %%T2

        movdqu  %%T1, [%%GDATA + 16*8]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1


        ;; %%XMM8 (saved in TMP8), %%T5 hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba Method
        movdqu  %%T1, [rsp + TMP8]
        movdqa  %%T3, %%T1

        pshufd  %%T2, %%T3, 01001110b
        pxor    %%T2, %%T3
        movdqu  %%T5, [%%GDATA + HashKey]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1
        pclmulqdq       %%T3, %%T5, 0x00        ; %%T3 = a0*b0
        movdqu  %%T5, [%%GDATA + HashKey_k]
        pclmulqdq       %%T2, %%T5, 0x00        ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T7, %%T3
        pxor    %%T4, %%T1

        movdqu  %%T1, [%%GDATA + 16*9]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1


%ifdef GCM128_MODE
        movdqu  %%T5, [%%GDATA + 16*10]         ; last round key
%endif
%ifdef GCM192_MODE
        movdqu  %%T1, [%%GDATA + 16*10]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [%%GDATA + 16*11]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T5, [%%GDATA + 16*12]         ; finish last key round
%endif
%ifdef GCM256_MODE
        movdqu  %%T1, [%%GDATA + 16*10]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [%%GDATA + 16*11]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [%%GDATA + 16*12]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T1, [%%GDATA + 16*13]
        aesenc  %%XMM1, %%T1
        aesenc  %%XMM2, %%T1
        aesenc  %%XMM3, %%T1
        aesenc  %%XMM4, %%T1
        aesenc  %%XMM5, %%T1
        aesenc  %%XMM6, %%T1
        aesenc  %%XMM7, %%T1
        aesenc  %%XMM8, %%T1

        movdqu  %%T5, [%%GDATA + 16*14]         ; finish last key round
%endif

%assign i 0
%assign j 1
%rep 8
                XLDR    %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]

%ifidn %%ENC_DEC, DEC
                movdqa  %%T3, %%T1
%endif

                pxor    %%T1, %%T5
                aesenclast      reg(j), %%T1    ; XMM1:XMM8
                XSTR    [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j)   ; Write to the Output buffer

%ifidn %%ENC_DEC, DEC
                movdqa  reg(j), %%T3
%endif
%assign i (i+1)
%assign j (j+1)
%endrep


        pxor    %%T2, %%T6
        pxor    %%T2, %%T4
        pxor    %%T2, %%T7


        movdqa  %%T3, %%T2
        pslldq  %%T3, 8                 ; shift-L %%T3 2 DWs
        psrldq  %%T2, 8                 ; shift-R %%T2 2 DWs
        pxor    %%T7, %%T3
        pxor    %%T4, %%T2              ; accumulate the results in %%T4:%%T7


        ;first phase of the reduction
        movdqa  %%T2, %%T7
        movdqa  %%T3, %%T7
        movdqa  %%T1, %%T7              ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently

        pslld   %%T2, 31                ; packed left shifting << 31
        pslld   %%T3, 30                ; packed left shifting << 30
        pslld   %%T1, 25                ; packed left shifting << 25
        pxor    %%T2, %%T3              ; xor the shifted versions
        pxor    %%T2, %%T1

        movdqa  %%T5, %%T2
        psrldq  %%T5, 4                 ; shift-R %%T5 1 DW

        pslldq  %%T2, 12                ; shift-L %%T2 3 DWs
        pxor    %%T7, %%T2              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap

        ;second phase of the reduction
        movdqa  %%T2,%%T7               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
        movdqa  %%T3,%%T7
        movdqa  %%T1,%%T7

        psrld   %%T2,1                  ; packed right shifting >> 1
        psrld   %%T3,2                  ; packed right shifting >> 2
        psrld   %%T1,7                  ; packed right shifting >> 7
        pxor    %%T2,%%T3               ; xor the shifted versions
        pxor    %%T2,%%T1

        pxor    %%T2, %%T5
        pxor    %%T7, %%T2
        pxor    %%T7, %%T4              ; the result is in %%T7


        pxor    %%XMM1, %%T7

%endmacro

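; Scheduling note: each pass of GHASH_8_ENCRYPT_8_PARALLEL interleaves the
; AES rounds for the next eight counter blocks with the Karatsuba multiplies
; of the previous eight ciphertext blocks (spilled to TMP2..TMP8 above), so
; PCLMULQDQ and AESENC latencies overlap. The GHASH of a batch therefore
; always runs one iteration behind its encryption, and GHASH_LAST_8 below
; drains the final batch.
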

; GHASH the last 8 ciphertext blocks.
%macro  GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16

        ; Karatsuba Method
        movdqa  %%T6, %%XMM1
        pshufd  %%T2, %%XMM1, 01001110b
        pxor    %%T2, %%XMM1
        movdqu  %%T5, [%%GDATA + HashKey_8]
        pclmulqdq       %%T6, %%T5, 0x11        ; %%T6 = a1*b1

        pclmulqdq       %%XMM1, %%T5, 0x00      ; %%XMM1 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_8_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        movdqa  %%T7, %%XMM1
        movdqa  %%XMM1, %%T2                    ; result in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM2
        pshufd  %%T2, %%XMM2, 01001110b
        pxor    %%T2, %%XMM2
        movdqu  %%T5, [%%GDATA + HashKey_7]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM2, %%T5, 0x00      ; %%XMM2 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_7_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM2
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM3
        pshufd  %%T2, %%XMM3, 01001110b
        pxor    %%T2, %%XMM3
        movdqu  %%T5, [%%GDATA + HashKey_6]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM3, %%T5, 0x00      ; %%XMM3 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_6_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM3
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM4
        pshufd  %%T2, %%XMM4, 01001110b
        pxor    %%T2, %%XMM4
        movdqu  %%T5, [%%GDATA + HashKey_5]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM4, %%T5, 0x00      ; %%XMM4 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_5_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM4
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM5
        pshufd  %%T2, %%XMM5, 01001110b
        pxor    %%T2, %%XMM5
        movdqu  %%T5, [%%GDATA + HashKey_4]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM5, %%T5, 0x00      ; %%XMM5 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_4_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM5
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM6
        pshufd  %%T2, %%XMM6, 01001110b
        pxor    %%T2, %%XMM6
        movdqu  %%T5, [%%GDATA + HashKey_3]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM6, %%T5, 0x00      ; %%XMM6 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_3_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM6
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method
        movdqa  %%T1, %%XMM7
        pshufd  %%T2, %%XMM7, 01001110b
        pxor    %%T2, %%XMM7
        movdqu  %%T5, [%%GDATA + HashKey_2]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM7, %%T5, 0x00      ; %%XMM7 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_2_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM7
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method
        movdqa  %%T1, %%XMM8
        pshufd  %%T2, %%XMM8, 01001110b
        pxor    %%T2, %%XMM8
        movdqu  %%T5, [%%GDATA + HashKey]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM8, %%T5, 0x00      ; %%XMM8 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM8
        pxor    %%T2, %%XMM1
        pxor    %%T2, %%T6
        pxor    %%T2, %%T7                      ; middle section of the temp results combined as in the Karatsuba algorithm


        movdqa  %%T4, %%T2
        pslldq  %%T4, 8                         ; shift-L %%T4 2 DWs
        psrldq  %%T2, 8                         ; shift-R %%T2 2 DWs
        pxor    %%T7, %%T4
        pxor    %%T6, %%T2                      ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications


        ;first phase of the reduction
        movdqa  %%T2, %%T7
        movdqa  %%T3, %%T7
        movdqa  %%T4, %%T7                      ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                        ; packed left shifting << 31
        pslld   %%T3, 30                        ; packed left shifting << 30
        pslld   %%T4, 25                        ; packed left shifting << 25
        pxor    %%T2, %%T3                      ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T1, %%T2
        psrldq  %%T1, 4                         ; shift-R %%T1 1 DW

        pslldq  %%T2, 12                        ; shift-L %%T2 3 DWs
        pxor    %%T7, %%T2                      ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2,%%T7                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3,%%T7
        movdqa  %%T4,%%T7

        psrld   %%T2,1                          ; packed right shifting >> 1
        psrld   %%T3,2                          ; packed right shifting >> 2
        psrld   %%T4,7                          ; packed right shifting >> 7
        pxor    %%T2,%%T3                       ; xor the shifted versions
        pxor    %%T2,%%T4

        pxor    %%T2, %%T1
        pxor    %%T7, %%T2
        pxor    %%T6, %%T7                      ; the result is in %%T6

%endmacro

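; On exit <%%T6:%%T7> held the accumulated 256-bit product and %%T6 now
; holds the reduced 128-bit hash; the caller is expected to keep it as the
; current AadHash. The %%T2 chain above is the standard Karatsuba
; recombination, middle = (a1+a0)*(b1+b0) + high + low for each block,
; summed across all eight blocks before the one shared reduction.
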
; Encryption of a single block
%macro ENCRYPT_SINGLE_BLOCK 3
%define %%GDATA %1
%define %%ST    %2
%define %%T1    %3
                movdqu  %%T1, [%%GDATA+16*0]
                pxor    %%ST, %%T1
%assign i 1
%rep NROUNDS
                movdqu  %%T1, [%%GDATA+16*i]
                aesenc  %%ST, %%T1
%assign i (i+1)
%endrep
                movdqu  %%T1, [%%GDATA+16*i]
                aesenclast      %%ST, %%T1
%endmacro

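; In GCM this single-block encryption is used for values such as E(K, Y0);
; recall the tag definition (for reference):
;
;       T = E(K, Y0) XOR GHASH(H; AAD, C, len(AAD)||len(C))
;
; with Y0 the initial counter block that GCM_INIT below derives from the IV.
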

;; Start of Stack Setup

%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ; the number of pushes here must match STACK_OFFSET (8 bytes per push)
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                ; align rsp to a 64-byte boundary

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        movdqu  [rsp + LOCAL_STORAGE + 0*16],xmm6
        movdqu  [rsp + LOCAL_STORAGE + 1*16],xmm7
        movdqu  [rsp + LOCAL_STORAGE + 2*16],xmm8
        movdqu  [rsp + LOCAL_STORAGE + 3*16],xmm9
        movdqu  [rsp + LOCAL_STORAGE + 4*16],xmm10
        movdqu  [rsp + LOCAL_STORAGE + 5*16],xmm11
        movdqu  [rsp + LOCAL_STORAGE + 6*16],xmm12
        movdqu  [rsp + LOCAL_STORAGE + 7*16],xmm13
        movdqu  [rsp + LOCAL_STORAGE + 8*16],xmm14
        movdqu  [rsp + LOCAL_STORAGE + 9*16],xmm15
%endif
%endmacro


%macro FUNC_RESTORE 0

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_sse_asm
%endif
%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + LOCAL_STORAGE + 9*16]
        movdqu  xmm14, [rsp + LOCAL_STORAGE + 8*16]
        movdqu  xmm13, [rsp + LOCAL_STORAGE + 7*16]
        movdqu  xmm12, [rsp + LOCAL_STORAGE + 6*16]
        movdqu  xmm11, [rsp + LOCAL_STORAGE + 5*16]
        movdqu  xmm10, [rsp + LOCAL_STORAGE + 4*16]
        movdqu  xmm9, [rsp + LOCAL_STORAGE + 3*16]
        movdqu  xmm8, [rsp + LOCAL_STORAGE + 2*16]
        movdqu  xmm7, [rsp + LOCAL_STORAGE + 1*16]
        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif

        ;; Required for Update/GCM_ENC
        mov     rsp, r14
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

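; Note: xmm6-xmm15 are callee-saved in the Microsoft x64 calling convention,
; hence the save/restore pair above for win64 only; the System V AMD64 ABI
; treats all xmm registers as caller-saved, so the ELF build reserves no
; spill area (XMM_STORAGE = 0).
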

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other GDATA_CTX fields initialized.
; Clobbers rax, r10-r13 and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm0

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        pxor    xmm2, xmm3
        mov     r10, %%A_LEN

        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm2      ; reset ctx_data.partial_block_enc_key
        mov     r10, %%IV
        movdqa  xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        pinsrq  xmm2, [r10], 0
        pinsrd  xmm2, [r10+8], 2
        movdqu  [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        pshufb  xmm2, [SHUF_MASK]

        movdqu  [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro

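; Counter layout produced above (96-bit IV, the only size handled here):
; Y0 = IV || 0x00000001, with ONEf supplying the 0x00000001 pad in the top
; dword before the byte swap. After the pshufb, CurCount holds Y0
; byte-reflected so that 'paddd xmm, [ONE]' increments the 32-bit counter
; word directly.
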
1749
1750 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1751 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
1752 ; struct has been initialized by GCM_INIT.
1753 ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
1754 ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
1755 ; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
1756 ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
1757 ; Clobbers rax, r10-r15, and xmm0-xmm15
1758 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_ENC_DEC 6
%define %%GDATA_KEY %1
%define %%GDATA_CTX %2
%define %%CYPH_PLAIN_OUT %3
%define %%PLAIN_CYPH_IN %4
%define %%PLAIN_CYPH_LEN %5
%define %%ENC_DEC %6
%define %%DATA_OFFSET r11

; Macro flow:
; calculate the number of 16-byte blocks in the message
; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_multiple_of_16_bytes

        xor     %%DATA_OFFSET, %%DATA_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        mov     r12, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + InLen], r12              ; update length of data processed
%else
        add     [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; update length of data processed
%endif
        movdqu  xmm13, [%%GDATA_KEY + HashKey]          ; xmm13 = HashKey
        movdqu  xmm8, [%%GDATA_CTX + AadHash]

        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC

        mov     r13, %%PLAIN_CYPH_LEN                   ; save the number of bytes of plaintext/ciphertext
        sub     r13, %%DATA_OFFSET
        mov     r10, r13                                ; save the amount of data left to process in r10
        and     r13, -16                                ; r13 = r13 - (r13 mod 16)

        mov     r12, r13
        shr     r12, 4
        and     r12, 7
        jz      %%_initial_num_blocks_is_0

        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*7
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*6
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*5
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*4
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*3
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*2
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC

%%_initial_blocks_encrypted:
        cmp     r13, 0
        je      %%_zero_cipher_left

        sub     r13, 128
        je      %%_eight_cipher_left

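        ;; The low byte of the counter is tracked in r15: while adding 8 to it
        ;; cannot carry into the next byte (r15 <= 255-8), the main loop can
        ;; increment the counter with cheap SIMD adds (out_order path);
        ;; otherwise it byte-swaps first (in_order path) so the carry
        ;; propagates through the full 32-bit counter.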
        movd    r15d, xmm9
        and     r15d, 255
        pshufb  xmm9, [SHUF_MASK]

%%_encrypt_by_8_new:
        cmp     r15d, 255-8
        jg      %%_encrypt_by_8

        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]
        jmp     %%_eight_cipher_left

%%_encrypt_by_8:
        pshufb  xmm9, [SHUF_MASK]
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
        pshufb  xmm9, [SHUF_MASK]
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]

%%_eight_cipher_left:
        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8


%%_zero_cipher_left:
        movdqu  [%%GDATA_CTX + AadHash], xmm14
        movdqu  [%%GDATA_CTX + CurCount], xmm9

        mov     r13, r10
        and     r13, 15                                 ; r13 = (%%PLAIN_CYPH_LEN mod 16)

        je      %%_multiple_of_16_bytes

        mov     [%%GDATA_CTX + PBlockLen], r13          ; ctx_data.partial_block_length = r13
        ; handle the last <16-byte block separately

        paddd   xmm9, [ONE]                             ; INCR CNT to get Yn
        movdqu  [%%GDATA_CTX + CurCount], xmm9          ; ctx_data.current_counter = xmm9
        pshufb  xmm9, [SHUF_MASK]
        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2    ; E(K, Yn)
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm9      ; ctx_data.partial_block_enc_key = xmm9

        cmp     %%PLAIN_CYPH_LEN, 16
        jge     %%_large_enough_update

        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13
        jmp     %%_data_read

%%_large_enough_update:
        sub     %%DATA_OFFSET, 16
        add     %%DATA_OFFSET, r13

        movdqu  xmm1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] ; receive the last <16-byte block

        sub     %%DATA_OFFSET, r13
        add     %%DATA_OFFSET, 16

        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13                                ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
        movdqu  xmm2, [r12]                             ; get the appropriate shuffle mask
        pshufb  xmm1, xmm2                              ; shift right 16-r13 bytes
%%_data_read:
%ifidn %%ENC_DEC, DEC
        movdqa  xmm2, xmm1
        pxor    xmm9, xmm1                              ; Plaintext XOR E(K, Yn)
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pand    xmm2, xmm1
        pshufb  xmm2, [SHUF_MASK]
        pxor    xmm14, xmm2
        movdqu  [%%GDATA_CTX + AadHash], xmm14

%else
        pxor    xmm9, xmm1                              ; Plaintext XOR E(K, Yn)
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pshufb  xmm9, [SHUF_MASK]
        pxor    xmm14, xmm9
        movdqu  [%%GDATA_CTX + AadHash], xmm14

        pshufb  xmm9, [SHUF_MASK]                       ; shuffle xmm9 back to output as ciphertext
%endif
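
        ;; The partial-block handling above, as a C sketch (variable names are
        ;; hypothetical; `ks` is E(K, Yn) and `last` holds the final r13 input
        ;; bytes, already shifted down to offset 0):
        ;;
        ;;     uint8_t out[16];
        ;;     for (int i = 0; i < 16; i++)
        ;;         out[i] = last[i] ^ ks[i];      /* CTR keystream XOR       */
        ;;     for (int i = r13; i < 16; i++)
        ;;         out[i] = 0;                    /* mask bytes past the end */
        ;;     /* GHASH absorbs the CIPHERTEXT: `out` when encrypting, the   */
        ;;     /* masked input block when decrypting.                        */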

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; output r13 bytes
        movq    rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        psrldq  xmm9, 8
        movq    rax, xmm9
        sub     r13, 8

%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_multiple_of_16_bytes:

%endmacro
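
;; Shape of GCM_ENC_DEC in C terms (a sketch only; the helper names below are
;; hypothetical and the real block work happens in the macros named in the
;; comments):
;;
;;     finish_previous_partial_block();       /* PARTIAL_BLOCK              */
;;     size_t full = remaining & ~(size_t)15;
;;     do_initial_blocks(full / 16 % 8);      /* peel so the rest is a      */
;;     while (done < full)                    /* multiple of 8 blocks       */
;;         encrypt_8_blocks_and_ghash();      /* GHASH_8_ENCRYPT_8_PARALLEL */
;;     if (remaining & 15)
;;         encrypt_partial_tail();            /* %%_zero_cipher_left path   */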


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_ENC_DEC finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
; whether encrypting or decrypting (ENC_DEC).
; Output: Authentication Tag (AUTH_TAG), AUTH_TAG_LEN bytes long
; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_COMPLETE 5
%define %%GDATA_KEY %1
%define %%GDATA_CTX %2
%define %%AUTH_TAG %3
%define %%AUTH_TAG_LEN %4
%define %%ENC_DEC %5
%define %%PLAIN_CYPH_LEN rax

        mov     r12, [%%GDATA_CTX + PBlockLen]          ; r12 = PBlockLen (partial block length in bytes)
        movdqu  xmm14, [%%GDATA_CTX + AadHash]
        movdqu  xmm13, [%%GDATA_KEY + HashKey]

        cmp     r12, 0

        je      %%_partial_done

        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; GHASH computation for the last <16-byte block
        movdqu  [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

        mov     r12, [%%GDATA_CTX + AadLen]             ; r12 = aadLen (number of bytes)
        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]

        shl     r12, 3                                  ; convert into number of bits
        movd    xmm15, r12d                             ; len(A) in xmm15

        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
        movq    xmm1, %%PLAIN_CYPH_LEN
        pslldq  xmm15, 8                                ; xmm15 = len(A) || 0x0000000000000000
        pxor    xmm15, xmm1                             ; xmm15 = len(A) || len(C)

        pxor    xmm14, xmm15
        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
        pshufb  xmm14, [SHUF_MASK]                      ; perform a 16-byte swap

        movdqu  xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2    ; E(K, Y0)

        pxor    xmm9, xmm14

%%_return_T:
        mov     r10, %%AUTH_TAG                         ; r10 = authTag
        mov     r11, %%AUTH_TAG_LEN                     ; r11 = auth_tag_len

        cmp     r11, 16
        je      %%_T_16

        cmp     r11, 12
        je      %%_T_12

        cmp     r11, 8
        je      %%_T_8

        simd_store_sse r10, xmm9, r11, r12, rax
        jmp     %%_return_T_done
%%_T_8:
        movq    rax, xmm9
        mov     [r10], rax
        jmp     %%_return_T_done
%%_T_12:
        movq    rax, xmm9
        mov     [r10], rax
        psrldq  xmm9, 8
        movd    eax, xmm9
        mov     [r10 + 8], eax
        jmp     %%_return_T_done
%%_T_16:
        movdqu  [r10], xmm9

%%_return_T_done:

%ifdef SAFE_DATA
        ;; Clear sensitive data from context structure
        pxor    xmm0, xmm0
        movdqu  [%%GDATA_CTX + AadHash], xmm0
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm0
%endif

%endmacro ; GCM_COMPLETE
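
;; The tag computed above follows SP 800-38D: S = GHASH_H(A || C || len(A) ||
;; len(C)) with the lengths as 64-bit values, then T = MSB_t(E(K, J0) XOR S).
;; In C terms (a sketch; ghash() and aes_encrypt_block() are hypothetical
;; helpers, not part of this library):
;;
;;     uint8_t s[16], t[16];
;;     ghash(h, aad, aad_len, ciph, ciph_len, s); /* includes length block */
;;     aes_encrypt_block(key, j0, t);             /* E(K, J0)              */
;;     for (int i = 0; i < 16; i++)
;;         t[i] ^= s[i];
;;     memcpy(auth_tag, t, auth_tag_len);         /* truncate to t bytes   */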


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
;        (struct gcm_key_data *key_data);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(precomp,_),function,)
FN_NAME(precomp,_):

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_precomp
%endif

        push    r12
        push    r13
        push    r14
        push    r15

        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                                ; align rsp to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; only xmm6 needs to be maintained
        movdqu  [rsp + LOCAL_STORAGE + 0*16], xmm6
%endif

        pxor    xmm6, xmm6
        ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2           ; xmm6 = HashKey

        pshufb  xmm6, [SHUF_MASK]
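        ;; The block below computes HashKey<<1 mod poly, i.e. doubling in
        ;; GF(2^128), done branchlessly: shift left by one bit and, if a bit
        ;; carried out, XOR in the reduction polynomial. The asm operates on
        ;; GHASH's bit-reflected layout (hence POLY/TWOONE); in the plain
        ;; big-endian representation the same doubling is, as a C sketch on a
        ;; hypothetical 16-byte array h:
        ;;
        ;;     int carry = h[0] >> 7;                /* bit shifted out   */
        ;;     for (int i = 0; i < 15; i++)
        ;;         h[i] = (uint8_t)((h[i] << 1) | (h[i + 1] >> 7));
        ;;     h[15] = (uint8_t)(h[15] << 1);
        ;;     if (carry)
        ;;         h[15] ^= 0x87;                    /* x^7 + x^2 + x + 1 */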
;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;;;;;;;;;;;;;
        movdqa  xmm2, xmm6
        psllq   xmm6, 1
        psrlq   xmm2, 63
        movdqa  xmm1, xmm2
        pslldq  xmm2, 8
        psrldq  xmm1, 8
        por     xmm6, xmm2
        ; reduction
        pshufd  xmm2, xmm1, 00100100b
        pcmpeqd xmm2, [TWOONE]
        pand    xmm2, [POLY]
        pxor    xmm6, xmm2                              ; xmm6 holds the HashKey<<1 mod poly
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqu  [arg1 + HashKey], xmm6                  ; store HashKey<<1 mod poly


        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_sse_asm
%endif
%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif
        mov     rsp, r14

        pop     r15
        pop     r14
        pop     r13
        pop     r12

exit_precomp:

        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(init,_),function,)
FN_NAME(init,_):
        push    r12
        push    r13
%ifidn __OUTPUT_FORMAT__, win64
        push    r14
        push    r15
        mov     r14, rsp
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 1*16
        movdqu  [rsp + 0*16], xmm6
%endif

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_init

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_init

        ;; Check IV != NULL
        cmp     arg3, 0
        jz      exit_init

        ;; Check if aad_len == 0
        cmp     arg5, 0
        jz      skip_aad_check_init

        ;; Check aad != NULL (aad_len != 0)
        cmp     arg4, 0
        jz      exit_init

skip_aad_check_init:
%endif
        GCM_INIT arg1, arg2, arg3, arg4, arg5

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_sse_asm
%endif
exit_init:

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6, [rsp + 0*16]
        mov     rsp, r14
        pop     r15
        pop     r14
%endif
        pop     r13
        pop     r12
        ret
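
;; Typical streaming use of the functions above and below (a sketch; buffer
;; names and lengths are illustrative only):
;;
;;     struct gcm_key_data key;
;;     struct gcm_context_data ctx;
;;     aes_gcm_precomp_128_sse(&key);     /* after expanding the AES keys */
;;     aes_gcm_init_128_sse(&key, &ctx, iv, aad, aad_len);
;;     aes_gcm_enc_128_update_sse(&key, &ctx, out, in, len); /* 1..n times */
;;     aes_gcm_enc_128_finalize_sse(&key, &ctx, tag, tag_len);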


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_update_),function,)
FN_NAME(enc,_update_):

        FUNC_SAVE

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_update_enc

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_update_enc

        ;; Check if plaintext_len == 0
        cmp     arg5, 0
        jz      skip_in_out_check_update_enc

        ;; Check out != NULL (plaintext_len != 0)
        cmp     arg3, 0
        jz      exit_update_enc

        ;; Check in != NULL (plaintext_len != 0)
        cmp     arg4, 0
        jz      exit_update_enc

skip_in_out_check_update_enc:
%endif
        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC

exit_update_enc:
        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_update_),function,)
FN_NAME(dec,_update_):

        FUNC_SAVE

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_update_dec

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_update_dec

        ;; Check if plaintext_len == 0
        cmp     arg5, 0
        jz      skip_in_out_check_update_dec

        ;; Check out != NULL (plaintext_len != 0)
        cmp     arg3, 0
        jz      exit_update_dec

        ;; Check in != NULL (plaintext_len != 0)
        cmp     arg4, 0
        jz      exit_update_dec

skip_in_out_check_update_dec:
%endif

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC

exit_update_dec:
        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_finalize_),function,)
FN_NAME(enc,_finalize_):

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_enc_fin

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_enc_fin

        ;; Check auth_tag != NULL
        cmp     arg3, 0
        jz      exit_enc_fin

        ;; Check auth_tag_len == 0 or > 16
        cmp     arg4, 0
        jz      exit_enc_fin

        cmp     arg4, 16
        ja      exit_enc_fin
%endif
        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 5*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm11
        movdqu  [rsp + 3*16], xmm14
        movdqu  [rsp + 4*16], xmm15
%endif

        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_sse_asm
%endif
%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 4*16]
        movdqu  xmm14, [rsp + 3*16]
        movdqu  xmm11, [rsp + 2*16]
        movdqu  xmm9,  [rsp + 1*16]
        movdqu  xmm6,  [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12

exit_enc_fin:
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_finalize_),function,)
FN_NAME(dec,_finalize_):

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_dec_fin

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_dec_fin

        ;; Check auth_tag != NULL
        cmp     arg3, 0
        jz      exit_dec_fin

        ;; Check auth_tag_len == 0 or > 16
        cmp     arg4, 0
        jz      exit_dec_fin

        cmp     arg4, 16
        ja      exit_dec_fin
%endif

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 5*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm11
        movdqu  [rsp + 3*16], xmm14
        movdqu  [rsp + 4*16], xmm15
%endif
        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC

%ifdef SAFE_DATA
        clear_scratch_gps_asm
        clear_scratch_xmms_sse_asm
%endif
%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 4*16]
        movdqu  xmm14, [rsp + 3*16]
        movdqu  xmm11, [rsp + 2*16]
        movdqu  xmm9,  [rsp + 1*16]
        movdqu  xmm6,  [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12

exit_dec_fin:
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_),function,)
FN_NAME(enc,_):

        FUNC_SAVE

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_enc

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_enc

        ;; Check IV != NULL
        cmp     arg6, 0
        jz      exit_enc

        ;; Check auth_tag != NULL
        cmp     arg9, 0
        jz      exit_enc

        ;; Check auth_tag_len == 0 or > 16
        cmp     arg10, 0
        jz      exit_enc

        cmp     arg10, 16
        ja      exit_enc

        ;; Check if plaintext_len == 0
        cmp     arg5, 0
        jz      skip_in_out_check_enc

        ;; Check out != NULL (plaintext_len != 0)
        cmp     arg3, 0
        jz      exit_enc

        ;; Check in != NULL (plaintext_len != 0)
        cmp     arg4, 0
        jz      exit_enc

skip_in_out_check_enc:
        ;; Check if aad_len == 0
        cmp     arg8, 0
        jz      skip_aad_check_enc

        ;; Check aad != NULL (aad_len != 0)
        cmp     arg7, 0
        jz      exit_enc

skip_aad_check_enc:
%endif
        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC

        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC

exit_enc:
        FUNC_RESTORE

        ret

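;; Single-shot form of the call above (a sketch; buffers and lengths are
;; illustrative only); one call does init + update + finalize:
;;
;;     aes_gcm_enc_128_sse(&key, &ctx, out, in, len,
;;                         iv, aad, aad_len, tag, 16);
;;
;; The dec variant below likewise outputs the computed tag rather than
;; verifying it; the caller is expected to compare it against the received
;; tag (in constant time).
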
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_),function,)
FN_NAME(dec,_):

        FUNC_SAVE

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_dec

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_dec

        ;; Check IV != NULL
        cmp     arg6, 0
        jz      exit_dec

        ;; Check auth_tag != NULL
        cmp     arg9, 0
        jz      exit_dec

        ;; Check auth_tag_len == 0 or > 16
        cmp     arg10, 0
        jz      exit_dec

        cmp     arg10, 16
        ja      exit_dec

        ;; Check if plaintext_len == 0
        cmp     arg5, 0
        jz      skip_in_out_check_dec

        ;; Check out != NULL (plaintext_len != 0)
        cmp     arg3, 0
        jz      exit_dec

        ;; Check in != NULL (plaintext_len != 0)
        cmp     arg4, 0
        jz      exit_dec

skip_in_out_check_dec:
        ;; Check if aad_len == 0
        cmp     arg8, 0
        jz      skip_aad_check_dec

        ;; Check aad != NULL (aad_len != 0)
        cmp     arg7, 0
        jz      exit_dec

skip_aad_check_dec:
%endif

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC

        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC

exit_dec:
        FUNC_RESTORE

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif