1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 ;
31 ; Authors:
32 ; Erdinc Ozturk
33 ; Vinodh Gopal
34 ; James Guilford
35 ;
36 ;
37 ; References:
38 ;       This code was derived and highly optimized from the code described in the paper:
39 ;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
40 ;
41 ;       For the shift-based reductions used in this code, we used the method described in the paper:
42 ;               Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
43 ;
44 ;
45 ;
46 ;
47 ; Assumptions:
48 ;
49 ;
50 ;
51 ; iv:
52 ; 0 1 2 3
53 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
54 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
55 ; | Salt (From the SA) |
56 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
57 ; | Initialization Vector |
58 ; | (This is the sequence number from IPSec header) |
59 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
60 ; | 0x1 |
61 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ;
63 ;
64 ;
65 ; AAD:
66 ;       AAD will be zero-padded to the next 16-byte multiple
67 ; for example, assume AAD is a u32 vector
68 ;
69 ; if AAD is 8 bytes:
70 ; AAD[3] = {A0, A1};
71 ; padded AAD in xmm register = {A1 A0 0 0}
72 ;
73 ; 0 1 2 3
74 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
75 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
76 ; | SPI (A1) |
77 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
78 ; | 32-bit Sequence Number (A0) |
79 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
80 ; | 0x0 |
81 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82 ;
83 ; AAD Format with 32-bit Sequence Number
84 ;
85 ; if AAD is 12 bytes:
86 ; AAD[3] = {A0, A1, A2};
87 ; padded AAD in xmm register = {A2 A1 A0 0}
88 ;
89 ; 0 1 2 3
90 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
91 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92 ; | SPI (A2) |
93 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
94 ; | 64-bit Extended Sequence Number {A1,A0} |
95 ; | |
96 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
97 ; | 0x0 |
98 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ;
100 ; AAD Format with 64-bit Extended Sequence Number
101 ;
102 ;
103 ; aadLen:
104 ;       Must be a multiple of 4 bytes, as per the definition in the spec.
105 ;       The code additionally supports aadLen of any length.
106 ;
107 ; TLen:
108 ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
109 ;
110 ; poly = x^128 + x^127 + x^126 + x^121 + 1
111 ; Throughout the code, one-tab and two-tab indentation is used: one tab for the GHASH part, two tabs for the AES part.
112 ;
113
114 %include "os.asm"
115 %include "reg_sizes.asm"
116 %include "gcm_defines.asm"
117 %include "memcpy.asm"
118
119 %ifndef GCM128_MODE
120 %ifndef GCM192_MODE
121 %ifndef GCM256_MODE
122 %error "No GCM mode selected for gcm_avx_gen2.asm!"
123 %endif
124 %endif
125 %endif
126
127 %ifdef GCM128_MODE
128 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2
129 %define NROUNDS 9
130 %endif
131
132 %ifdef GCM192_MODE
133 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2
134 %define NROUNDS 11
135 %endif
136
137 %ifdef GCM256_MODE
138 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2
139 %define NROUNDS 13
140 %endif
141
142 default rel
143 ; FUNC_SAVE pushes 4 registers onto the stack; STACK_OFFSET must match so that stack arguments can still be located
144 %define STACK_OFFSET 8*4
145
146 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
147 %define TMP3 16*1 ; Temporary storage for AES State 3
148 %define TMP4 16*2 ; Temporary storage for AES State 4
149 %define TMP5 16*3 ; Temporary storage for AES State 5
150 %define TMP6 16*4 ; Temporary storage for AES State 6
151 %define TMP7 16*5 ; Temporary storage for AES State 7
152 %define TMP8 16*6 ; Temporary storage for AES State 8
153
154 %define LOCAL_STORAGE 16*7
155
156 %ifidn __OUTPUT_FORMAT__, win64
157 %define XMM_STORAGE 16*10
158 %else
159 %define XMM_STORAGE 0
160 %endif
161
162 %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
163
164 section .text
165 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
166 ; Utility Macros
167 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
168
169 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
170 ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
171 ; Input: A and B (128-bits each, bit-reflected)
172 ; Output: C = A*B*x mod poly, (i.e. >>1 )
173 ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
174 ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
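;
; Informal sketch of the Karatsuba multiply performed below (all arithmetic carry-less):
;       GH = a1*x^64 + a0,  HK = b1*x^64 + b0                      (64-bit halves)
;       GH*HK = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
;       a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0            (+ is XOR in GF(2))
; so only three VPCLMULQDQ operations are needed for one 128x128 multiply, followed by
; the shift-based reduction modulo the bit-reflected polynomial.
;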
176 %macro GHASH_MUL 7
177 %define %%GH %1 ; 16 Bytes
178 %define %%HK %2 ; 16 Bytes
179 %define %%T1 %3
180 %define %%T2 %4
181 %define %%T3 %5
182 %define %%T4 %6
183 %define %%T5 %7
184 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
185 ;; Karatsuba
186 vpshufd %%T2, %%GH, 01001110b
187 vpshufd %%T3, %%HK, 01001110b
188 vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
189 vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
190
191 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
192 vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
193 vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
194 vpxor %%T2, %%T2, %%GH
195 vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
196
197 vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
198 vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
199 vpxor %%GH, %%GH, %%T3
200 vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
201
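                ; Informal note: <%%T1:%%GH> now holds the 256-bit carry-less product.
                ; The two phases below reduce this 256-bit value to 128 bits modulo
                ; poly = x^128 + x^127 + x^126 + x^121 + 1 using only shifts and XORs
                ; (see the Gueron/Kounavis paper referenced in the file header); the
                ; result lands in %%GH.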
202 ;first phase of the reduction
203                 vpslld  %%T2, %%GH, 31                  ; packed left shifting << 31
204                 vpslld  %%T3, %%GH, 30                  ; packed left shifting << 30
205                 vpslld  %%T4, %%GH, 25                  ; packed left shifting << 25
206
207 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
208 vpxor %%T2, %%T2, %%T4
209
210 vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
211
212 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
213 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
214 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
215
216 ;second phase of the reduction
217
218                 vpsrld  %%T2,%%GH,1                     ; packed right shifting >> 1
219                 vpsrld  %%T3,%%GH,2                     ; packed right shifting >> 2
220                 vpsrld  %%T4,%%GH,7                     ; packed right shifting >> 7
221 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
222 vpxor %%T2, %%T2, %%T4
223
224 vpxor %%T2, %%T2, %%T5
225 vpxor %%GH, %%GH, %%T2
226 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
227
228
229 %endmacro
230
231
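;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PRECOMPUTE: Computes HashKey^2 .. HashKey^8 (each <<1 mod poly) and stores them in
; GDATA together with the Karatsuba "_k" values (XOR of the high and low 64-bit halves)
; for HashKey^1 .. HashKey^8.
; These powers let eight ciphertext blocks be folded into the hash with independent
; multiplies before a single reduction, informally:
;       X_new = (X + C1)*H^8 + C2*H^7 + ... + C8*H          (carry-less, mod poly)
; where X is the running hash and C1..C8 are the eight blocks.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;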
232 %macro PRECOMPUTE 8
233 %define %%GDATA %1
234 %define %%HK %2
235 %define %%T1 %3
236 %define %%T2 %4
237 %define %%T3 %5
238 %define %%T4 %6
239 %define %%T5 %7
240 %define %%T6 %8
241
242 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
243 ; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
244 vmovdqa %%T5, %%HK
245
246 vpshufd %%T1, %%T5, 01001110b
247 vpxor %%T1, %%T5
248 vmovdqu [%%GDATA + HashKey_k], %%T1
249
250 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
251 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
252 vpshufd %%T1, %%T5, 01001110b
253 vpxor %%T1, %%T5
254 vmovdqu [%%GDATA + HashKey_2_k], %%T1
255
256 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
257 vmovdqu [%%GDATA + HashKey_3], %%T5
258 vpshufd %%T1, %%T5, 01001110b
259 vpxor %%T1, %%T5
260 vmovdqu [%%GDATA + HashKey_3_k], %%T1
261
262 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
263 vmovdqu [%%GDATA + HashKey_4], %%T5
264 vpshufd %%T1, %%T5, 01001110b
265 vpxor %%T1, %%T5
266 vmovdqu [%%GDATA + HashKey_4_k], %%T1
267
268 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
269 vmovdqu [%%GDATA + HashKey_5], %%T5
270 vpshufd %%T1, %%T5, 01001110b
271 vpxor %%T1, %%T5
272 vmovdqu [%%GDATA + HashKey_5_k], %%T1
273
274 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
275 vmovdqu [%%GDATA + HashKey_6], %%T5
276 vpshufd %%T1, %%T5, 01001110b
277 vpxor %%T1, %%T5
278 vmovdqu [%%GDATA + HashKey_6_k], %%T1
279
280 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
281 vmovdqu [%%GDATA + HashKey_7], %%T5
282 vpshufd %%T1, %%T5, 01001110b
283 vpxor %%T1, %%T5
284 vmovdqu [%%GDATA + HashKey_7_k], %%T1
285
286 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
287 vmovdqu [%%GDATA + HashKey_8], %%T5
288 vpshufd %%T1, %%T5, 01001110b
289 vpxor %%T1, %%T5
290 vmovdqu [%%GDATA + HashKey_8_k], %%T1
291 %endmacro
292
293
294 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295 ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
296 ; Returns an all-zero register if the data has length 0.
297 ; Input: The input data (INPUT), that data's length (LENGTH).
298 ; Output: The packed xmm register (OUTPUT).
299 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
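; A rough C-style equivalent of the no-overread strategy used below (illustrative only;
; 'in' and 'len' stand for %%INPUT and %%LENGTH, the result register is {hi:lo}):
;       uint64_t lo = 0, hi = 0;  size_t i = len;
;       if (len >= 8) {
;               memcpy(&lo, in, 8);                         /* one 8-byte read           */
;               while (i > 8) hi = (hi << 8) | in[--i];     /* tail, backwards per byte  */
;       } else {
;               while (i > 0) lo = (lo << 8) | in[--i];
;       }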
300 %macro READ_SMALL_DATA_INPUT 6
301 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register
302 %define %%INPUT %2
303 %define %%LENGTH %3
304 %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
305 %define %%COUNTER %5
306 %define %%TMP1 %6
307
308 vpxor %%OUTPUT, %%OUTPUT
309 mov %%COUNTER, %%LENGTH
310 mov %%END_READ_LOCATION, %%INPUT
311 add %%END_READ_LOCATION, %%LENGTH
312 xor %%TMP1, %%TMP1
313
314
315 cmp %%COUNTER, 8
316 jl %%_byte_loop_2
317         vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
318 je %%_done
319
320 sub %%COUNTER, 8
321
322 %%_byte_loop_1: ;Read in data 1 byte at a time while data is left
323 shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
324 dec %%END_READ_LOCATION
325 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
326 dec %%COUNTER
327 jg %%_byte_loop_1
328 vpinsrq %%OUTPUT, %%TMP1, 1
329 jmp %%_done
330
331 %%_byte_loop_2: ;Read in data 1 byte at a time while data is left
332 cmp %%COUNTER, 0
333 je %%_done
334 shl %%TMP1, 8 ;This loop handles when no bytes were already read in
335 dec %%END_READ_LOCATION
336 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
337 dec %%COUNTER
338 jg %%_byte_loop_2
339 vpinsrq %%OUTPUT, %%TMP1, 0
340 %%_done:
341
342 %endmacro ; READ_SMALL_DATA_INPUT
343
344
345 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
346 ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
347 ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
348 ; Output: The hash of the data (AAD_HASH).
349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
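; Informal outline: the AAD is consumed 128 bytes (8 blocks) at a time while at least
; 128 bytes remain, then one 16-byte block at a time, then any sub-16-byte tail via
; READ_SMALL_DATA_INPUT.  Each multi-block pass folds the data with the precomputed key
; powers before a single reduction, e.g. for a full 8-block pass:
;       hash = (hash + A1)*H^8 + A2*H^7 + ... + A8*H        (carry-less, mod poly)
; The per-pass reduction here uses POLY2 and two VPCLMULQDQ folds instead of shifts.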
350 %macro CALC_AAD_HASH 15
351 %define %%A_IN %1
352 %define %%A_LEN %2
353 %define %%AAD_HASH %3
354 %define %%GDATA_KEY %4
355 %define %%XTMP0 %5      ; xmm temp reg 0
356 %define %%XTMP1 %6      ; xmm temp reg 1
357 %define %%XTMP2 %7
358 %define %%XTMP3 %8
359 %define %%XTMP4 %9
360 %define %%XTMP5 %10 ; xmm temp reg 5
361 %define %%T1 %11 ; temp reg 1
362 %define %%T2 %12
363 %define %%T3 %13
364 %define %%T4 %14
365 %define %%T5 %15 ; temp reg 5
366
367
368 mov %%T1, %%A_IN ; T1 = AAD
369 mov %%T2, %%A_LEN ; T2 = aadLen
370 vpxor %%AAD_HASH, %%AAD_HASH
371
372 %%_get_AAD_loop128:
373 cmp %%T2, 128
374 jl %%_exit_AAD_loop128
375
376 vmovdqu %%XTMP0, [%%T1 + 16*0]
377 vpshufb %%XTMP0, [rel SHUF_MASK]
378
379 vpxor %%XTMP0, %%AAD_HASH
380
381 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
382 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
383 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
384 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
385 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
386 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
387
388 %assign i 1
389 %assign j 7
390 %rep 7
391 vmovdqu %%XTMP0, [%%T1 + 16*i]
392 vpshufb %%XTMP0, [rel SHUF_MASK]
393
394 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
395 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
396 vpxor %%XTMP1, %%XTMP1, %%XTMP4
397
398 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
399 vpxor %%XTMP2, %%XTMP2, %%XTMP4
400
401 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
402 vpxor %%XTMP3, %%XTMP3, %%XTMP4
403 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
404 vpxor %%XTMP3, %%XTMP3, %%XTMP4
405 %assign i (i + 1)
406 %assign j (j - 1)
407 %endrep
408
409 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
410 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
411 vpxor %%XTMP2, %%XTMP2, %%XTMP4
412 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
413
414 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
415 ;first phase of the reduction
416 vmovdqa %%XTMP5, [rel POLY2]
417 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
418 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
419 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
420
421 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
422 ;second phase of the reduction
423 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
424 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
425
426 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
427 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
428
429 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
430 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
431 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
432
433 sub %%T2, 128
434 je %%_CALC_AAD_done
435
436 add %%T1, 128
437 jmp %%_get_AAD_loop128
438
439 %%_exit_AAD_loop128:
440 cmp %%T2, 16
441 jl %%_get_small_AAD_block
442
443 ;; calculate hash_key position to start with
444 mov %%T3, %%T2
445 and %%T3, -16 ; 1 to 7 blocks possible here
446 neg %%T3
447 add %%T3, HashKey_1 + 16
448 lea %%T3, [%%GDATA_KEY + %%T3]
449
450 vmovdqu %%XTMP0, [%%T1]
451 vpshufb %%XTMP0, [rel SHUF_MASK]
452
453 vpxor %%XTMP0, %%AAD_HASH
454
455 vmovdqu %%XTMP5, [%%T3]
456 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
457 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
458 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
459 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
460 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
461
462 add %%T3, 16 ; move to next hashkey
463 add %%T1, 16 ; move to next data block
464 sub %%T2, 16
465 cmp %%T2, 16
466 jl %%_AAD_reduce
467
468 %%_AAD_blocks:
469 vmovdqu %%XTMP0, [%%T1]
470 vpshufb %%XTMP0, [rel SHUF_MASK]
471
472 vmovdqu %%XTMP5, [%%T3]
473 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
474 vpxor %%XTMP1, %%XTMP1, %%XTMP4
475
476 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
477 vpxor %%XTMP2, %%XTMP2, %%XTMP4
478
479 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
480 vpxor %%XTMP3, %%XTMP3, %%XTMP4
481 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
482 vpxor %%XTMP3, %%XTMP3, %%XTMP4
483
484 add %%T3, 16 ; move to next hashkey
485 add %%T1, 16
486 sub %%T2, 16
487 cmp %%T2, 16
488 jl %%_AAD_reduce
489 jmp %%_AAD_blocks
490
491 %%_AAD_reduce:
492 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
493 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
494 vpxor %%XTMP2, %%XTMP2, %%XTMP4
495 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
496
497 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
498 ;first phase of the reduction
499 vmovdqa %%XTMP5, [rel POLY2]
500 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
501 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
502 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
503
504 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
505 ;second phase of the reduction
506 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
507 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
508
509 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
510 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
511
512 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
513 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
514 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
515
516 or %%T2, %%T2
517 je %%_CALC_AAD_done
518
519 %%_get_small_AAD_block:
520 vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
521 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
522 ;byte-reflect the AAD data
523 vpshufb %%XTMP1, [rel SHUF_MASK]
524 vpxor %%AAD_HASH, %%XTMP1
525 GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
526
527 %%_CALC_AAD_done:
528
529 %endmacro ; CALC_AAD_HASH
530
531 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
532 ; PARTIAL_BLOCK: Handles encryption/decryption and the hash of partial blocks carried between update calls.
533 ; Requires the input data to be at least 1 byte long.
534 ; Input:
535 ; GDATA_KEY - struct gcm_key_data *
536 ; GDATA_CTX - struct gcm_context_data *
537 ; PLAIN_CYPH_IN - input text
538 ; PLAIN_CYPH_LEN - input text length
539 ; DATA_OFFSET - the current data offset
540 ; ENC_DEC - whether encoding or decoding
541 ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
542 ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
543 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
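; Rough sketch of the carried state (names other than the ctx fields are illustrative):
;       if (ctx->PBlockLen != 0) {              /* previous update ended mid-block      */
;               n = min(16 - ctx->PBlockLen, plain_cyph_len);
;               /* XOR the next n bytes against the saved keystream block               */
;               /* ctx->PBlockEncKey = E(K, Yn), emit them, and either fold the now-    */
;               /* complete block into ctx->AadHash (PBlockLen reset to 0) or just      */
;               /* advance ctx->PBlockLen by n if the block is still incomplete.        */
;       }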
544 %macro PARTIAL_BLOCK 8
545 %define %%GDATA_KEY %1
546 %define %%GDATA_CTX %2
547 %define %%CYPH_PLAIN_OUT %3
548 %define %%PLAIN_CYPH_IN %4
549 %define %%PLAIN_CYPH_LEN %5
550 %define %%DATA_OFFSET %6
551 %define %%AAD_HASH %7
552 %define %%ENC_DEC %8
553 mov r13, [%%GDATA_CTX + PBlockLen]
554 cmp r13, 0
555 je %%_partial_block_done ;Leave Macro if no partial blocks
556
557         cmp %%PLAIN_CYPH_LEN, 16                ;Read in input data without over-reading
558         jl %%_fewer_than_16_bytes
559         VXLDR xmm1, [%%PLAIN_CYPH_IN]           ;If at least 16 bytes of data, just fill the xmm register
560 jmp %%_data_read
561
562 %%_fewer_than_16_bytes:
563 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
564 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
565
566 %%_data_read: ;Finished reading in data
567
568
569 vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
570 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
571
572 lea r12, [SHIFT_MASK]
573
574 cmp r13, rax
575 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
576 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
577 vpshufb xmm9, xmm2 ;shift right r13 bytes
578
579 %ifidn %%ENC_DEC, DEC
580 vmovdqa xmm3, xmm1
581 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
582
583 mov r15, %%PLAIN_CYPH_LEN
584 add r15, r13
585 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
586         jge %%_no_extra_mask_1                  ;Determine if partial block is not being filled and shift mask accordingly
587 sub r12, r15
588 %%_no_extra_mask_1:
589
590 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
591 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
592
593 vpand xmm3, xmm1
594 vpshufb xmm3, [SHUF_MASK]
595 vpshufb xmm3, xmm2
596 vpxor %%AAD_HASH, xmm3
597
598
599 cmp r15,0
600 jl %%_partial_incomplete_1
601
602 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
603 xor rax,rax
604 mov [%%GDATA_CTX + PBlockLen], rax
605 jmp %%_dec_done
606 %%_partial_incomplete_1:
607 %ifidn __OUTPUT_FORMAT__, win64
608 mov rax, %%PLAIN_CYPH_LEN
609 add [%%GDATA_CTX + PBlockLen], rax
610 %else
611 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
612 %endif
613 %%_dec_done:
614 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
615
616 %else
617 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
618
619 mov r15, %%PLAIN_CYPH_LEN
620 add r15, r13
621 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
622         jge %%_no_extra_mask_2                  ;Determine if partial block is not being filled and shift mask accordingly
623 sub r12, r15
624 %%_no_extra_mask_2:
625
626 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
627 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
628
629 vpshufb xmm9, [SHUF_MASK]
630 vpshufb xmm9, xmm2
631 vpxor %%AAD_HASH, xmm9
632
633 cmp r15,0
634 jl %%_partial_incomplete_2
635
636 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
637 xor rax,rax
638 mov [%%GDATA_CTX + PBlockLen], rax
639 jmp %%_encode_done
640 %%_partial_incomplete_2:
641 %ifidn __OUTPUT_FORMAT__, win64
642 mov rax, %%PLAIN_CYPH_LEN
643 add [%%GDATA_CTX + PBlockLen], rax
644 %else
645 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
646 %endif
647 %%_encode_done:
648 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
649
650 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
651 vpshufb xmm9, xmm2
652 %endif
653
654
655 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
656 ; output encrypted Bytes
657 cmp r15,0
658 jl %%_partial_fill
659 mov r12, r13
660 mov r13, 16
661 sub r13, r12 ; Set r13 to be the number of bytes to write out
662 jmp %%_count_set
663 %%_partial_fill:
664 mov r13, %%PLAIN_CYPH_LEN
665 %%_count_set:
666 vmovq rax, xmm9
667 cmp r13, 8
668 jle %%_less_than_8_bytes_left
669
670 mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
671 add %%DATA_OFFSET, 8
672 vpsrldq xmm9, xmm9, 8
673 vmovq rax, xmm9
674 sub r13, 8
675 %%_less_than_8_bytes_left:
676 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
677 add %%DATA_OFFSET, 1
678 shr rax, 8
679 sub r13, 1
680 jne %%_less_than_8_bytes_left
681 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
682 %%_partial_block_done:
683 %endmacro ; PARTIAL_BLOCK
684
685
686 ; if a = number of total plaintext bytes
687 ; b = floor(a/16)
688 ; %%num_initial_blocks = b mod 8;
689 ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
690 ; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
691 ; Updated AAD_HASH is returned in %%T3
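; Worked example: for a 200-byte message, b = floor(200/16) = 12, so %%num_initial_blocks
; = 12 mod 8 = 4.  Those 4 blocks are encrypted and hashed here; since at least 128 bytes
; remain, the macro also starts AES on the next 8 counter blocks so their GHASH can be
; computed in parallel later.  The trailing 200 - 192 = 8 bytes are handled as a partial
; block by GCM_ENC_DEC.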
692
693 %macro INITIAL_BLOCKS 24
694 %define %%GDATA_KEY %1
695 %define %%GDATA_CTX %2
696 %define %%CYPH_PLAIN_OUT %3
697 %define %%PLAIN_CYPH_IN %4
698 %define %%LENGTH %5
699 %define %%DATA_OFFSET %6
700 %define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
701 %define %%T1 %8
702 %define %%HASH_KEY %9
703 %define %%T3 %10
704 %define %%T4 %11
705 %define %%T5 %12
706 %define %%CTR %13
707 %define %%XMM1 %14
708 %define %%XMM2 %15
709 %define %%XMM3 %16
710 %define %%XMM4 %17
711 %define %%XMM5 %18
712 %define %%XMM6 %19
713 %define %%XMM7 %20
714 %define %%XMM8 %21
715 %define %%T6 %22
716 %define %%T_key %23
717 %define %%ENC_DEC %24
718
719 %assign i (8-%%num_initial_blocks)
720 vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
721 ; start AES for %%num_initial_blocks blocks
722 vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
723
724
725 %assign i (9-%%num_initial_blocks)
726 %rep %%num_initial_blocks
727 vpaddd %%CTR, [ONE] ; INCR Y0
728 vmovdqa reg(i), %%CTR
729 vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
730 %assign i (i+1)
731 %endrep
732
733 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
734 %assign i (9-%%num_initial_blocks)
735 %rep %%num_initial_blocks
736 vpxor reg(i),%%T_key
737 %assign i (i+1)
738 %endrep
739
740 %assign j 1
741 %rep NROUNDS
742 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
743 %assign i (9-%%num_initial_blocks)
744 %rep %%num_initial_blocks
745 vaesenc reg(i),%%T_key
746 %assign i (i+1)
747 %endrep
748
749 %assign j (j+1)
750 %endrep ; NROUNDS
751
752
753 vmovdqu %%T_key, [%%GDATA_KEY+16*j]
754 %assign i (9-%%num_initial_blocks)
755 %rep %%num_initial_blocks
756 vaesenclast reg(i),%%T_key
757 %assign i (i+1)
758 %endrep
759
760 %assign i (9-%%num_initial_blocks)
761 %rep %%num_initial_blocks
762 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
763 vpxor reg(i), %%T1
764 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
765 add %%DATA_OFFSET, 16
766 %ifidn %%ENC_DEC, DEC
767 vmovdqa reg(i), %%T1
768 %endif
769 vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
770 %assign i (i+1)
771 %endrep
772
773
774 %assign i (8-%%num_initial_blocks)
775 %assign j (9-%%num_initial_blocks)
776
777 %rep %%num_initial_blocks
778 vpxor reg(j), reg(i)
779 GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
780 %assign i (i+1)
781 %assign j (j+1)
782 %endrep
783 ; %%XMM8 has the current Hash Value
784 vmovdqa %%T3, %%XMM8
785
786 cmp %%LENGTH, 128
787 jl %%_initial_blocks_done ; no need for precomputed constants
788
789 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
790 ; Prepare (and start AES on) the next 8 counter blocks so their ciphertext can be GHASHed in parallel later
791 vpaddd %%CTR, [ONE] ; INCR Y0
792 vmovdqa %%XMM1, %%CTR
793 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
794
795 vpaddd %%CTR, [ONE] ; INCR Y0
796 vmovdqa %%XMM2, %%CTR
797 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
798
799 vpaddd %%CTR, [ONE] ; INCR Y0
800 vmovdqa %%XMM3, %%CTR
801 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
802
803 vpaddd %%CTR, [ONE] ; INCR Y0
804 vmovdqa %%XMM4, %%CTR
805 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
806
807 vpaddd %%CTR, [ONE] ; INCR Y0
808 vmovdqa %%XMM5, %%CTR
809 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
810
811 vpaddd %%CTR, [ONE] ; INCR Y0
812 vmovdqa %%XMM6, %%CTR
813 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
814
815 vpaddd %%CTR, [ONE] ; INCR Y0
816 vmovdqa %%XMM7, %%CTR
817 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
818
819 vpaddd %%CTR, [ONE] ; INCR Y0
820 vmovdqa %%XMM8, %%CTR
821 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
822
823 vmovdqu %%T_key, [%%GDATA_KEY+16*0]
824 vpxor %%XMM1, %%T_key
825 vpxor %%XMM2, %%T_key
826 vpxor %%XMM3, %%T_key
827 vpxor %%XMM4, %%T_key
828 vpxor %%XMM5, %%T_key
829 vpxor %%XMM6, %%T_key
830 vpxor %%XMM7, %%T_key
831 vpxor %%XMM8, %%T_key
832
833
834 %assign i 1
835 %rep NROUNDS
836 vmovdqu %%T_key, [%%GDATA_KEY+16*i]
837 vaesenc %%XMM1, %%T_key
838 vaesenc %%XMM2, %%T_key
839 vaesenc %%XMM3, %%T_key
840 vaesenc %%XMM4, %%T_key
841 vaesenc %%XMM5, %%T_key
842 vaesenc %%XMM6, %%T_key
843 vaesenc %%XMM7, %%T_key
844 vaesenc %%XMM8, %%T_key
845 %assign i (i+1)
846 %endrep
847
848
849 vmovdqu %%T_key, [%%GDATA_KEY+16*i]
850 vaesenclast %%XMM1, %%T_key
851 vaesenclast %%XMM2, %%T_key
852 vaesenclast %%XMM3, %%T_key
853 vaesenclast %%XMM4, %%T_key
854 vaesenclast %%XMM5, %%T_key
855 vaesenclast %%XMM6, %%T_key
856 vaesenclast %%XMM7, %%T_key
857 vaesenclast %%XMM8, %%T_key
858
859 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
860 vpxor %%XMM1, %%T1
861 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
862 %ifidn %%ENC_DEC, DEC
863 vmovdqa %%XMM1, %%T1
864 %endif
865
866 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
867 vpxor %%XMM2, %%T1
868 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
869 %ifidn %%ENC_DEC, DEC
870 vmovdqa %%XMM2, %%T1
871 %endif
872
873 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
874 vpxor %%XMM3, %%T1
875 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
876 %ifidn %%ENC_DEC, DEC
877 vmovdqa %%XMM3, %%T1
878 %endif
879
880 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
881 vpxor %%XMM4, %%T1
882 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
883 %ifidn %%ENC_DEC, DEC
884 vmovdqa %%XMM4, %%T1
885 %endif
886
887 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
888 vpxor %%XMM5, %%T1
889 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
890 %ifidn %%ENC_DEC, DEC
891 vmovdqa %%XMM5, %%T1
892 %endif
893
894 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
895 vpxor %%XMM6, %%T1
896 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
897 %ifidn %%ENC_DEC, DEC
898 vmovdqa %%XMM6, %%T1
899 %endif
900
901 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
902 vpxor %%XMM7, %%T1
903 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
904 %ifidn %%ENC_DEC, DEC
905 vmovdqa %%XMM7, %%T1
906 %endif
907
908 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
909 vpxor %%XMM8, %%T1
910 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
911 %ifidn %%ENC_DEC, DEC
912 vmovdqa %%XMM8, %%T1
913 %endif
914
915 add %%DATA_OFFSET, 128
916
917 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
918 vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
919 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
920 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
921 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
922 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
923 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
924 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
925 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
926
927 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
928
929 %%_initial_blocks_done:
930
931
932 %endmacro
933
934
935 ; encrypt 8 blocks at a time
936 ; ghash the 8 previously encrypted ciphertext blocks
937 ; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
938 ; r11 is the data offset value
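; Informal structure: the AES rounds for the current 8 counter blocks are interleaved
; with the Karatsuba GHASH (against HashKey_8 .. HashKey_1) of the 8 ciphertext blocks
; produced by the previous iteration, keeping the AES and carry-less multiply units busy
; at the same time.  %%loop_idx selects how counters are generated: out_order keeps them
; in AES byte order and bumps them with ONEf (valid only while the low counter byte
; cannot wrap), in_order increments the reflected counter with ONE and byte-swaps after.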
939 %macro GHASH_8_ENCRYPT_8_PARALLEL 22
940 %define %%GDATA %1
941 %define %%CYPH_PLAIN_OUT %2
942 %define %%PLAIN_CYPH_IN %3
943 %define %%DATA_OFFSET %4
944 %define %%T1 %5
945 %define %%T2 %6
946 %define %%T3 %7
947 %define %%T4 %8
948 %define %%T5 %9
949 %define %%T6 %10
950 %define %%CTR %11
951 %define %%XMM1 %12
952 %define %%XMM2 %13
953 %define %%XMM3 %14
954 %define %%XMM4 %15
955 %define %%XMM5 %16
956 %define %%XMM6 %17
957 %define %%XMM7 %18
958 %define %%XMM8 %19
959 %define %%T7 %20
960 %define %%loop_idx %21
961 %define %%ENC_DEC %22
962
963 vmovdqa %%T2, %%XMM1
964 vmovdqu [rsp + TMP2], %%XMM2
965 vmovdqu [rsp + TMP3], %%XMM3
966 vmovdqu [rsp + TMP4], %%XMM4
967 vmovdqu [rsp + TMP5], %%XMM5
968 vmovdqu [rsp + TMP6], %%XMM6
969 vmovdqu [rsp + TMP7], %%XMM7
970 vmovdqu [rsp + TMP8], %%XMM8
971
972 %ifidn %%loop_idx, in_order
973 vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
974 vpaddd %%XMM2, %%XMM1, [ONE]
975 vpaddd %%XMM3, %%XMM2, [ONE]
976 vpaddd %%XMM4, %%XMM3, [ONE]
977 vpaddd %%XMM5, %%XMM4, [ONE]
978 vpaddd %%XMM6, %%XMM5, [ONE]
979 vpaddd %%XMM7, %%XMM6, [ONE]
980 vpaddd %%XMM8, %%XMM7, [ONE]
981 vmovdqa %%CTR, %%XMM8
982
983 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
984 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
985 vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
986 vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
987 vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
988 vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
989 vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
990 vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
991 %else
992 vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
993 vpaddd %%XMM2, %%XMM1, [ONEf]
994 vpaddd %%XMM3, %%XMM2, [ONEf]
995 vpaddd %%XMM4, %%XMM3, [ONEf]
996 vpaddd %%XMM5, %%XMM4, [ONEf]
997 vpaddd %%XMM6, %%XMM5, [ONEf]
998 vpaddd %%XMM7, %%XMM6, [ONEf]
999 vpaddd %%XMM8, %%XMM7, [ONEf]
1000 vmovdqa %%CTR, %%XMM8
1001 %endif
1002
1003
1004
1005 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1006
1007 vmovdqu %%T1, [%%GDATA + 16*0]
1008 vpxor %%XMM1, %%T1
1009 vpxor %%XMM2, %%T1
1010 vpxor %%XMM3, %%T1
1011 vpxor %%XMM4, %%T1
1012 vpxor %%XMM5, %%T1
1013 vpxor %%XMM6, %%T1
1014 vpxor %%XMM7, %%T1
1015 vpxor %%XMM8, %%T1
1016
1017 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1018
1019 vmovdqu %%T1, [%%GDATA + 16*1]
1020 vaesenc %%XMM1, %%T1
1021 vaesenc %%XMM2, %%T1
1022 vaesenc %%XMM3, %%T1
1023 vaesenc %%XMM4, %%T1
1024 vaesenc %%XMM5, %%T1
1025 vaesenc %%XMM6, %%T1
1026 vaesenc %%XMM7, %%T1
1027 vaesenc %%XMM8, %%T1
1028
1029
1030 vmovdqu %%T1, [%%GDATA + 16*2]
1031 vaesenc %%XMM1, %%T1
1032 vaesenc %%XMM2, %%T1
1033 vaesenc %%XMM3, %%T1
1034 vaesenc %%XMM4, %%T1
1035 vaesenc %%XMM5, %%T1
1036 vaesenc %%XMM6, %%T1
1037 vaesenc %%XMM7, %%T1
1038 vaesenc %%XMM8, %%T1
1039
1040 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1041
1042 vmovdqu %%T5, [%%GDATA + HashKey_8]
1043 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
1044 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
1045
1046 vpshufd %%T6, %%T2, 01001110b
1047 vpxor %%T6, %%T2
1048
1049 vmovdqu %%T5, [%%GDATA + HashKey_8_k]
1050 vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
1051
1052
1053 vmovdqu %%T1, [%%GDATA + 16*3]
1054 vaesenc %%XMM1, %%T1
1055 vaesenc %%XMM2, %%T1
1056 vaesenc %%XMM3, %%T1
1057 vaesenc %%XMM4, %%T1
1058 vaesenc %%XMM5, %%T1
1059 vaesenc %%XMM6, %%T1
1060 vaesenc %%XMM7, %%T1
1061 vaesenc %%XMM8, %%T1
1062
1063 vmovdqu %%T1, [rsp + TMP2]
1064 vmovdqu %%T5, [%%GDATA + HashKey_7]
1065 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1066 vpxor %%T4, %%T4, %%T3
1067 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1068 vpxor %%T7, %%T7, %%T3
1069
1070 vpshufd %%T3, %%T1, 01001110b
1071 vpxor %%T3, %%T1
1072 vmovdqu %%T5, [%%GDATA + HashKey_7_k]
1073 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1074 vpxor %%T6, %%T6, %%T3
1075
1076 vmovdqu %%T1, [%%GDATA + 16*4]
1077 vaesenc %%XMM1, %%T1
1078 vaesenc %%XMM2, %%T1
1079 vaesenc %%XMM3, %%T1
1080 vaesenc %%XMM4, %%T1
1081 vaesenc %%XMM5, %%T1
1082 vaesenc %%XMM6, %%T1
1083 vaesenc %%XMM7, %%T1
1084 vaesenc %%XMM8, %%T1
1085
1086 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1087 vmovdqu %%T1, [rsp + TMP3]
1088 vmovdqu %%T5, [%%GDATA + HashKey_6]
1089 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1090 vpxor %%T4, %%T4, %%T3
1091 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1092 vpxor %%T7, %%T7, %%T3
1093
1094 vpshufd %%T3, %%T1, 01001110b
1095 vpxor %%T3, %%T1
1096 vmovdqu %%T5, [%%GDATA + HashKey_6_k]
1097 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1098 vpxor %%T6, %%T6, %%T3
1099
1100 vmovdqu %%T1, [%%GDATA + 16*5]
1101 vaesenc %%XMM1, %%T1
1102 vaesenc %%XMM2, %%T1
1103 vaesenc %%XMM3, %%T1
1104 vaesenc %%XMM4, %%T1
1105 vaesenc %%XMM5, %%T1
1106 vaesenc %%XMM6, %%T1
1107 vaesenc %%XMM7, %%T1
1108 vaesenc %%XMM8, %%T1
1109
1110
1111 vmovdqu %%T1, [rsp + TMP4]
1112 vmovdqu %%T5, [%%GDATA + HashKey_5]
1113 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1114 vpxor %%T4, %%T4, %%T3
1115 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1116 vpxor %%T7, %%T7, %%T3
1117
1118 vpshufd %%T3, %%T1, 01001110b
1119 vpxor %%T3, %%T1
1120 vmovdqu %%T5, [%%GDATA + HashKey_5_k]
1121 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1122 vpxor %%T6, %%T6, %%T3
1123
1124 vmovdqu %%T1, [%%GDATA + 16*6]
1125 vaesenc %%XMM1, %%T1
1126 vaesenc %%XMM2, %%T1
1127 vaesenc %%XMM3, %%T1
1128 vaesenc %%XMM4, %%T1
1129 vaesenc %%XMM5, %%T1
1130 vaesenc %%XMM6, %%T1
1131 vaesenc %%XMM7, %%T1
1132 vaesenc %%XMM8, %%T1
1133
1134 vmovdqu %%T1, [rsp + TMP5]
1135 vmovdqu %%T5, [%%GDATA + HashKey_4]
1136 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1137 vpxor %%T4, %%T4, %%T3
1138 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1139 vpxor %%T7, %%T7, %%T3
1140
1141 vpshufd %%T3, %%T1, 01001110b
1142 vpxor %%T3, %%T1
1143 vmovdqu %%T5, [%%GDATA + HashKey_4_k]
1144 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1145 vpxor %%T6, %%T6, %%T3
1146
1147
1148 vmovdqu %%T1, [%%GDATA + 16*7]
1149 vaesenc %%XMM1, %%T1
1150 vaesenc %%XMM2, %%T1
1151 vaesenc %%XMM3, %%T1
1152 vaesenc %%XMM4, %%T1
1153 vaesenc %%XMM5, %%T1
1154 vaesenc %%XMM6, %%T1
1155 vaesenc %%XMM7, %%T1
1156 vaesenc %%XMM8, %%T1
1157
1158 vmovdqu %%T1, [rsp + TMP6]
1159 vmovdqu %%T5, [%%GDATA + HashKey_3]
1160 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1161 vpxor %%T4, %%T4, %%T3
1162 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1163 vpxor %%T7, %%T7, %%T3
1164
1165 vpshufd %%T3, %%T1, 01001110b
1166 vpxor %%T3, %%T1
1167 vmovdqu %%T5, [%%GDATA + HashKey_3_k]
1168 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1169 vpxor %%T6, %%T6, %%T3
1170
1171 vmovdqu %%T1, [%%GDATA + 16*8]
1172 vaesenc %%XMM1, %%T1
1173 vaesenc %%XMM2, %%T1
1174 vaesenc %%XMM3, %%T1
1175 vaesenc %%XMM4, %%T1
1176 vaesenc %%XMM5, %%T1
1177 vaesenc %%XMM6, %%T1
1178 vaesenc %%XMM7, %%T1
1179 vaesenc %%XMM8, %%T1
1180
1181 vmovdqu %%T1, [rsp + TMP7]
1182 vmovdqu %%T5, [%%GDATA + HashKey_2]
1183 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1184 vpxor %%T4, %%T4, %%T3
1185 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1186 vpxor %%T7, %%T7, %%T3
1187
1188 vpshufd %%T3, %%T1, 01001110b
1189 vpxor %%T3, %%T1
1190 vmovdqu %%T5, [%%GDATA + HashKey_2_k]
1191 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1192 vpxor %%T6, %%T6, %%T3
1193 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1194
1195 vmovdqu %%T5, [%%GDATA + 16*9]
1196 vaesenc %%XMM1, %%T5
1197 vaesenc %%XMM2, %%T5
1198 vaesenc %%XMM3, %%T5
1199 vaesenc %%XMM4, %%T5
1200 vaesenc %%XMM5, %%T5
1201 vaesenc %%XMM6, %%T5
1202 vaesenc %%XMM7, %%T5
1203 vaesenc %%XMM8, %%T5
1204
1205 vmovdqu %%T1, [rsp + TMP8]
1206 vmovdqu %%T5, [%%GDATA + HashKey]
1207 vpclmulqdq %%T3, %%T1, %%T5, 0x11
1208 vpxor %%T4, %%T4, %%T3
1209 vpclmulqdq %%T3, %%T1, %%T5, 0x00
1210 vpxor %%T7, %%T7, %%T3
1211
1212 vpshufd %%T3, %%T1, 01001110b
1213 vpxor %%T3, %%T1
1214 vmovdqu %%T5, [%%GDATA + HashKey_k]
1215 vpclmulqdq %%T3, %%T3, %%T5, 0x10
1216 vpxor %%T6, %%T6, %%T3
1217
1218 vpxor %%T6, %%T4
1219 vpxor %%T6, %%T7
1220
1221 %ifdef GCM128_MODE
1222 vmovdqu %%T5, [%%GDATA + 16*10]
1223 %endif
1224 %ifdef GCM192_MODE
1225 vmovdqu %%T5, [%%GDATA + 16*10]
1226 vaesenc %%XMM1, %%T5
1227 vaesenc %%XMM2, %%T5
1228 vaesenc %%XMM3, %%T5
1229 vaesenc %%XMM4, %%T5
1230 vaesenc %%XMM5, %%T5
1231 vaesenc %%XMM6, %%T5
1232 vaesenc %%XMM7, %%T5
1233 vaesenc %%XMM8, %%T5
1234
1235 vmovdqu %%T5, [%%GDATA + 16*11]
1236 vaesenc %%XMM1, %%T5
1237 vaesenc %%XMM2, %%T5
1238 vaesenc %%XMM3, %%T5
1239 vaesenc %%XMM4, %%T5
1240 vaesenc %%XMM5, %%T5
1241 vaesenc %%XMM6, %%T5
1242 vaesenc %%XMM7, %%T5
1243 vaesenc %%XMM8, %%T5
1244
1245 vmovdqu %%T5, [%%GDATA + 16*12]
1246 %endif
1247 %ifdef GCM256_MODE
1248 vmovdqu %%T5, [%%GDATA + 16*10]
1249 vaesenc %%XMM1, %%T5
1250 vaesenc %%XMM2, %%T5
1251 vaesenc %%XMM3, %%T5
1252 vaesenc %%XMM4, %%T5
1253 vaesenc %%XMM5, %%T5
1254 vaesenc %%XMM6, %%T5
1255 vaesenc %%XMM7, %%T5
1256 vaesenc %%XMM8, %%T5
1257
1258 vmovdqu %%T5, [%%GDATA + 16*11]
1259 vaesenc %%XMM1, %%T5
1260 vaesenc %%XMM2, %%T5
1261 vaesenc %%XMM3, %%T5
1262 vaesenc %%XMM4, %%T5
1263 vaesenc %%XMM5, %%T5
1264 vaesenc %%XMM6, %%T5
1265 vaesenc %%XMM7, %%T5
1266 vaesenc %%XMM8, %%T5
1267
1268 vmovdqu %%T5, [%%GDATA + 16*12]
1269 vaesenc %%XMM1, %%T5
1270 vaesenc %%XMM2, %%T5
1271 vaesenc %%XMM3, %%T5
1272 vaesenc %%XMM4, %%T5
1273 vaesenc %%XMM5, %%T5
1274 vaesenc %%XMM6, %%T5
1275 vaesenc %%XMM7, %%T5
1276 vaesenc %%XMM8, %%T5
1277
1278 vmovdqu %%T5, [%%GDATA + 16*13]
1279 vaesenc %%XMM1, %%T5
1280 vaesenc %%XMM2, %%T5
1281 vaesenc %%XMM3, %%T5
1282 vaesenc %%XMM4, %%T5
1283 vaesenc %%XMM5, %%T5
1284 vaesenc %%XMM6, %%T5
1285 vaesenc %%XMM7, %%T5
1286 vaesenc %%XMM8, %%T5
1287
1288 vmovdqu %%T5, [%%GDATA + 16*14]
1289 %endif
1290
1291 %assign i 0
1292 %assign j 1
1293 %rep 8
1294
1295 %ifidn %%ENC_DEC, ENC
1296 %ifdef NT_LD
1297 VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
1298 vpxor %%T2, %%T2, %%T5
1299 %else
1300 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
1301 %endif ; NT_LD
1302 vaesenclast reg(j), reg(j), %%T2
1303 %else
1304 VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
1305 vpxor %%T2, %%T2, %%T5
1306 vaesenclast %%T3, reg(j), %%T2
1307 vpxor reg(j), %%T2, %%T5
1308 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
1309 %endif ; %%ENC_DEC
1310
1311 %assign i (i+1)
1312 %assign j (j+1)
1313 %endrep
1314
1315 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
1316         vpsrldq %%T6, %%T6, 8          ; shift-R %%T6 2 DWs
1317 vpxor %%T7, %%T3
1318 vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
1319
1320
1321 ;first phase of the reduction
1322
1323         vpslld  %%T2, %%T7, 31                 ; packed left shifting << 31
1324         vpslld  %%T3, %%T7, 30                 ; packed left shifting << 30
1325         vpslld  %%T4, %%T7, 25                 ; packed left shifting << 25
1326
1327 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
1328 vpxor %%T2, %%T2, %%T4
1329
1330 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
1331
1332 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
1333 vpxor %%T7, %%T2 ; first phase of the reduction complete
1334 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1335 %ifidn %%ENC_DEC, ENC
1336 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
1337 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
1338 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
1339 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
1340 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
1341 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
1342 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
1343 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
1344 %endif
1345
1346 ;second phase of the reduction
1347
1348         vpsrld  %%T2,%%T7,1                    ; packed right shifting >> 1
1349         vpsrld  %%T3,%%T7,2                    ; packed right shifting >> 2
1350         vpsrld  %%T4,%%T7,7                    ; packed right shifting >> 7
1351 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
1352 vpxor %%T2, %%T2,%%T4
1353
1354 vpxor %%T2, %%T2, %%T1
1355 vpxor %%T7, %%T7, %%T2
1356 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
1357
1358
1359
1360 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
1361 vpshufb %%XMM2, [SHUF_MASK]
1362 vpshufb %%XMM3, [SHUF_MASK]
1363 vpshufb %%XMM4, [SHUF_MASK]
1364 vpshufb %%XMM5, [SHUF_MASK]
1365 vpshufb %%XMM6, [SHUF_MASK]
1366 vpshufb %%XMM7, [SHUF_MASK]
1367 vpshufb %%XMM8, [SHUF_MASK]
1368
1369
1370 vpxor %%XMM1, %%T6
1371
1372 %endmacro
1373
1374
1375 ; GHASH the last 8 ciphertext blocks.
1376 ; %%GDATA is GCM key data
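; Uses the same per-block Karatsuba scheme as above (HashKey_8 .. HashKey_1 plus the
; precomputed "_k" values), accumulates all eight products, then performs one final
; shift-based reduction; the result is left in %%T6.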
1377 %macro GHASH_LAST_8 16
1378 %define %%GDATA %1
1379 %define %%T1 %2
1380 %define %%T2 %3
1381 %define %%T3 %4
1382 %define %%T4 %5
1383 %define %%T5 %6
1384 %define %%T6 %7
1385 %define %%T7 %8
1386 %define %%XMM1 %9
1387 %define %%XMM2 %10
1388 %define %%XMM3 %11
1389 %define %%XMM4 %12
1390 %define %%XMM5 %13
1391 %define %%XMM6 %14
1392 %define %%XMM7 %15
1393 %define %%XMM8 %16
1394 ;; Karatsuba Method
1395
1396
1397 vpshufd %%T2, %%XMM1, 01001110b
1398 vpxor %%T2, %%XMM1
1399 vmovdqu %%T5, [%%GDATA + HashKey_8]
1400 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
1401 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
1402
1403 vmovdqu %%T3, [%%GDATA + HashKey_8_k]
1404 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
1405
1406
1407 ;;;;;;;;;;;;;;;;;;;;;;
1408
1409
1410 vpshufd %%T2, %%XMM2, 01001110b
1411 vpxor %%T2, %%XMM2
1412 vmovdqu %%T5, [%%GDATA + HashKey_7]
1413 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
1414 vpxor %%T6, %%T6, %%T4
1415
1416 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
1417 vpxor %%T7, %%T7, %%T4
1418
1419 vmovdqu %%T3, [%%GDATA + HashKey_7_k]
1420 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1421 vpxor %%XMM1, %%XMM1, %%T2
1422
1423 ;;;;;;;;;;;;;;;;;;;;;;
1424
1425
1426 vpshufd %%T2, %%XMM3, 01001110b
1427 vpxor %%T2, %%XMM3
1428 vmovdqu %%T5, [%%GDATA + HashKey_6]
1429 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
1430 vpxor %%T6, %%T6, %%T4
1431
1432 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
1433 vpxor %%T7, %%T7, %%T4
1434
1435 vmovdqu %%T3, [%%GDATA + HashKey_6_k]
1436 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1437 vpxor %%XMM1, %%XMM1, %%T2
1438
1439 ;;;;;;;;;;;;;;;;;;;;;;
1440
1441
1442 vpshufd %%T2, %%XMM4, 01001110b
1443 vpxor %%T2, %%XMM4
1444 vmovdqu %%T5, [%%GDATA + HashKey_5]
1445 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
1446 vpxor %%T6, %%T6, %%T4
1447
1448 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
1449 vpxor %%T7, %%T7, %%T4
1450
1451 vmovdqu %%T3, [%%GDATA + HashKey_5_k]
1452 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1453 vpxor %%XMM1, %%XMM1, %%T2
1454
1455 ;;;;;;;;;;;;;;;;;;;;;;
1456
1457 vpshufd %%T2, %%XMM5, 01001110b
1458 vpxor %%T2, %%XMM5
1459 vmovdqu %%T5, [%%GDATA + HashKey_4]
1460 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
1461 vpxor %%T6, %%T6, %%T4
1462
1463 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
1464 vpxor %%T7, %%T7, %%T4
1465
1466 vmovdqu %%T3, [%%GDATA + HashKey_4_k]
1467 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1468 vpxor %%XMM1, %%XMM1, %%T2
1469
1470 ;;;;;;;;;;;;;;;;;;;;;;
1471
1472 vpshufd %%T2, %%XMM6, 01001110b
1473 vpxor %%T2, %%XMM6
1474 vmovdqu %%T5, [%%GDATA + HashKey_3]
1475
1476 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
1477 vpxor %%T6, %%T6, %%T4
1478
1479 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
1480 vpxor %%T7, %%T7, %%T4
1481
1482 vmovdqu %%T3, [%%GDATA + HashKey_3_k]
1483 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1484 vpxor %%XMM1, %%XMM1, %%T2
1485
1486 ;;;;;;;;;;;;;;;;;;;;;;
1487
1488 vpshufd %%T2, %%XMM7, 01001110b
1489 vpxor %%T2, %%XMM7
1490 vmovdqu %%T5, [%%GDATA + HashKey_2]
1491 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
1492 vpxor %%T6, %%T6, %%T4
1493
1494 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
1495 vpxor %%T7, %%T7, %%T4
1496
1497 vmovdqu %%T3, [%%GDATA + HashKey_2_k]
1498 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1499 vpxor %%XMM1, %%XMM1, %%T2
1500
1501 ;;;;;;;;;;;;;;;;;;;;;;
1502
1503 vpshufd %%T2, %%XMM8, 01001110b
1504 vpxor %%T2, %%XMM8
1505 vmovdqu %%T5, [%%GDATA + HashKey]
1506 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
1507 vpxor %%T6, %%T6, %%T4
1508
1509 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
1510 vpxor %%T7, %%T7, %%T4
1511
1512 vmovdqu %%T3, [%%GDATA + HashKey_k]
1513 vpclmulqdq %%T2, %%T2, %%T3, 0x00
1514
1515 vpxor %%XMM1, %%XMM1, %%T2
1516 vpxor %%XMM1, %%XMM1, %%T6
1517 vpxor %%T2, %%XMM1, %%T7
1518
1519
1520
1521
1522 vpslldq %%T4, %%T2, 8
1523 vpsrldq %%T2, %%T2, 8
1524
1525 vpxor %%T7, %%T4
1526 vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
1527
1528 ;first phase of the reduction
1529
1530         vpslld  %%T2, %%T7, 31                 ; packed left shifting << 31
1531         vpslld  %%T3, %%T7, 30                 ; packed left shifting << 30
1532         vpslld  %%T4, %%T7, 25                 ; packed left shifting << 25
1533
1534 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
1535 vpxor %%T2, %%T2, %%T4
1536
1537 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
1538
1539 vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
1540 vpxor %%T7, %%T2 ; first phase of the reduction complete
1541 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1542
1543 ;second phase of the reduction
1544
1545         vpsrld  %%T2,%%T7,1                    ; packed right shifting >> 1
1546         vpsrld  %%T3,%%T7,2                    ; packed right shifting >> 2
1547         vpsrld  %%T4,%%T7,7                    ; packed right shifting >> 7
1548 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
1549 vpxor %%T2, %%T2,%%T4
1550
1551 vpxor %%T2, %%T2, %%T1
1552 vpxor %%T7, %%T7, %%T2
1553 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
1554
1555
1556 %endmacro
1557
1558
1559 ; Encryption of a single block
1560 ; %%GDATA is GCM key data
1561 %macro ENCRYPT_SINGLE_BLOCK 2
1562 %define %%GDATA %1
1563 %define %%XMM0 %2
1564
1565 vpxor %%XMM0, [%%GDATA+16*0]
1566 %assign i 1
1567 %rep NROUNDS
1568 vaesenc %%XMM0, [%%GDATA+16*i]
1569 %assign i (i+1)
1570 %endrep ; NROUNDS
1571 vaesenclast %%XMM0, [%%GDATA+16*i]
1572 %endmacro
1573
1574
1575 ;; Start of Stack Setup
1576
1577 %macro FUNC_SAVE 0
1578         ;; Required for Update/GCM_ENC
1579 ;the number of pushes must equal STACK_OFFSET
1580 push r12
1581 push r13
1582 push r14
1583 push r15
1584 mov r14, rsp
1585
1586 sub rsp, VARIABLE_OFFSET
1587 and rsp, ~63
1588
1589 %ifidn __OUTPUT_FORMAT__, win64
1590 ; xmm6:xmm15 need to be maintained for Windows
1591 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
1592 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
1593 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
1594 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
1595 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
1596 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
1597 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
1598 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
1599 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
1600 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
1601 %endif
1602 %endmacro
1603
1604
1605 %macro FUNC_RESTORE 0
1606
1607 %ifidn __OUTPUT_FORMAT__, win64
1608 vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
1609 vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
1610 vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
1611 vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
1612 vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
1613 vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
1614 vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
1615 vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
1616 vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
1617 vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
1618 %endif
1619
1620 ;; Required for Update/GCM_ENC
1621 mov rsp, r14
1622 pop r15
1623 pop r14
1624 pop r13
1625 pop r12
1626 %endmacro
1627
1628
1629 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1630 ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
1631 ; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
1632 ; IV, Additional Authentication data (A_IN), Additional
1633 ; Data length (A_LEN)
1634 ; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and its other fields initialized.
1635 ; Clobbers rax, r10-r13, and xmm0-xmm6
1636 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
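; Counter block construction used below (12-byte IV case, informal):
;       J0       = IV[0..11] || 0x00000001      (ONEf supplies the trailing 0x00000001)
;       OrigIV   = J0                           (saved for the final tag computation)
;       CurCount = byte-reflected J0            (so the counter can be bumped with VPADDD)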
1637 %macro GCM_INIT 5
1638 %define %%GDATA_KEY %1
1639 %define %%GDATA_CTX %2
1640 %define %%IV %3
1641 %define %%A_IN %4
1642 %define %%A_LEN %5
1643 %define %%AAD_HASH xmm0
1644
1645 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
1646 vpxor xmm2, xmm3
1647 mov r10, %%A_LEN
1648
1649 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
1650 mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
1651 xor r10, r10
1652 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
1653 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
1654 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
1655 mov r10, %%IV
1656 vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
1657 vpinsrq xmm2, [r10], 0
1658 vpinsrd xmm2, [r10+8], 2
1659 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
1660
1661 vpshufb xmm2, [rel SHUF_MASK]
1662
1663 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
1664 %endmacro
1665
1666
1667 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1668 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
1669 ; has been initialized by GCM_INIT
1670 ; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
1671 ; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
1672 ; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
1673 ; and whether encoding or decoding (ENC_DEC)
1674 ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
1675 ; Clobbers rax, r10-r15, and xmm0-xmm15
1676 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1677 %macro GCM_ENC_DEC 6
1678 %define %%GDATA_KEY %1
1679 %define %%GDATA_CTX %2
1680 %define %%CYPH_PLAIN_OUT %3
1681 %define %%PLAIN_CYPH_IN %4
1682 %define %%PLAIN_CYPH_LEN %5
1683 %define %%ENC_DEC %6
1684 %define %%DATA_OFFSET r11
1685
1686 ; Macro flow:
1687 ;       calculate the number of 16-byte blocks in the message
1688 ;       process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
1689 ;       process eight 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
1690 ;       if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
1691 cmp %%PLAIN_CYPH_LEN, 0
1692 je %%_multiple_of_16_bytes
1693
1694 xor %%DATA_OFFSET, %%DATA_OFFSET
1695 %ifidn __OUTPUT_FORMAT__, win64
1696 mov rax, %%PLAIN_CYPH_LEN
1697 add [%%GDATA_CTX + InLen], rax ; Update length of data processed
1698 %else
1699 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
1700 %endif
1701 vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
1702 vmovdqu xmm8, [%%GDATA_CTX + AadHash]
1703
1704
1705 PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
1706
1707
1708 mov r13, %%PLAIN_CYPH_LEN
1709 sub r13, %%DATA_OFFSET
1710 mov r10, r13 ; save the amount of data left to process in r10
1711 and r13, -16 ; r13 = r13 - (r13 mod 16)
1712
1713 mov r12, r13
1714 shr r12, 4
1715 and r12, 7
1716
1717 jz %%_initial_num_blocks_is_0
1718
1719 cmp r12, 7
1720 je %%_initial_num_blocks_is_7
1721 cmp r12, 6
1722 je %%_initial_num_blocks_is_6
1723 cmp r12, 5
1724 je %%_initial_num_blocks_is_5
1725 cmp r12, 4
1726 je %%_initial_num_blocks_is_4
1727 cmp r12, 3
1728 je %%_initial_num_blocks_is_3
1729 cmp r12, 2
1730 je %%_initial_num_blocks_is_2
1731
1732 jmp %%_initial_num_blocks_is_1
1733
1734 %%_initial_num_blocks_is_7:
1735 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1736 sub r13, 16*7
1737 jmp %%_initial_blocks_encrypted
1738
1739 %%_initial_num_blocks_is_6:
1740 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1741 sub r13, 16*6
1742 jmp %%_initial_blocks_encrypted
1743
1744 %%_initial_num_blocks_is_5:
1745 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1746 sub r13, 16*5
1747 jmp %%_initial_blocks_encrypted
1748
1749 %%_initial_num_blocks_is_4:
1750 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1751 sub r13, 16*4
1752 jmp %%_initial_blocks_encrypted
1753
1754
1755 %%_initial_num_blocks_is_3:
1756 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1757 sub r13, 16*3
1758 jmp %%_initial_blocks_encrypted
1759 %%_initial_num_blocks_is_2:
1760 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1761 sub r13, 16*2
1762 jmp %%_initial_blocks_encrypted
1763
1764 %%_initial_num_blocks_is_1:
1765 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1766 sub r13, 16
1767 jmp %%_initial_blocks_encrypted
1768
1769 %%_initial_num_blocks_is_0:
1770 INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
1771
1772
1773 %%_initial_blocks_encrypted:
1774 cmp r13, 0
1775 je %%_zero_cipher_left
1776
1777 sub r13, 128
1778 je %%_eight_cipher_left
1779
1780
1781
1782
1783 vmovd r15d, xmm9
1784 and r15d, 255
1785 vpshufb xmm9, [SHUF_MASK]
1786
1787
1788 %%_encrypt_by_8_new:
1789 cmp r15d, 255-8
1790 jg %%_encrypt_by_8
1791
1792
1793
1794 add r15b, 8
1795 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
1796 add %%DATA_OFFSET, 128
1797 sub r13, 128
1798 jne %%_encrypt_by_8_new
1799
1800 vpshufb xmm9, [SHUF_MASK]
1801 jmp %%_eight_cipher_left
1802
1803 %%_encrypt_by_8:
1804 vpshufb xmm9, [SHUF_MASK]
1805 add r15b, 8
1806 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
1807 vpshufb xmm9, [SHUF_MASK]
1808 add %%DATA_OFFSET, 128
1809 sub r13, 128
1810 jne %%_encrypt_by_8_new
1811
1812 vpshufb xmm9, [SHUF_MASK]
1813
1814
1815
1816
1817 %%_eight_cipher_left:
1818 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
1819
1820
1821 %%_zero_cipher_left:
1822 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad_hash = xmm14
1823 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
1824
1825 mov r13, r10
1826 and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
1827
1828 je %%_multiple_of_16_bytes
1829
1830 mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_block_length = r13
1831 ; handle the last <16 Byte block separately
1832
1833 vpaddd xmm9, [ONE] ; INCR CNT to get Yn
1834 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
1835 vpshufb xmm9, [SHUF_MASK]
1836 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
1837 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
1838
1839 cmp %%PLAIN_CYPH_LEN, 16
1840 jge %%_large_enough_update
1841
1842 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1843 READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
1844 lea r12, [SHIFT_MASK + 16]
1845 sub r12, r13
1846 jmp %%_data_read
1847
1848 %%_large_enough_update:
1849 sub %%DATA_OFFSET, 16
1850 add %%DATA_OFFSET, r13
1851
1852 vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
1853
1854 sub %%DATA_OFFSET, r13
1855 add %%DATA_OFFSET, 16
1856
1857
1858 lea r12, [SHIFT_MASK + 16]
1859 sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
1860
1861 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
1862 vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
1863 %%_data_read:
1864 %ifidn %%ENC_DEC, DEC
1865 vmovdqa xmm2, xmm1
1866 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1867 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1868 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1869 vpand xmm2, xmm1
1870 vpshufb xmm2, [SHUF_MASK]
1871 vpxor xmm14, xmm2
1872 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1873
1874 %else
1875 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
1876 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
1877 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
1878 vpshufb xmm9, [SHUF_MASK]
1879 vpxor xmm14, xmm9
1880 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1881
1882 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
1883 %endif
1884
1885 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1886 ; output r13 Bytes
1887 vmovq rax, xmm9
1888 cmp r13, 8
1889 jle %%_less_than_8_bytes_left
1890
1891 mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
1892 add %%DATA_OFFSET, 8
1893 vpsrldq xmm9, xmm9, 8
1894 vmovq rax, xmm9
1895 sub r13, 8
1896
1897 %%_less_than_8_bytes_left:
1898 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
1899 add %%DATA_OFFSET, 1
1900 shr rax, 8
1901 sub r13, 1
1902 jne %%_less_than_8_bytes_left
1903 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1904
1905 %%_multiple_of_16_bytes:
1906
1907
1908
1909 %endmacro
1910
1911
1912 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1913 ; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_ENC_DEC finishes, and computes the authentication tag.
1914 ; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
1915 ; whether encoding or decoding (ENC_DEC).
1916 ; Output: Authentication Tag (AUTH_TAG) of Authentication Tag length (AUTH_TAG_LEN) bytes
1917 ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
1918 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
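;
; In standard GCM terms the macro computes (hedged summary of the code below,
; using textbook notation rather than identifiers from this file):
;
;       S = GHASH state from GDATA_CTX.AadHash (plus the pending partial block, if any)
;       S = GHASH_MUL(S xor (len(A) || len(C)))     ; fold in the 64-bit bit lengths
;       T = E(K, Y0) xor byteswap(S)                ; Y0 is the saved OrigIV counter block
;
; and stores the first AUTH_TAG_LEN (8, 12 or 16) bytes of T to AUTH_TAG.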
1919 %macro GCM_COMPLETE 5
1920 %define %%GDATA_KEY %1
1921 %define %%GDATA_CTX %2
1922 %define %%AUTH_TAG %3
1923 %define %%AUTH_TAG_LEN %4
1924 %define %%ENC_DEC %5
1925 %define %%PLAIN_CYPH_LEN rax
1926
1927 mov r12, [%%GDATA_CTX + PBlockLen]
1928 vmovdqu xmm14, [%%GDATA_CTX + AadHash]
1929 vmovdqu xmm13, [%%GDATA_KEY + HashKey]
1930
1931 cmp r12, 0
1932
1933 je %%_partial_done
1934
1935 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
1936 vmovdqu [%%GDATA_CTX + AadHash], xmm14
1937
1938 %%_partial_done:
1939
1940 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
1941 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
1942
1943 shl r12, 3 ; convert into number of bits
1944 vmovd xmm15, r12d ; len(A) in xmm15
1945
1946 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
1947 vmovq xmm1, %%PLAIN_CYPH_LEN
1948 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
1949 vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
1950
1951 vpxor xmm14, xmm15
1952 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
1953 vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
1954
1955 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
1956
1957 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
1958
1959 vpxor xmm9, xmm14
1960
1961
1962 %%_return_T:
1963 mov r10, %%AUTH_TAG ; r10 = authTag
1964 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
1965
1966 cmp r11, 16
1967 je %%_T_16
1968
1969 cmp r11, 12
1970 je %%_T_12
1971
1972 cmp r11, 8
1973 je %%_T_8
1974
1975 simd_store_avx r10, xmm9, r11, r12, rax
1976 jmp %%_return_T_done
1977 %%_T_8:
1978 vmovq rax, xmm9
1979 mov [r10], rax
1980 jmp %%_return_T_done
1981 %%_T_12:
1982 vmovq rax, xmm9
1983 mov [r10], rax
1984 vpsrldq xmm9, xmm9, 8
1985 vmovd eax, xmm9
1986 mov [r10 + 8], eax
1987 jmp %%_return_T_done
1988 %%_T_16:
1989 vmovdqu [r10], xmm9
1990
1991 %%_return_T_done:
1992 %endmacro ; GCM_COMPLETE
1993
1994
1995 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1996 ;void aes_gcm_precomp_128_avx_gen2
1997 ; (struct gcm_key_data *key_data);
1998 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1999 MKGLOBAL(FN_NAME(precomp,_),function,)
2000 FN_NAME(precomp,_):
2001
2002 push r12
2003 push r13
2004 push r14
2005 push r15
2006
2007 mov r14, rsp
2008
2009
2010
2011 sub rsp, VARIABLE_OFFSET
2012 and rsp, ~63 ; align rsp to 64 bytes
2013
2014 %ifidn __OUTPUT_FORMAT__, win64
2015 ; only xmm6 needs to be maintained
2016 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2017 %endif
2018
2019 vpxor xmm6, xmm6
2020 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
2021
2022 vpshufb xmm6, [SHUF_MASK]
2023 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
2024 vmovdqa xmm2, xmm6
2025 vpsllq xmm6, 1
2026 vpsrlq xmm2, 63
2027 vmovdqa xmm1, xmm2
2028 vpslldq xmm2, xmm2, 8
2029 vpsrldq xmm1, xmm1, 8
2030 vpor xmm6, xmm2
2031 ;reduction
2032 vpshufd xmm2, xmm1, 00100100b
2033 vpcmpeqd xmm2, [TWOONE]
2034 vpand xmm2, [POLY]
2035 vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
2036 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2037 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
2038
2039
2040 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
2041
2042 %ifidn __OUTPUT_FORMAT__, win64
2043 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2044 %endif
2045 mov rsp, r14
2046
2047 pop r15
2048 pop r14
2049 pop r13
2050 pop r12
2051 ret
2052
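;
; The HashKey<<1 mod poly step above is the carry-less doubling used by the
; shift-based reduction. A minimal C sketch of the idea, assuming a hypothetical
; 128-bit integer type and the byte-reflected representation (not an API of this
; library):
;
;       /* H = byteswap(AES-ECB(key, 0^128)) -- computed above into xmm6        */
;       uint128_t doubled = H << 1;           /* multiply by x                   */
;       if (H >> 127)                         /* a bit was shifted out ...       */
;               doubled ^= POLY;              /* ... so reduce by the polynomial */
;       key_data->hash_key = doubled;         /* PRECOMPUTE then derives the     */
;                                             /* remaining hash-key powers       */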
2053 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2054 ;void aes_gcm_init_128_avx_gen2(
2055 ; const struct gcm_key_data *key_data,
2056 ; struct gcm_context_data *context_data,
2057 ; u8 *iv,
2058 ; const u8 *aad,
2059 ; u64 aad_len);
2060 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2061 MKGLOBAL(FN_NAME(init,_),function,)
2062 FN_NAME(init,_):
2063 push r12
2064 push r13
2065 %ifidn __OUTPUT_FORMAT__, win64
2066 push r14
2067 push r15
2068 mov r14, rsp
2069 ; xmm6:xmm15 need to be maintained for Windows
2070 sub rsp, 1*16
2071 movdqu [rsp + 0*16], xmm6
2072 %endif
2073
2074 GCM_INIT arg1, arg2, arg3, arg4, arg5
2075
2076 %ifidn __OUTPUT_FORMAT__, win64
2077 movdqu xmm6 , [rsp + 0*16]
2078 mov rsp, r14
2079 pop r15
2080 pop r14
2081 %endif
2082 pop r13
2083 pop r12
2084 ret
2085
2086
2087 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2088 ;void aes_gcm_enc_128_update_avx_gen2(
2089 ; const struct gcm_key_data *key_data,
2090 ; struct gcm_context_data *context_data,
2091 ; u8 *out,
2092 ; const u8 *in,
2093 ; u64 plaintext_len);
2094 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2095 MKGLOBAL(FN_NAME(enc,_update_),function,)
2096 FN_NAME(enc,_update_):
2097
2098 FUNC_SAVE
2099
2100 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
2101
2102 FUNC_RESTORE
2103
2104 ret
2105
2106
2107 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2108 ;void aes_gcm_dec_128_update_avx_gen2(
2109 ; const struct gcm_key_data *key_data,
2110 ; struct gcm_context_data *context_data,
2111 ; u8 *out,
2112 ; const u8 *in,
2113 ; u64 plaintext_len);
2114 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2115 MKGLOBAL(FN_NAME(dec,_update_),function,)
2116 FN_NAME(dec,_update_):
2117
2118 FUNC_SAVE
2119
2120 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
2121
2122 FUNC_RESTORE
2123
2124 ret
2125
2126
2127 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2128 ;void aes_gcm_enc_128_finalize_avx_gen2(
2129 ; const struct gcm_key_data *key_data,
2130 ; struct gcm_context_data *context_data,
2131 ; u8 *auth_tag,
2132 ; u64 auth_tag_len);
2133 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2134 MKGLOBAL(FN_NAME(enc,_finalize_),function,)
2135 FN_NAME(enc,_finalize_):
2136
2137 push r12
2138
2139 %ifidn __OUTPUT_FORMAT__, win64
2140 ; xmm6:xmm15 need to be maintained for Windows
2141 sub rsp, 5*16
2142 vmovdqu [rsp + 0*16],xmm6
2143 vmovdqu [rsp + 1*16],xmm9
2144 vmovdqu [rsp + 2*16],xmm11
2145 vmovdqu [rsp + 3*16],xmm14
2146 vmovdqu [rsp + 4*16],xmm15
2147 %endif
2148 GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
2149
2150 %ifidn __OUTPUT_FORMAT__, win64
2151 vmovdqu xmm15 , [rsp + 4*16]
2152 vmovdqu xmm14 , [rsp + 3*16]
2153 vmovdqu xmm11 , [rsp + 2*16]
2154 vmovdqu xmm9 , [rsp + 1*16]
2155 vmovdqu xmm6 , [rsp + 0*16]
2156 add rsp, 5*16
2157 %endif
2158
2159 pop r12
2160 ret
2161
2162
2163 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2164 ;void aes_gcm_dec_128_finalize_avx_gen2(
2165 ; const struct gcm_key_data *key_data,
2166 ; struct gcm_context_data *context_data,
2167 ; u8 *auth_tag,
2168 ; u64 auth_tag_len);
2169 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2170 MKGLOBAL(FN_NAME(dec,_finalize_),function,)
2171 FN_NAME(dec,_finalize_):
2172
2173 push r12
2174
2175 %ifidn __OUTPUT_FORMAT__, win64
2176 ; xmm6:xmm15 need to be maintained for Windows
2177 sub rsp, 5*16
2178 vmovdqu [rsp + 0*16],xmm6
2179 vmovdqu [rsp + 1*16],xmm9
2180 vmovdqu [rsp + 2*16],xmm11
2181 vmovdqu [rsp + 3*16],xmm14
2182 vmovdqu [rsp + 4*16],xmm15
2183 %endif
2184 GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
2185
2186 %ifidn __OUTPUT_FORMAT__, win64
2187 vmovdqu xmm15 , [rsp + 4*16]
2188 vmovdqu xmm14 , [rsp + 3*16]
2189 vmovdqu xmm11 , [rsp + 2*16]
2190 vmovdqu xmm9 , [rsp + 1*16]
2191 vmovdqu xmm6 , [rsp + 0*16]
2192 add rsp, 5*16
2193 %endif
2194
2195 pop r12
2196 ret
2197
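;
; For reference, the init/update/finalize entry points above form a streaming
; sequence; an illustrative C sketch (placeholder buffer names, no error
; handling, and it assumes the AES round keys have already been expanded into
; key_data by a key-expansion routine outside this file):
;
;       struct gcm_key_data key;
;       struct gcm_context_data ctx;
;       aes_gcm_precomp_128_avx_gen2(&key);                      /* GHASH key tables */
;       aes_gcm_init_128_avx_gen2(&key, &ctx, iv, aad, aad_len);
;       aes_gcm_enc_128_update_avx_gen2(&key, &ctx, out, in, len1);
;       aes_gcm_enc_128_update_avx_gen2(&key, &ctx, out + len1, in + len1, len2);
;       aes_gcm_enc_128_finalize_avx_gen2(&key, &ctx, tag, tag_len);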
2198
2199 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2200 ;void aes_gcm_enc_128_avx_gen2(
2201 ; const struct gcm_key_data *key_data,
2202 ; struct gcm_context_data *context_data,
2203 ; u8 *out,
2204 ; const u8 *in,
2205 ; u64 plaintext_len,
2206 ; u8 *iv,
2207 ; const u8 *aad,
2208 ; u64 aad_len,
2209 ; u8 *auth_tag,
2210 ; u64 auth_tag_len);
2211 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2212 MKGLOBAL(FN_NAME(enc,_),function,)
2213 FN_NAME(enc,_):
2214
2215 FUNC_SAVE
2216
2217 GCM_INIT arg1, arg2, arg6, arg7, arg8
2218
2219 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
2220
2221 GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
2222
2223 FUNC_RESTORE
2224
2225 ret
2226
2227 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2228 ;void aes_gcm_dec_128_avx_gen2(
2229 ; const struct gcm_key_data *key_data,
2230 ; struct gcm_context_data *context_data,
2231 ; u8 *out,
2232 ; const u8 *in,
2233 ; u64 plaintext_len,
2234 ; u8 *iv,
2235 ; const u8 *aad,
2236 ; u64 aad_len,
2237 ; u8 *auth_tag,
2238 ; u64 auth_tag_len);
2239 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2240 MKGLOBAL(FN_NAME(dec,_),function,)
2241 FN_NAME(dec,_):
2242
2243 FUNC_SAVE
2244
2245 GCM_INIT arg1, arg2, arg6, arg7, arg8
2246
2247 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
2248
2249 GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
2250
2251 FUNC_RESTORE
2252
2253 ret
2254
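;
; The single-pass entry points above simply chain GCM_INIT, GCM_ENC_DEC and
; GCM_COMPLETE; an illustrative C sketch (placeholder names, no error handling):
;
;       aes_gcm_enc_128_avx_gen2(&key, &ctx, out, in, len, iv, aad, aad_len, tag, tag_len);
;       aes_gcm_dec_128_avx_gen2(&key, &ctx, out, in, len, iv, aad, aad_len, tag, tag_len);
;       /* for decryption the caller is expected to compare the produced tag    */
;       /* against the tag received with the message                            */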
2255 %ifdef LINUX
2256 section .note.GNU-stack noalloc noexec nowrite progbits
2257 %endif