]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2018, Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | ; | |
31 | ; Authors: | |
32 | ; Erdinc Ozturk | |
33 | ; Vinodh Gopal | |
34 | ; James Guilford | |
35 | ; | |
36 | ; | |
37 | ; References: | |
38 | ; This code was derived and highly optimized from the code described in paper: | |
39 | ; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 | |
40 | ; The details of the implementation is explained in: | |
41 | ; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. | |
42 | ; | |
43 | ; | |
44 | ; | |
45 | ; | |
46 | ; Assumptions: | |
47 | ; | |
48 | ; | |
49 | ; | |
50 | ; iv: | |
51 | ; 0 1 2 3 | |
52 | ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
53 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
54 | ; | Salt (From the SA) | | |
55 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
56 | ; | Initialization Vector | | |
57 | ; | (This is the sequence number from IPSec header) | | |
58 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
59 | ; | 0x1 | | |
60 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
61 | ; | |
62 | ; | |
63 | ; | |
64 | ; AAD: | |
65 | ; AAD will be padded with 0 to the next 16byte multiple | |
66 | ; for example, assume AAD is a u32 vector | |
67 | ; | |
68 | ; if AAD is 8 bytes: | |
69 | ; AAD[3] = {A0, A1}; | |
70 | ; padded AAD in xmm register = {A1 A0 0 0} | |
71 | ; | |
72 | ; 0 1 2 3 | |
73 | ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
74 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
75 | ; | SPI (A1) | | |
76 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
77 | ; | 32-bit Sequence Number (A0) | | |
78 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
79 | ; | 0x0 | | |
80 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
81 | ; | |
82 | ; AAD Format with 32-bit Sequence Number | |
83 | ; | |
84 | ; if AAD is 12 bytes: | |
85 | ; AAD[3] = {A0, A1, A2}; | |
86 | ; padded AAD in xmm register = {A2 A1 A0 0} | |
87 | ; | |
88 | ; 0 1 2 3 | |
89 | ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
90 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
91 | ; | SPI (A2) | | |
92 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
93 | ; | 64-bit Extended Sequence Number {A1,A0} | | |
94 | ; | | | |
95 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
96 | ; | 0x0 | | |
97 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
98 | ; | |
99 | ; AAD Format with 64-bit Extended Sequence Number | |
100 | ; | |
101 | ; | |
102 | ; aadLen: | |
103 | ; Must be a multiple of 4 bytes and from the definition of the spec. | |
104 | ; The code additionally supports any aadLen length. | |
105 | ; | |
106 | ; TLen: | |
107 | ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | |
108 | ; | |
109 | ; poly = x^128 + x^127 + x^126 + x^121 + 1 | |
110 | ; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. | |
111 | ; | |
112 | ||
113 | %include "os.asm" | |
114 | %include "reg_sizes.asm" | |
115 | %include "gcm_defines.asm" | |
9f95a23c | 116 | %include "memcpy.asm" |
11fdf7f2 TL |
117 | |
118 | %ifndef GCM128_MODE | |
119 | %ifndef GCM192_MODE | |
120 | %ifndef GCM256_MODE | |
121 | %error "No GCM mode selected for gcm_avx_gen4.asm!" | |
122 | %endif | |
123 | %endif | |
124 | %endif | |
125 | ||
126 | ;; Decide on AES-GCM key size to compile for | |
127 | %ifdef GCM128_MODE | |
128 | %define NROUNDS 9 | |
129 | %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 | |
130 | %endif | |
131 | ||
132 | %ifdef GCM192_MODE | |
133 | %define NROUNDS 11 | |
134 | %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 | |
135 | %endif | |
136 | ||
137 | %ifdef GCM256_MODE | |
138 | %define NROUNDS 13 | |
139 | %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 | |
140 | %endif | |
141 | ||
142 | section .text | |
143 | default rel | |
144 | ||
145 | ; need to push 4 registers into stack to maintain | |
146 | %define STACK_OFFSET 8*4 | |
147 | ||
148 | %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) | |
149 | %define TMP3 16*1 ; Temporary storage for AES State 3 | |
150 | %define TMP4 16*2 ; Temporary storage for AES State 4 | |
151 | %define TMP5 16*3 ; Temporary storage for AES State 5 | |
152 | %define TMP6 16*4 ; Temporary storage for AES State 6 | |
153 | %define TMP7 16*5 ; Temporary storage for AES State 7 | |
154 | %define TMP8 16*6 ; Temporary storage for AES State 8 | |
155 | ||
156 | %define LOCAL_STORAGE 16*7 | |
157 | ||
158 | %ifidn __OUTPUT_FORMAT__, win64 | |
159 | %define XMM_STORAGE 16*10 | |
160 | %else | |
161 | %define XMM_STORAGE 0 | |
162 | %endif | |
163 | ||
164 | %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE | |
165 | ||
166 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
167 | ; Utility Macros | |
168 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
169 | ||
170 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
171 | ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | |
172 | ; Input: A and B (128-bits each, bit-reflected) | |
173 | ; Output: C = A*B*x mod poly, (i.e. >>1 ) | |
174 | ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | |
175 | ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | |
176 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
177 | %macro GHASH_MUL 7 | |
178 | %define %%GH %1 ; 16 Bytes | |
179 | %define %%HK %2 ; 16 Bytes | |
180 | %define %%T1 %3 | |
181 | %define %%T2 %4 | |
182 | %define %%T3 %5 | |
183 | %define %%T4 %6 | |
184 | %define %%T5 %7 | |
185 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
186 | ||
187 | vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 | |
188 | vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 | |
189 | vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 | |
190 | vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 | |
191 | vpxor %%GH, %%GH, %%T3 | |
192 | ||
193 | ||
194 | vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs | |
195 | vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs | |
196 | ||
197 | vpxor %%T1, %%T1, %%T3 | |
198 | vpxor %%GH, %%GH, %%T2 | |
199 | ||
200 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
201 | ;first phase of the reduction | |
9f95a23c | 202 | vmovdqa %%T3, [rel POLY2] |
11fdf7f2 TL |
203 | |
204 | vpclmulqdq %%T2, %%T3, %%GH, 0x01 | |
205 | vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs | |
206 | ||
207 | vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete | |
208 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
209 | ;second phase of the reduction | |
210 | vpclmulqdq %%T2, %%T3, %%GH, 0x00 | |
211 | vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
212 | ||
213 | vpclmulqdq %%GH, %%T3, %%GH, 0x10 | |
214 | vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
215 | ||
216 | vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete | |
217 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
218 | vpxor %%GH, %%GH, %%T1 ; the result is in %%GH | |
219 | ||
220 | %endmacro | |
221 | ||
222 | ||
223 | ; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4 | |
224 | ; functions, but are kept to allow users to switch cpu architectures between calls | |
225 | ; of pre, init, update, and finalize. | |
226 | %macro PRECOMPUTE 8 | |
227 | %define %%GDATA %1 | |
228 | %define %%HK %2 | |
229 | %define %%T1 %3 | |
230 | %define %%T2 %4 | |
231 | %define %%T3 %5 | |
232 | %define %%T4 %6 | |
233 | %define %%T5 %7 | |
234 | %define %%T6 %8 | |
235 | ||
236 | ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | |
237 | vmovdqa %%T5, %%HK | |
238 | ||
239 | vpshufd %%T1, %%T5, 01001110b | |
240 | vpxor %%T1, %%T5 | |
241 | vmovdqu [%%GDATA + HashKey_k], %%T1 | |
242 | ||
243 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly | |
244 | vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly | |
245 | vpshufd %%T1, %%T5, 01001110b | |
246 | vpxor %%T1, %%T5 | |
247 | vmovdqu [%%GDATA + HashKey_2_k], %%T1 | |
248 | ||
249 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly | |
250 | vmovdqu [%%GDATA + HashKey_3], %%T5 | |
251 | vpshufd %%T1, %%T5, 01001110b | |
252 | vpxor %%T1, %%T5 | |
253 | vmovdqu [%%GDATA + HashKey_3_k], %%T1 | |
254 | ||
255 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly | |
256 | vmovdqu [%%GDATA + HashKey_4], %%T5 | |
257 | vpshufd %%T1, %%T5, 01001110b | |
258 | vpxor %%T1, %%T5 | |
259 | vmovdqu [%%GDATA + HashKey_4_k], %%T1 | |
260 | ||
261 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly | |
262 | vmovdqu [%%GDATA + HashKey_5], %%T5 | |
263 | vpshufd %%T1, %%T5, 01001110b | |
264 | vpxor %%T1, %%T5 | |
265 | vmovdqu [%%GDATA + HashKey_5_k], %%T1 | |
266 | ||
267 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly | |
268 | vmovdqu [%%GDATA + HashKey_6], %%T5 | |
269 | vpshufd %%T1, %%T5, 01001110b | |
270 | vpxor %%T1, %%T5 | |
271 | vmovdqu [%%GDATA + HashKey_6_k], %%T1 | |
272 | ||
273 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly | |
274 | vmovdqu [%%GDATA + HashKey_7], %%T5 | |
275 | vpshufd %%T1, %%T5, 01001110b | |
276 | vpxor %%T1, %%T5 | |
277 | vmovdqu [%%GDATA + HashKey_7_k], %%T1 | |
278 | ||
279 | GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly | |
280 | vmovdqu [%%GDATA + HashKey_8], %%T5 | |
281 | vpshufd %%T1, %%T5, 01001110b | |
282 | vpxor %%T1, %%T5 | |
283 | vmovdqu [%%GDATA + HashKey_8_k], %%T1 | |
284 | %endmacro | |
285 | ||
286 | ||
287 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
288 | ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. | |
289 | ; Returns 0 if data has length 0. | |
290 | ; Input: The input data (INPUT), that data's length (LENGTH). | |
291 | ; Output: The packed xmm register (OUTPUT). | |
292 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
293 | %macro READ_SMALL_DATA_INPUT 6 | |
294 | %define %%OUTPUT %1 ; %%OUTPUT is an xmm register | |
295 | %define %%INPUT %2 | |
296 | %define %%LENGTH %3 | |
297 | %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers | |
298 | %define %%COUNTER %5 | |
299 | %define %%TMP1 %6 | |
300 | ||
301 | vpxor %%OUTPUT, %%OUTPUT | |
302 | mov %%COUNTER, %%LENGTH | |
303 | mov %%END_READ_LOCATION, %%INPUT | |
304 | add %%END_READ_LOCATION, %%LENGTH | |
305 | xor %%TMP1, %%TMP1 | |
306 | ||
307 | ||
308 | cmp %%COUNTER, 8 | |
309 | jl %%_byte_loop_2 | |
310 | vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists | |
311 | je %%_done | |
312 | ||
313 | sub %%COUNTER, 8 | |
314 | ||
315 | %%_byte_loop_1: ;Read in data 1 byte at a time while data is left | |
316 | shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in | |
317 | dec %%END_READ_LOCATION | |
318 | mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] | |
319 | dec %%COUNTER | |
320 | jg %%_byte_loop_1 | |
321 | vpinsrq %%OUTPUT, %%TMP1, 1 | |
322 | jmp %%_done | |
323 | ||
324 | %%_byte_loop_2: ;Read in data 1 byte at a time while data is left | |
325 | ;; NOTE: in current implementation check for zero length is obsolete here. | |
326 | ;; The adequate checks are done by callers of this macro. | |
327 | ;; cmp %%COUNTER, 0 | |
328 | ;; je %%_done | |
329 | shl %%TMP1, 8 ;This loop handles when no bytes were already read in | |
330 | dec %%END_READ_LOCATION | |
331 | mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] | |
332 | dec %%COUNTER | |
333 | jg %%_byte_loop_2 | |
334 | vpinsrq %%OUTPUT, %%TMP1, 0 | |
335 | %%_done: | |
336 | ||
337 | %endmacro ; READ_SMALL_DATA_INPUT | |
338 | ||
339 | ||
340 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
341 | ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. | |
342 | ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). | |
343 | ; Output: The hash of the data (AAD_HASH). | |
344 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
9f95a23c | 345 | %macro CALC_AAD_HASH 15 |
11fdf7f2 TL |
346 | %define %%A_IN %1 |
347 | %define %%A_LEN %2 | |
348 | %define %%AAD_HASH %3 | |
9f95a23c TL |
349 | %define %%GDATA_KEY %4 |
350 | %define %%XTMP0 %5 ; xmm temp reg 5 | |
351 | %define %%XTMP1 %6 ; xmm temp reg 5 | |
352 | %define %%XTMP2 %7 | |
353 | %define %%XTMP3 %8 | |
354 | %define %%XTMP4 %9 | |
355 | %define %%XTMP5 %10 ; xmm temp reg 5 | |
356 | %define %%T1 %11 ; temp reg 1 | |
357 | %define %%T2 %12 | |
358 | %define %%T3 %13 | |
359 | %define %%T4 %14 | |
360 | %define %%T5 %15 ; temp reg 5 | |
11fdf7f2 TL |
361 | |
362 | ||
363 | mov %%T1, %%A_IN ; T1 = AAD | |
364 | mov %%T2, %%A_LEN ; T2 = aadLen | |
365 | vpxor %%AAD_HASH, %%AAD_HASH | |
366 | ||
9f95a23c TL |
367 | %%_get_AAD_loop128: |
368 | cmp %%T2, 128 | |
369 | jl %%_exit_AAD_loop128 | |
370 | ||
371 | vmovdqu %%XTMP0, [%%T1 + 16*0] | |
372 | vpshufb %%XTMP0, [rel SHUF_MASK] | |
373 | ||
374 | vpxor %%XTMP0, %%AAD_HASH | |
375 | ||
376 | vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8] | |
377 | vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1 | |
378 | vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0 | |
379 | vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0 | |
380 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1 | |
381 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 | |
382 | ||
383 | %assign i 1 | |
384 | %assign j 7 | |
385 | %rep 7 | |
386 | vmovdqu %%XTMP0, [%%T1 + 16*i] | |
387 | vpshufb %%XTMP0, [rel SHUF_MASK] | |
388 | ||
389 | vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j] | |
390 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 | |
391 | vpxor %%XTMP1, %%XTMP1, %%XTMP4 | |
392 | ||
393 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 | |
394 | vpxor %%XTMP2, %%XTMP2, %%XTMP4 | |
395 | ||
396 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 | |
397 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 | |
398 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 | |
399 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 | |
400 | %assign i (i + 1) | |
401 | %assign j (j - 1) | |
402 | %endrep | |
403 | ||
404 | vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs | |
405 | vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs | |
406 | vpxor %%XTMP2, %%XTMP2, %%XTMP4 | |
407 | vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) | |
408 | ||
409 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
410 | ;first phase of the reduction | |
411 | vmovdqa %%XTMP5, [rel POLY2] | |
412 | vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01 | |
413 | vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs | |
414 | vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete | |
415 | ||
416 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
417 | ;second phase of the reduction | |
418 | vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00 | |
419 | vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
420 | ||
421 | vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10 | |
422 | vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
423 | ||
424 | vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete | |
425 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
426 | vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1 | |
427 | ||
428 | sub %%T2, 128 | |
429 | je %%_CALC_AAD_done | |
430 | ||
431 | add %%T1, 128 | |
432 | jmp %%_get_AAD_loop128 | |
433 | ||
434 | %%_exit_AAD_loop128: | |
11fdf7f2 TL |
435 | cmp %%T2, 16 |
436 | jl %%_get_small_AAD_block | |
437 | ||
9f95a23c TL |
438 | ;; calculate hash_key position to start with |
439 | mov %%T3, %%T2 | |
440 | and %%T3, -16 ; 1 to 7 blocks possible here | |
441 | neg %%T3 | |
442 | add %%T3, HashKey_1 + 16 | |
443 | lea %%T3, [%%GDATA_KEY + %%T3] | |
11fdf7f2 | 444 | |
9f95a23c TL |
445 | vmovdqu %%XTMP0, [%%T1] |
446 | vpshufb %%XTMP0, [rel SHUF_MASK] | |
447 | ||
448 | vpxor %%XTMP0, %%AAD_HASH | |
449 | ||
450 | vmovdqu %%XTMP5, [%%T3] | |
451 | vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1 | |
452 | vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0 | |
453 | vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0 | |
454 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1 | |
455 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 | |
11fdf7f2 | 456 | |
9f95a23c TL |
457 | add %%T3, 16 ; move to next hashkey |
458 | add %%T1, 16 ; move to next data block | |
11fdf7f2 | 459 | sub %%T2, 16 |
9f95a23c TL |
460 | cmp %%T2, 16 |
461 | jl %%_AAD_reduce | |
462 | ||
463 | %%_AAD_blocks: | |
464 | vmovdqu %%XTMP0, [%%T1] | |
465 | vpshufb %%XTMP0, [rel SHUF_MASK] | |
466 | ||
467 | vmovdqu %%XTMP5, [%%T3] | |
468 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 | |
469 | vpxor %%XTMP1, %%XTMP1, %%XTMP4 | |
11fdf7f2 | 470 | |
9f95a23c TL |
471 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 |
472 | vpxor %%XTMP2, %%XTMP2, %%XTMP4 | |
473 | ||
474 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 | |
475 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 | |
476 | vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 | |
477 | vpxor %%XTMP3, %%XTMP3, %%XTMP4 | |
478 | ||
479 | add %%T3, 16 ; move to next hashkey | |
11fdf7f2 | 480 | add %%T1, 16 |
9f95a23c | 481 | sub %%T2, 16 |
11fdf7f2 | 482 | cmp %%T2, 16 |
9f95a23c TL |
483 | jl %%_AAD_reduce |
484 | jmp %%_AAD_blocks | |
485 | ||
486 | %%_AAD_reduce: | |
487 | vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs | |
488 | vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs | |
489 | vpxor %%XTMP2, %%XTMP2, %%XTMP4 | |
490 | vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) | |
491 | ||
492 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
493 | ;first phase of the reduction | |
494 | vmovdqa %%XTMP5, [rel POLY2] | |
495 | vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01 | |
496 | vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs | |
497 | vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete | |
498 | ||
499 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
500 | ;second phase of the reduction | |
501 | vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00 | |
502 | vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
503 | ||
504 | vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10 | |
505 | vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
506 | ||
507 | vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete | |
508 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
509 | vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1 | |
510 | ||
511 | or %%T2, %%T2 | |
512 | je %%_CALC_AAD_done | |
11fdf7f2 TL |
513 | |
514 | %%_get_small_AAD_block: | |
9f95a23c | 515 | vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey] |
11fdf7f2 TL |
516 | READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 |
517 | ;byte-reflect the AAD data | |
9f95a23c TL |
518 | vpshufb %%XTMP1, [rel SHUF_MASK] |
519 | vpxor %%AAD_HASH, %%XTMP1 | |
520 | GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 | |
11fdf7f2 TL |
521 | |
522 | %%_CALC_AAD_done: | |
523 | ||
524 | %endmacro ; CALC_AAD_HASH | |
525 | ||
526 | ||
527 | ||
528 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
529 | ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. | |
530 | ; Requires the input data be at least 1 byte long. | |
531 | ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), | |
532 | ; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), | |
533 | ; and whether encoding or decoding (ENC_DEC) | |
534 | ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX | |
535 | ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 | |
536 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
537 | %macro PARTIAL_BLOCK 8 | |
538 | %define %%GDATA_KEY %1 | |
539 | %define %%GDATA_CTX %2 | |
540 | %define %%CYPH_PLAIN_OUT %3 | |
541 | %define %%PLAIN_CYPH_IN %4 | |
542 | %define %%PLAIN_CYPH_LEN %5 | |
543 | %define %%DATA_OFFSET %6 | |
544 | %define %%AAD_HASH %7 | |
545 | %define %%ENC_DEC %8 | |
546 | ||
547 | mov r13, [%%GDATA_CTX + PBlockLen] | |
548 | cmp r13, 0 | |
549 | je %%_partial_block_done ;Leave Macro if no partial blocks | |
550 | ||
551 | cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading | |
552 | jl %%_fewer_than_16_bytes | |
553 | VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register | |
554 | jmp %%_data_read | |
555 | ||
556 | %%_fewer_than_16_bytes: | |
557 | lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] | |
558 | READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 | |
559 | ||
560 | %%_data_read: ;Finished reading in data | |
561 | ||
562 | ||
563 | vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key | |
564 | vmovdqu xmm13, [%%GDATA_KEY + HashKey] | |
565 | ||
9f95a23c | 566 | lea r12, [rel SHIFT_MASK] |
11fdf7f2 TL |
567 | |
568 | add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) | |
569 | vmovdqu xmm2, [r12] ; get the appropriate shuffle mask | |
570 | vpshufb xmm9, xmm2 ;shift right r13 bytes | |
571 | ||
572 | %ifidn %%ENC_DEC, DEC | |
573 | vmovdqa xmm3, xmm1 | |
574 | vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) | |
575 | ||
576 | mov r15, %%PLAIN_CYPH_LEN | |
577 | add r15, r13 | |
578 | sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block | |
579 | jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly | |
580 | sub r12, r15 | |
581 | %%_no_extra_mask_1: | |
582 | ||
583 | vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9 | |
584 | vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 | |
585 | ||
586 | vpand xmm3, xmm1 | |
9f95a23c | 587 | vpshufb xmm3, [rel SHUF_MASK] |
11fdf7f2 TL |
588 | vpshufb xmm3, xmm2 |
589 | vpxor %%AAD_HASH, xmm3 | |
590 | ||
591 | ||
592 | cmp r15,0 | |
593 | jl %%_partial_incomplete_1 | |
594 | ||
595 | GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block | |
596 | xor rax,rax | |
597 | mov [%%GDATA_CTX + PBlockLen], rax | |
598 | jmp %%_dec_done | |
599 | %%_partial_incomplete_1: | |
600 | %ifidn __OUTPUT_FORMAT__, win64 | |
601 | mov rax, %%PLAIN_CYPH_LEN | |
602 | add [%%GDATA_CTX + PBlockLen], rax | |
603 | %else | |
604 | add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN | |
605 | %endif | |
606 | %%_dec_done: | |
607 | vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH | |
608 | ||
609 | %else | |
610 | vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) | |
611 | ||
612 | mov r15, %%PLAIN_CYPH_LEN | |
613 | add r15, r13 | |
614 | sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block | |
615 | jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly | |
616 | sub r12, r15 | |
617 | %%_no_extra_mask_2: | |
618 | ||
619 | vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 | |
620 | vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 | |
621 | ||
9f95a23c | 622 | vpshufb xmm9, [rel SHUF_MASK] |
11fdf7f2 TL |
623 | vpshufb xmm9, xmm2 |
624 | vpxor %%AAD_HASH, xmm9 | |
625 | ||
626 | cmp r15,0 | |
627 | jl %%_partial_incomplete_2 | |
628 | ||
629 | GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block | |
630 | xor rax,rax | |
631 | mov [%%GDATA_CTX + PBlockLen], rax | |
632 | jmp %%_encode_done | |
633 | %%_partial_incomplete_2: | |
634 | %ifidn __OUTPUT_FORMAT__, win64 | |
635 | mov rax, %%PLAIN_CYPH_LEN | |
636 | add [%%GDATA_CTX + PBlockLen], rax | |
637 | %else | |
638 | add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN | |
639 | %endif | |
640 | %%_encode_done: | |
641 | vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH | |
642 | ||
9f95a23c | 643 | vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext |
11fdf7f2 TL |
644 | vpshufb xmm9, xmm2 |
645 | %endif | |
646 | ||
647 | ||
648 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
649 | ; output encrypted Bytes | |
650 | cmp r15,0 | |
651 | jl %%_partial_fill | |
652 | mov r12, r13 | |
653 | mov r13, 16 | |
654 | sub r13, r12 ; Set r13 to be the number of bytes to write out | |
655 | jmp %%_count_set | |
656 | %%_partial_fill: | |
657 | mov r13, %%PLAIN_CYPH_LEN | |
658 | %%_count_set: | |
659 | vmovq rax, xmm9 | |
660 | cmp r13, 8 | |
661 | jle %%_less_than_8_bytes_left | |
662 | ||
663 | mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax | |
664 | add %%DATA_OFFSET, 8 | |
665 | vpsrldq xmm9, xmm9, 8 | |
666 | vmovq rax, xmm9 | |
667 | sub r13, 8 | |
668 | %%_less_than_8_bytes_left: | |
669 | mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al | |
670 | add %%DATA_OFFSET, 1 | |
671 | shr rax, 8 | |
672 | sub r13, 1 | |
673 | jne %%_less_than_8_bytes_left | |
674 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
675 | ||
676 | %%_partial_block_done: | |
677 | %endmacro ; PARTIAL_BLOCK | |
678 | ||
679 | ||
680 | %macro GHASH_SINGLE_MUL 9 | |
681 | %define %%GDATA %1 | |
682 | %define %%HASHKEY %2 | |
683 | %define %%CIPHER %3 | |
684 | %define %%STATE_11 %4 | |
685 | %define %%STATE_00 %5 | |
686 | %define %%STATE_MID %6 | |
687 | %define %%T1 %7 | |
688 | %define %%T2 %8 | |
689 | %define %%FIRST %9 | |
690 | ||
691 | vmovdqu %%T1, [%%GDATA + %%HASHKEY] | |
692 | %ifidn %%FIRST, first | |
693 | vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 | |
694 | vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 | |
695 | vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 | |
696 | vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 | |
697 | vpxor %%STATE_MID, %%STATE_MID, %%T2 | |
698 | %else | |
699 | vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 | |
700 | vpxor %%STATE_11, %%STATE_11, %%T2 | |
701 | ||
702 | vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 | |
703 | vpxor %%STATE_00, %%STATE_00, %%T2 | |
704 | ||
705 | vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 | |
706 | vpxor %%STATE_MID, %%STATE_MID, %%T2 | |
707 | ||
708 | vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 | |
709 | vpxor %%STATE_MID, %%STATE_MID, %%T2 | |
710 | %endif | |
711 | ||
712 | %endmacro | |
713 | ||
714 | ; if a = number of total plaintext bytes | |
715 | ; b = floor(a/16) | |
716 | ; %%num_initial_blocks = b mod 8; | |
717 | ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext | |
718 | ; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. | |
719 | ; Updated AAD_HASH is returned in %%T3 | |
720 | ||
721 | %macro INITIAL_BLOCKS 23 | |
722 | %define %%GDATA_KEY %1 | |
723 | %define %%CYPH_PLAIN_OUT %2 | |
724 | %define %%PLAIN_CYPH_IN %3 | |
725 | %define %%LENGTH %4 | |
726 | %define %%DATA_OFFSET %5 | |
727 | %define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 | |
728 | %define %%T1 %7 | |
729 | %define %%T2 %8 | |
730 | %define %%T3 %9 | |
731 | %define %%T4 %10 | |
732 | %define %%T5 %11 | |
733 | %define %%CTR %12 | |
734 | %define %%XMM1 %13 | |
735 | %define %%XMM2 %14 | |
736 | %define %%XMM3 %15 | |
737 | %define %%XMM4 %16 | |
738 | %define %%XMM5 %17 | |
739 | %define %%XMM6 %18 | |
740 | %define %%XMM7 %19 | |
741 | %define %%XMM8 %20 | |
742 | %define %%T6 %21 | |
743 | %define %%T_key %22 | |
744 | %define %%ENC_DEC %23 | |
745 | ||
746 | %assign i (8-%%num_initial_blocks) | |
747 | ;; Move AAD_HASH to temp reg | |
748 | vmovdqu %%T2, %%XMM8 | |
749 | ;; Start AES for %%num_initial_blocks blocks | |
750 | ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 | |
751 | ||
752 | %assign i (9-%%num_initial_blocks) | |
753 | %rep %%num_initial_blocks | |
9f95a23c | 754 | vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 |
11fdf7f2 | 755 | vmovdqa reg(i), %%CTR |
9f95a23c | 756 | vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap |
11fdf7f2 TL |
757 | %assign i (i+1) |
758 | %endrep | |
759 | ||
760 | %if(%%num_initial_blocks>0) | |
761 | vmovdqu %%T_key, [%%GDATA_KEY+16*0] | |
762 | %assign i (9-%%num_initial_blocks) | |
763 | %rep %%num_initial_blocks | |
764 | vpxor reg(i),reg(i),%%T_key | |
765 | %assign i (i+1) | |
766 | %endrep | |
767 | ||
768 | %assign j 1 | |
769 | %rep NROUNDS | |
770 | vmovdqu %%T_key, [%%GDATA_KEY+16*j] | |
771 | %assign i (9-%%num_initial_blocks) | |
772 | %rep %%num_initial_blocks | |
773 | vaesenc reg(i),%%T_key | |
774 | %assign i (i+1) | |
775 | %endrep | |
776 | ||
777 | %assign j (j+1) | |
778 | %endrep | |
779 | ||
780 | ||
781 | vmovdqu %%T_key, [%%GDATA_KEY+16*j] | |
782 | %assign i (9-%%num_initial_blocks) | |
783 | %rep %%num_initial_blocks | |
784 | vaesenclast reg(i),%%T_key | |
785 | %assign i (i+1) | |
786 | %endrep | |
787 | ||
788 | %endif ; %if(%%num_initial_blocks>0) | |
789 | ||
790 | ||
791 | ||
792 | %assign i (9-%%num_initial_blocks) | |
793 | %rep %%num_initial_blocks | |
794 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] | |
795 | vpxor reg(i), reg(i), %%T1 | |
796 | ;; Write back ciphertext for %%num_initial_blocks blocks | |
797 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) | |
798 | add %%DATA_OFFSET, 16 | |
799 | %ifidn %%ENC_DEC, DEC | |
800 | vmovdqa reg(i), %%T1 | |
801 | %endif | |
802 | ;; Prepare ciphertext for GHASH computations | |
9f95a23c | 803 | vpshufb reg(i), [rel SHUF_MASK] |
11fdf7f2 TL |
804 | %assign i (i+1) |
805 | %endrep | |
806 | ||
807 | ||
808 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
809 | ||
810 | %assign i (9-%%num_initial_blocks) | |
811 | %if(%%num_initial_blocks>0) | |
812 | vmovdqa %%T3, reg(i) | |
813 | %assign i (i+1) | |
814 | %endif | |
9f95a23c | 815 | %if(%%num_initial_blocks>1) |
11fdf7f2 TL |
816 | %rep %%num_initial_blocks-1 |
817 | vmovdqu [rsp + TMP %+ i], reg(i) | |
818 | %assign i (i+1) | |
819 | %endrep | |
9f95a23c | 820 | %endif |
11fdf7f2 TL |
821 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
822 | ;; Haskey_i_k holds XORed values of the low and high parts of | |
823 | ;; the Haskey_i | |
9f95a23c TL |
824 | vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0 |
825 | vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0 | |
826 | vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0 | |
827 | vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0 | |
828 | vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0 | |
829 | vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0 | |
830 | vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0 | |
831 | vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0 | |
11fdf7f2 TL |
832 | vmovdqa %%CTR, %%XMM8 |
833 | ||
9f95a23c TL |
834 | vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap |
835 | vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap | |
836 | vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap | |
837 | vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap | |
838 | vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap | |
839 | vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap | |
840 | vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap | |
841 | vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap | |
11fdf7f2 TL |
842 | |
843 | vmovdqu %%T_key, [%%GDATA_KEY+16*0] | |
844 | vpxor %%XMM1, %%XMM1, %%T_key | |
845 | vpxor %%XMM2, %%XMM2, %%T_key | |
846 | vpxor %%XMM3, %%XMM3, %%T_key | |
847 | vpxor %%XMM4, %%XMM4, %%T_key | |
848 | vpxor %%XMM5, %%XMM5, %%T_key | |
849 | vpxor %%XMM6, %%XMM6, %%T_key | |
850 | vpxor %%XMM7, %%XMM7, %%T_key | |
851 | vpxor %%XMM8, %%XMM8, %%T_key | |
852 | ||
853 | %assign i (8-%%num_initial_blocks) | |
854 | %assign j (9-%%num_initial_blocks) | |
855 | %assign k (%%num_initial_blocks) | |
856 | ||
857 | %define %%T4_2 %%T4 | |
858 | %if(%%num_initial_blocks>0) | |
859 | ;; Hash in AES state | |
860 | ;; T2 - incoming AAD hash | |
861 | vpxor %%T2, %%T3 | |
862 | ||
863 | ;; GDATA, HASHKEY, CIPHER, | |
864 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
865 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
866 | %%T1, %%T4, %%T6, %%T5, %%T3, first | |
867 | %endif | |
868 | ||
869 | vmovdqu %%T_key, [%%GDATA_KEY+16*1] | |
870 | vaesenc %%XMM1, %%T_key | |
871 | vaesenc %%XMM2, %%T_key | |
872 | vaesenc %%XMM3, %%T_key | |
873 | vaesenc %%XMM4, %%T_key | |
874 | vaesenc %%XMM5, %%T_key | |
875 | vaesenc %%XMM6, %%T_key | |
876 | vaesenc %%XMM7, %%T_key | |
877 | vaesenc %%XMM8, %%T_key | |
878 | ||
879 | vmovdqu %%T_key, [%%GDATA_KEY+16*2] | |
880 | vaesenc %%XMM1, %%T_key | |
881 | vaesenc %%XMM2, %%T_key | |
882 | vaesenc %%XMM3, %%T_key | |
883 | vaesenc %%XMM4, %%T_key | |
884 | vaesenc %%XMM5, %%T_key | |
885 | vaesenc %%XMM6, %%T_key | |
886 | vaesenc %%XMM7, %%T_key | |
887 | vaesenc %%XMM8, %%T_key | |
888 | ||
889 | %assign i (i+1) | |
890 | %assign j (j+1) | |
891 | %assign k (k-1) | |
892 | %if(%%num_initial_blocks>1) | |
893 | ;; GDATA, HASHKEY, CIPHER, | |
894 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
895 | vmovdqu %%T2, [rsp + TMP %+ j] | |
896 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
897 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
898 | %endif | |
899 | ||
900 | vmovdqu %%T_key, [%%GDATA_KEY+16*3] | |
901 | vaesenc %%XMM1, %%T_key | |
902 | vaesenc %%XMM2, %%T_key | |
903 | vaesenc %%XMM3, %%T_key | |
904 | vaesenc %%XMM4, %%T_key | |
905 | vaesenc %%XMM5, %%T_key | |
906 | vaesenc %%XMM6, %%T_key | |
907 | vaesenc %%XMM7, %%T_key | |
908 | vaesenc %%XMM8, %%T_key | |
909 | ||
910 | vmovdqu %%T_key, [%%GDATA_KEY+16*4] | |
911 | vaesenc %%XMM1, %%T_key | |
912 | vaesenc %%XMM2, %%T_key | |
913 | vaesenc %%XMM3, %%T_key | |
914 | vaesenc %%XMM4, %%T_key | |
915 | vaesenc %%XMM5, %%T_key | |
916 | vaesenc %%XMM6, %%T_key | |
917 | vaesenc %%XMM7, %%T_key | |
918 | vaesenc %%XMM8, %%T_key | |
919 | ||
920 | %assign i (i+1) | |
921 | %assign j (j+1) | |
922 | %assign k (k-1) | |
923 | %if(%%num_initial_blocks>2) | |
924 | ;; GDATA, HASHKEY, CIPHER, | |
925 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
926 | vmovdqu %%T2, [rsp + TMP %+ j] | |
927 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
928 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
929 | %endif | |
930 | ||
931 | %assign i (i+1) | |
932 | %assign j (j+1) | |
933 | %assign k (k-1) | |
934 | %if(%%num_initial_blocks>3) | |
935 | ;; GDATA, HASHKEY, CIPHER, | |
936 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
937 | vmovdqu %%T2, [rsp + TMP %+ j] | |
938 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
939 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
940 | %endif | |
941 | ||
942 | vmovdqu %%T_key, [%%GDATA_KEY+16*5] | |
943 | vaesenc %%XMM1, %%T_key | |
944 | vaesenc %%XMM2, %%T_key | |
945 | vaesenc %%XMM3, %%T_key | |
946 | vaesenc %%XMM4, %%T_key | |
947 | vaesenc %%XMM5, %%T_key | |
948 | vaesenc %%XMM6, %%T_key | |
949 | vaesenc %%XMM7, %%T_key | |
950 | vaesenc %%XMM8, %%T_key | |
951 | ||
952 | vmovdqu %%T_key, [%%GDATA_KEY+16*6] | |
953 | vaesenc %%XMM1, %%T_key | |
954 | vaesenc %%XMM2, %%T_key | |
955 | vaesenc %%XMM3, %%T_key | |
956 | vaesenc %%XMM4, %%T_key | |
957 | vaesenc %%XMM5, %%T_key | |
958 | vaesenc %%XMM6, %%T_key | |
959 | vaesenc %%XMM7, %%T_key | |
960 | vaesenc %%XMM8, %%T_key | |
961 | ||
962 | %assign i (i+1) | |
963 | %assign j (j+1) | |
964 | %assign k (k-1) | |
965 | %if(%%num_initial_blocks>4) | |
966 | ;; GDATA, HASHKEY, CIPHER, | |
967 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
968 | vmovdqu %%T2, [rsp + TMP %+ j] | |
969 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
970 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
971 | %endif | |
972 | ||
973 | vmovdqu %%T_key, [%%GDATA_KEY+16*7] | |
974 | vaesenc %%XMM1, %%T_key | |
975 | vaesenc %%XMM2, %%T_key | |
976 | vaesenc %%XMM3, %%T_key | |
977 | vaesenc %%XMM4, %%T_key | |
978 | vaesenc %%XMM5, %%T_key | |
979 | vaesenc %%XMM6, %%T_key | |
980 | vaesenc %%XMM7, %%T_key | |
981 | vaesenc %%XMM8, %%T_key | |
982 | ||
983 | vmovdqu %%T_key, [%%GDATA_KEY+16*8] | |
984 | vaesenc %%XMM1, %%T_key | |
985 | vaesenc %%XMM2, %%T_key | |
986 | vaesenc %%XMM3, %%T_key | |
987 | vaesenc %%XMM4, %%T_key | |
988 | vaesenc %%XMM5, %%T_key | |
989 | vaesenc %%XMM6, %%T_key | |
990 | vaesenc %%XMM7, %%T_key | |
991 | vaesenc %%XMM8, %%T_key | |
992 | ||
993 | %assign i (i+1) | |
994 | %assign j (j+1) | |
995 | %assign k (k-1) | |
996 | %if(%%num_initial_blocks>5) | |
997 | ;; GDATA, HASHKEY, CIPHER, | |
998 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
999 | vmovdqu %%T2, [rsp + TMP %+ j] | |
1000 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
1001 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
1002 | %endif | |
1003 | ||
1004 | vmovdqu %%T_key, [%%GDATA_KEY+16*9] | |
1005 | vaesenc %%XMM1, %%T_key | |
1006 | vaesenc %%XMM2, %%T_key | |
1007 | vaesenc %%XMM3, %%T_key | |
1008 | vaesenc %%XMM4, %%T_key | |
1009 | vaesenc %%XMM5, %%T_key | |
1010 | vaesenc %%XMM6, %%T_key | |
1011 | vaesenc %%XMM7, %%T_key | |
1012 | vaesenc %%XMM8, %%T_key | |
1013 | ||
1014 | %ifndef GCM128_MODE | |
1015 | vmovdqu %%T_key, [%%GDATA_KEY+16*10] | |
1016 | vaesenc %%XMM1, %%T_key | |
1017 | vaesenc %%XMM2, %%T_key | |
1018 | vaesenc %%XMM3, %%T_key | |
1019 | vaesenc %%XMM4, %%T_key | |
1020 | vaesenc %%XMM5, %%T_key | |
1021 | vaesenc %%XMM6, %%T_key | |
1022 | vaesenc %%XMM7, %%T_key | |
1023 | vaesenc %%XMM8, %%T_key | |
1024 | %endif | |
1025 | ||
1026 | %assign i (i+1) | |
1027 | %assign j (j+1) | |
1028 | %assign k (k-1) | |
1029 | %if(%%num_initial_blocks>6) | |
1030 | ;; GDATA, HASHKEY, CIPHER, | |
1031 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
1032 | vmovdqu %%T2, [rsp + TMP %+ j] | |
1033 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
1034 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
1035 | %endif | |
1036 | ||
1037 | %ifdef GCM128_MODE | |
1038 | vmovdqu %%T_key, [%%GDATA_KEY+16*10] | |
1039 | vaesenclast %%XMM1, %%T_key | |
1040 | vaesenclast %%XMM2, %%T_key | |
1041 | vaesenclast %%XMM3, %%T_key | |
1042 | vaesenclast %%XMM4, %%T_key | |
1043 | vaesenclast %%XMM5, %%T_key | |
1044 | vaesenclast %%XMM6, %%T_key | |
1045 | vaesenclast %%XMM7, %%T_key | |
1046 | vaesenclast %%XMM8, %%T_key | |
1047 | %endif | |
1048 | ||
1049 | %ifdef GCM192_MODE | |
1050 | vmovdqu %%T_key, [%%GDATA_KEY+16*11] | |
1051 | vaesenc %%XMM1, %%T_key | |
1052 | vaesenc %%XMM2, %%T_key | |
1053 | vaesenc %%XMM3, %%T_key | |
1054 | vaesenc %%XMM4, %%T_key | |
1055 | vaesenc %%XMM5, %%T_key | |
1056 | vaesenc %%XMM6, %%T_key | |
1057 | vaesenc %%XMM7, %%T_key | |
1058 | vaesenc %%XMM8, %%T_key | |
1059 | ||
1060 | vmovdqu %%T_key, [%%GDATA_KEY+16*12] | |
1061 | vaesenclast %%XMM1, %%T_key | |
1062 | vaesenclast %%XMM2, %%T_key | |
1063 | vaesenclast %%XMM3, %%T_key | |
1064 | vaesenclast %%XMM4, %%T_key | |
1065 | vaesenclast %%XMM5, %%T_key | |
1066 | vaesenclast %%XMM6, %%T_key | |
1067 | vaesenclast %%XMM7, %%T_key | |
1068 | vaesenclast %%XMM8, %%T_key | |
1069 | %endif | |
1070 | %ifdef GCM256_MODE | |
1071 | vmovdqu %%T_key, [%%GDATA_KEY+16*11] | |
1072 | vaesenc %%XMM1, %%T_key | |
1073 | vaesenc %%XMM2, %%T_key | |
1074 | vaesenc %%XMM3, %%T_key | |
1075 | vaesenc %%XMM4, %%T_key | |
1076 | vaesenc %%XMM5, %%T_key | |
1077 | vaesenc %%XMM6, %%T_key | |
1078 | vaesenc %%XMM7, %%T_key | |
1079 | vaesenc %%XMM8, %%T_key | |
1080 | ||
1081 | vmovdqu %%T_key, [%%GDATA_KEY+16*12] | |
1082 | vaesenc %%XMM1, %%T_key | |
1083 | vaesenc %%XMM2, %%T_key | |
1084 | vaesenc %%XMM3, %%T_key | |
1085 | vaesenc %%XMM4, %%T_key | |
1086 | vaesenc %%XMM5, %%T_key | |
1087 | vaesenc %%XMM6, %%T_key | |
1088 | vaesenc %%XMM7, %%T_key | |
1089 | vaesenc %%XMM8, %%T_key | |
1090 | %endif | |
1091 | ||
1092 | %assign i (i+1) | |
1093 | %assign j (j+1) | |
1094 | %assign k (k-1) | |
1095 | %if(%%num_initial_blocks>7) | |
1096 | ;; GDATA, HASHKEY, CIPHER, | |
1097 | ;; STATE_11, STATE_00, STATE_MID, T1, T2 | |
1098 | vmovdqu %%T2, [rsp + TMP %+ j] | |
1099 | GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ | |
1100 | %%T1, %%T4, %%T6, %%T5, %%T3, not_first | |
1101 | %endif | |
1102 | ||
1103 | %ifdef GCM256_MODE ; GCM256 | |
1104 | vmovdqu %%T_key, [%%GDATA_KEY+16*13] | |
1105 | vaesenc %%XMM1, %%T_key | |
1106 | vaesenc %%XMM2, %%T_key | |
1107 | vaesenc %%XMM3, %%T_key | |
1108 | vaesenc %%XMM4, %%T_key | |
1109 | vaesenc %%XMM5, %%T_key | |
1110 | vaesenc %%XMM6, %%T_key | |
1111 | vaesenc %%XMM7, %%T_key | |
1112 | vaesenc %%XMM8, %%T_key | |
1113 | ||
1114 | vmovdqu %%T_key, [%%GDATA_KEY+16*14] | |
1115 | vaesenclast %%XMM1, %%T_key | |
1116 | vaesenclast %%XMM2, %%T_key | |
1117 | vaesenclast %%XMM3, %%T_key | |
1118 | vaesenclast %%XMM4, %%T_key | |
1119 | vaesenclast %%XMM5, %%T_key | |
1120 | vaesenclast %%XMM6, %%T_key | |
1121 | vaesenclast %%XMM7, %%T_key | |
1122 | vaesenclast %%XMM8, %%T_key | |
1123 | %endif ; GCM256 mode | |
1124 | ||
1125 | %if(%%num_initial_blocks>0) | |
1126 | vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs | |
1127 | vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs | |
1128 | vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 | |
1129 | vpxor %%T4, %%T6, %%T4 | |
1130 | ||
1131 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1132 | ; First phase of the reduction | |
9f95a23c | 1133 | vmovdqa %%T3, [rel POLY2] |
11fdf7f2 TL |
1134 | |
1135 | vpclmulqdq %%T2, %%T3, %%T4, 0x01 | |
1136 | vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs | |
1137 | ||
1138 | ;; First phase of the reduction complete | |
1139 | vpxor %%T4, %%T4, %%T2 | |
1140 | ||
1141 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1142 | ; Second phase of the reduction | |
1143 | vpclmulqdq %%T2, %%T3, %%T4, 0x00 | |
1144 | ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
1145 | vpsrldq %%T2, %%T2, 4 | |
1146 | ||
1147 | vpclmulqdq %%T4, %%T3, %%T4, 0x10 | |
1148 | ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
1149 | vpslldq %%T4, %%T4, 4 | |
1150 | ;; Second phase of the reduction complete | |
1151 | vpxor %%T4, %%T4, %%T2 | |
1152 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1153 | ; The result is in %%T3 | |
1154 | vpxor %%T3, %%T1, %%T4 | |
1155 | %else | |
1156 | ;; The hash should end up in T3 | |
1157 | vmovdqa %%T3, %%T2 | |
1158 | %endif | |
1159 | ||
1160 | ;; Final hash is now in T3 | |
1161 | %if %%num_initial_blocks > 0 | |
1162 | ;; NOTE: obsolete in case %%num_initial_blocks = 0 | |
1163 | sub %%LENGTH, 16*%%num_initial_blocks | |
1164 | %endif | |
1165 | ||
1166 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] | |
1167 | vpxor %%XMM1, %%XMM1, %%T1 | |
1168 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 | |
1169 | %ifidn %%ENC_DEC, DEC | |
1170 | vmovdqa %%XMM1, %%T1 | |
1171 | %endif | |
1172 | ||
1173 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] | |
1174 | vpxor %%XMM2, %%XMM2, %%T1 | |
1175 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 | |
1176 | %ifidn %%ENC_DEC, DEC | |
1177 | vmovdqa %%XMM2, %%T1 | |
1178 | %endif | |
1179 | ||
1180 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] | |
1181 | vpxor %%XMM3, %%XMM3, %%T1 | |
1182 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 | |
1183 | %ifidn %%ENC_DEC, DEC | |
1184 | vmovdqa %%XMM3, %%T1 | |
1185 | %endif | |
1186 | ||
1187 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] | |
1188 | vpxor %%XMM4, %%XMM4, %%T1 | |
1189 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 | |
1190 | %ifidn %%ENC_DEC, DEC | |
1191 | vmovdqa %%XMM4, %%T1 | |
1192 | %endif | |
1193 | ||
1194 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] | |
1195 | vpxor %%XMM5, %%XMM5, %%T1 | |
1196 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 | |
1197 | %ifidn %%ENC_DEC, DEC | |
1198 | vmovdqa %%XMM5, %%T1 | |
1199 | %endif | |
1200 | ||
1201 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] | |
1202 | vpxor %%XMM6, %%XMM6, %%T1 | |
1203 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 | |
1204 | %ifidn %%ENC_DEC, DEC | |
1205 | vmovdqa %%XMM6, %%T1 | |
1206 | %endif | |
1207 | ||
1208 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] | |
1209 | vpxor %%XMM7, %%XMM7, %%T1 | |
1210 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 | |
1211 | %ifidn %%ENC_DEC, DEC | |
1212 | vmovdqa %%XMM7, %%T1 | |
1213 | %endif | |
1214 | ||
1215 | %if %%num_initial_blocks > 0 | |
1216 | ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 | |
1217 | ;; This macro is executed for lenght 128 and up, | |
1218 | ;; zero length is checked in GCM_ENC_DEC. | |
1219 | ;; If the last block is partial then the xor will be done later | |
1220 | ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. | |
1221 | ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 | |
1222 | cmp %%LENGTH, 128 | |
1223 | jl %%_initial_skip_last_word_write | |
1224 | %endif | |
1225 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] | |
1226 | vpxor %%XMM8, %%XMM8, %%T1 | |
1227 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 | |
1228 | %ifidn %%ENC_DEC, DEC | |
1229 | vmovdqa %%XMM8, %%T1 | |
1230 | %endif | |
1231 | ||
1232 | ;; Update %%LENGTH with the number of blocks processed | |
1233 | sub %%LENGTH, 16 | |
1234 | add %%DATA_OFFSET, 16 | |
1235 | %%_initial_skip_last_word_write: | |
1236 | sub %%LENGTH, 128-16 | |
1237 | add %%DATA_OFFSET, 128-16 | |
1238 | ||
9f95a23c | 1239 | vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap |
11fdf7f2 TL |
1240 | ;; Combine GHASHed value with the corresponding ciphertext |
1241 | vpxor %%XMM1, %%XMM1, %%T3 | |
9f95a23c TL |
1242 | vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap |
1243 | vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap | |
1244 | vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap | |
1245 | vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap | |
1246 | vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap | |
1247 | vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap | |
1248 | vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap | |
11fdf7f2 TL |
1249 | |
1250 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1251 | ||
1252 | %%_initial_blocks_done: | |
1253 | ||
1254 | ||
1255 | %endmacro | |
1256 | ||
1257 | ;;; INITIAL_BLOCKS macro with support for a partial final block. | |
1258 | ;;; num_initial_blocks is expected to include the partial final block | |
1259 | ;;; in the count. | |
1260 | %macro INITIAL_BLOCKS_PARTIAL 25 | |
1261 | %define %%GDATA_KEY %1 | |
1262 | %define %%GDATA_CTX %2 | |
1263 | %define %%CYPH_PLAIN_OUT %3 | |
1264 | %define %%PLAIN_CYPH_IN %4 | |
1265 | %define %%LENGTH %5 | |
1266 | %define %%DATA_OFFSET %6 | |
1267 | %define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) | |
1268 | %define %%T1 %8 | |
1269 | %define %%T2 %9 | |
1270 | %define %%T3 %10 | |
1271 | %define %%T4 %11 | |
1272 | %define %%T5 %12 | |
1273 | %define %%CTR %13 | |
1274 | %define %%XMM1 %14 | |
1275 | %define %%XMM2 %15 | |
1276 | %define %%XMM3 %16 | |
1277 | %define %%XMM4 %17 | |
1278 | %define %%XMM5 %18 | |
1279 | %define %%XMM6 %19 | |
1280 | %define %%XMM7 %20 | |
1281 | %define %%XMM8 %21 | |
1282 | %define %%T6 %22 | |
1283 | %define %%T_key %23 | |
1284 | %define %%ENC_DEC %24 | |
1285 | %define %%INSTANCE_TYPE %25 | |
1286 | ||
1287 | %assign i (8-%%num_initial_blocks) | |
1288 | ;; Move AAD_HASH to temp reg | |
1289 | vmovdqu %%T2, %%XMM8 | |
1290 | ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 | |
1291 | ||
1292 | %assign i (9-%%num_initial_blocks) | |
1293 | %rep %%num_initial_blocks | |
1294 | ;; Compute AES counters | |
1295 | vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 | |
1296 | vmovdqa reg(i), %%CTR | |
1297 | vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap | |
1298 | %assign i (i+1) | |
1299 | %endrep | |
1300 | ||
1301 | vmovdqu %%T_key, [%%GDATA_KEY+16*0] | |
1302 | %assign i (9-%%num_initial_blocks) | |
1303 | %rep %%num_initial_blocks | |
1304 | ; Start AES for %%num_initial_blocks blocks | |
1305 | vpxor reg(i),reg(i),%%T_key | |
1306 | %assign i (i+1) | |
1307 | %endrep | |
1308 | ||
1309 | %assign j 1 | |
1310 | %rep NROUNDS | |
1311 | vmovdqu %%T_key, [%%GDATA_KEY+16*j] | |
1312 | %assign i (9-%%num_initial_blocks) | |
1313 | %rep %%num_initial_blocks | |
1314 | vaesenc reg(i),%%T_key | |
1315 | %assign i (i+1) | |
1316 | %endrep | |
1317 | ||
1318 | %assign j (j+1) | |
1319 | %endrep | |
1320 | ||
1321 | ||
1322 | vmovdqu %%T_key, [%%GDATA_KEY+16*j] | |
1323 | %assign i (9-%%num_initial_blocks) | |
1324 | %rep %%num_initial_blocks | |
1325 | vaesenclast reg(i),%%T_key | |
1326 | %assign i (i+1) | |
1327 | %endrep | |
1328 | ||
1329 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1330 | ;;; Hash all but the last block of data | |
1331 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1332 | ||
1333 | %assign i (9-%%num_initial_blocks) | |
1334 | %rep %%num_initial_blocks-1 | |
1335 | ;; Encrypt the message for all but the last block | |
1336 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] | |
1337 | vpxor reg(i), reg(i), %%T1 | |
1338 | ;; write back ciphertext for %%num_initial_blocks blocks | |
1339 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) | |
1340 | add %%DATA_OFFSET, 16 | |
1341 | %ifidn %%ENC_DEC, DEC | |
1342 | vmovdqa reg(i), %%T1 | |
1343 | %endif | |
1344 | ;; Prepare ciphertext for GHASH computations | |
1345 | vpshufb reg(i), [rel SHUF_MASK] | |
1346 | %assign i (i+1) | |
1347 | %endrep | |
1348 | ||
1349 | ;; The final block of data may be <16B | |
1350 | sub %%LENGTH, 16*(%%num_initial_blocks-1) | |
1351 | ||
1352 | %if %%num_initial_blocks < 8 | |
1353 | ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. | |
1354 | ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. | |
1355 | cmp %%LENGTH, 16 | |
1356 | jl %%_small_initial_partial_block | |
1357 | ||
1358 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1359 | ;;; Handle a full length final block - encrypt and hash all blocks | |
1360 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1361 | ||
1362 | sub %%LENGTH, 16 | |
1363 | mov [%%GDATA_CTX + PBlockLen], %%LENGTH | |
1364 | ||
1365 | ;; Encrypt the message | |
1366 | VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] | |
1367 | vpxor reg(i), reg(i), %%T1 | |
1368 | ;; write back ciphertext for %%num_initial_blocks blocks | |
1369 | VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) | |
1370 | add %%DATA_OFFSET, 16 | |
1371 | %ifidn %%ENC_DEC, DEC | |
1372 | vmovdqa reg(i), %%T1 | |
1373 | %endif | |
1374 | ;; Prepare ciphertext for GHASH computations | |
1375 | vpshufb reg(i), [rel SHUF_MASK] | |
1376 | ||
1377 | ;; Hash all of the data | |
1378 | %assign i (8-%%num_initial_blocks) | |
1379 | %assign j (9-%%num_initial_blocks) | |
1380 | %assign k (%%num_initial_blocks) | |
1381 | %assign last_block_to_hash 0 | |
1382 | ||
1383 | %if(%%num_initial_blocks>last_block_to_hash) | |
1384 | ;; Hash in AES state | |
1385 | vpxor %%T2, reg(j) | |
1386 | ||
1387 | ;; T2 - incoming AAD hash | |
1388 | ;; reg(i) holds ciphertext | |
1389 | ;; T5 - hash key | |
1390 | ;; T6 - updated xor | |
1391 | ;; reg(1)/xmm1 should now be available for tmp use | |
1392 | vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] | |
1393 | vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 | |
1394 | vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 | |
1395 | vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 | |
1396 | vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 | |
1397 | vpxor %%T6, %%T6, %%T5 | |
1398 | %endif | |
1399 | ||
1400 | %assign i (i+1) | |
1401 | %assign j (j+1) | |
1402 | %assign k (k-1) | |
1403 | %assign rep_count (%%num_initial_blocks-1) | |
1404 | %rep rep_count | |
1405 | ||
1406 | vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] | |
1407 | vpclmulqdq %%T3, reg(j), %%T5, 0x11 | |
1408 | vpxor %%T1, %%T1, %%T3 | |
1409 | ||
1410 | vpclmulqdq %%T3, reg(j), %%T5, 0x00 | |
1411 | vpxor %%T4, %%T4, %%T3 | |
1412 | ||
1413 | vpclmulqdq %%T3, reg(j), %%T5, 0x01 | |
1414 | vpxor %%T6, %%T6, %%T3 | |
1415 | ||
1416 | vpclmulqdq %%T3, reg(j), %%T5, 0x10 | |
1417 | vpxor %%T6, %%T6, %%T3 | |
1418 | ||
1419 | %assign i (i+1) | |
1420 | %assign j (j+1) | |
1421 | %assign k (k-1) | |
1422 | %endrep | |
1423 | ||
1424 | ;; Record that a reduction is needed | |
1425 | mov r12, 1 | |
1426 | ||
1427 | jmp %%_small_initial_compute_hash | |
1428 | ||
1429 | ||
1430 | %endif ; %if %%num_initial_blocks < 8 | |
1431 | ||
1432 | %%_small_initial_partial_block: | |
1433 | ||
1434 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1435 | ;;; Handle ghash for a <16B final block | |
1436 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1437 | ||
1438 | ;; In this case if it's a single call to encrypt we can | |
1439 | ;; hash all of the data but if it's an init / update / finalize | |
1440 | ;; series of call we need to leave the last block if it's | |
1441 | ;; less than a full block of data. | |
1442 | ||
1443 | mov [%%GDATA_CTX + PBlockLen], %%LENGTH | |
1444 | vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) | |
1445 | ;; Handle a partial final block | |
1446 | ;; GDATA, KEY, T1, T2 | |
1447 | ;; r13 - length | |
1448 | ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long | |
1449 | ;; NOTE: could be replaced with %%LENGTH but at this point | |
1450 | ;; %%LENGTH is always less than 16. | |
1451 | ;; No PLAIN_CYPH_LEN argument available in this macro. | |
1452 | ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET | |
9f95a23c | 1453 | vpshufb reg(i), [rel SHUF_MASK] |
11fdf7f2 TL |
1454 | |
1455 | %ifidn %%INSTANCE_TYPE, multi_call | |
1456 | %assign i (8-%%num_initial_blocks) | |
1457 | %assign j (9-%%num_initial_blocks) | |
1458 | %assign k (%%num_initial_blocks-1) | |
1459 | %assign last_block_to_hash 1 | |
1460 | %else | |
1461 | %assign i (8-%%num_initial_blocks) | |
1462 | %assign j (9-%%num_initial_blocks) | |
1463 | %assign k (%%num_initial_blocks) | |
1464 | %assign last_block_to_hash 0 | |
1465 | %endif | |
1466 | ||
1467 | %if(%%num_initial_blocks>last_block_to_hash) | |
1468 | ;; Record that a reduction is needed | |
1469 | mov r12, 1 | |
1470 | ;; Hash in AES state | |
1471 | vpxor %%T2, reg(j) | |
1472 | ||
1473 | ;; T2 - incoming AAD hash | |
1474 | ;; reg(i) holds ciphertext | |
1475 | ;; T5 - hash key | |
1476 | ;; T6 - updated xor | |
1477 | ;; reg(1)/xmm1 should now be available for tmp use | |
1478 | vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] | |
1479 | vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 | |
1480 | vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 | |
1481 | vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 | |
1482 | vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 | |
1483 | vpxor %%T6, %%T6, %%T5 | |
1484 | %else | |
1485 | ;; Record that a reduction is not needed - | |
1486 | ;; In this case no hashes are computed because there | |
1487 | ;; is only one initial block and it is < 16B in length. | |
1488 | mov r12, 0 | |
1489 | %endif | |
1490 | ||
1491 | %assign i (i+1) | |
1492 | %assign j (j+1) | |
1493 | %assign k (k-1) | |
1494 | %ifidn %%INSTANCE_TYPE, multi_call | |
1495 | %assign rep_count (%%num_initial_blocks-2) | |
1496 | %%_multi_call_hash: | |
1497 | %else | |
1498 | %assign rep_count (%%num_initial_blocks-1) | |
1499 | %endif | |
9f95a23c TL |
1500 | |
1501 | %if rep_count < 0 | |
1502 | ;; quick fix for negative rep_count (to be investigated) | |
1503 | %assign rep_count 0 | |
1504 | %endif | |
1505 | ||
11fdf7f2 TL |
1506 | %rep rep_count |
1507 | ||
1508 | vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] | |
1509 | vpclmulqdq %%T3, reg(j), %%T5, 0x11 | |
1510 | vpxor %%T1, %%T1, %%T3 | |
1511 | ||
1512 | vpclmulqdq %%T3, reg(j), %%T5, 0x00 | |
1513 | vpxor %%T4, %%T4, %%T3 | |
1514 | ||
1515 | vpclmulqdq %%T3, reg(j), %%T5, 0x01 | |
1516 | vpxor %%T6, %%T6, %%T3 | |
1517 | ||
1518 | vpclmulqdq %%T3, reg(j), %%T5, 0x10 | |
1519 | vpxor %%T6, %%T6, %%T3 | |
1520 | ||
1521 | %assign i (i+1) | |
1522 | %assign j (j+1) | |
1523 | %assign k (k-1) | |
1524 | %endrep | |
1525 | ||
1526 | %%_small_initial_compute_hash: | |
1527 | ||
1528 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1529 | ;;; Ghash reduction | |
1530 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1531 | ||
1532 | %if(%%num_initial_blocks=1) | |
1533 | %ifidn %%INSTANCE_TYPE, multi_call | |
1534 | ;; We only need to check if a reduction is needed if | |
1535 | ;; initial_blocks == 1 and init/update/final is being used. | |
1536 | ;; In this case we may just have a partial block, and that | |
1537 | ;; gets hashed in finalize. | |
1538 | cmp r12, 0 | |
1539 | je %%_no_reduction_needed | |
1540 | %endif | |
1541 | %endif | |
1542 | ||
1543 | vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs | |
1544 | vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs | |
1545 | vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 | |
1546 | vpxor %%T4, %%T6, %%T4 | |
1547 | ||
1548 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1549 | ;; First phase of the reduction | |
9f95a23c | 1550 | vmovdqa %%T3, [rel POLY2] |
11fdf7f2 TL |
1551 | |
1552 | vpclmulqdq %%T2, %%T3, %%T4, 0x01 | |
1553 | ;; shift-L xmm2 2 DWs | |
1554 | vpslldq %%T2, %%T2, 8 | |
1555 | vpxor %%T4, %%T4, %%T2 | |
1556 | ||
1557 | ;; First phase of the reduction complete | |
1558 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1559 | ;; Second phase of the reduction | |
1560 | ||
1561 | vpclmulqdq %%T2, %%T3, %%T4, 0x00 | |
1562 | ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
1563 | vpsrldq %%T2, %%T2, 4 | |
1564 | ||
1565 | vpclmulqdq %%T4, %%T3, %%T4, 0x10 | |
1566 | ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
1567 | vpslldq %%T4, %%T4, 4 | |
1568 | ||
1569 | vpxor %%T4, %%T4, %%T2 | |
1570 | ;; Second phase of the reduction complete | |
1571 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1572 | vpxor %%T3, %%T1, %%T4 | |
1573 | ||
1574 | %ifidn %%INSTANCE_TYPE, multi_call | |
1575 | ;; If using init/update/finalize, we need to xor any partial block data | |
1576 | ;; into the hash. | |
1577 | %if %%num_initial_blocks > 1 | |
1578 | ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place | |
1579 | %if %%num_initial_blocks != 8 | |
1580 | ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero | |
1581 | cmp qword [%%GDATA_CTX + PBlockLen], 0 | |
1582 | je %%_no_partial_block_xor | |
1583 | %endif ; %%num_initial_blocks != 8 | |
1584 | vpxor %%T3, %%T3, reg(8) | |
1585 | %%_no_partial_block_xor: | |
1586 | %endif ; %%num_initial_blocks > 1 | |
1587 | %endif ; %%INSTANCE_TYPE, multi_call | |
1588 | ||
1589 | %if(%%num_initial_blocks=1) | |
1590 | %ifidn %%INSTANCE_TYPE, multi_call | |
1591 | ;; NOTE: %%_no_reduction_needed case only valid for | |
1592 | ;; multi_call with initial_blocks = 1. | |
1593 | ;; Look for comment above around '_no_reduction_needed' | |
1594 | ;; The jmp below is obsolete as the code will fall through. | |
1595 | ||
1596 | ;; The result is in %%T3 | |
1597 | jmp %%_after_reduction | |
1598 | ||
1599 | %%_no_reduction_needed: | |
1600 | ;; The hash should end up in T3. The only way we should get here is if | |
1601 | ;; there is a partial block of data, so xor that into the hash. | |
1602 | vpxor %%T3, %%T2, reg(8) | |
1603 | %endif ; %%INSTANCE_TYPE = multi_call | |
1604 | %endif ; %%num_initial_blocks=1 | |
1605 | ||
1606 | %%_after_reduction: | |
1607 | ;; Final hash is now in T3 | |
1608 | ||
1609 | %endmacro ; INITIAL_BLOCKS_PARTIAL | |
1610 | ||
1611 | ||
1612 | ||
1613 | ; encrypt 8 blocks at a time | |
1614 | ; ghash the 8 previously encrypted ciphertext blocks | |
1615 | ; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified | |
1616 | ; %%DATA_OFFSET is the data offset value | |
1617 | %macro GHASH_8_ENCRYPT_8_PARALLEL 23 | |
1618 | %define %%GDATA %1 | |
1619 | %define %%CYPH_PLAIN_OUT %2 | |
1620 | %define %%PLAIN_CYPH_IN %3 | |
1621 | %define %%DATA_OFFSET %4 | |
1622 | %define %%T1 %5 | |
1623 | %define %%T2 %6 | |
1624 | %define %%T3 %7 | |
1625 | %define %%T4 %8 | |
1626 | %define %%T5 %9 | |
1627 | %define %%T6 %10 | |
1628 | %define %%CTR %11 | |
1629 | %define %%XMM1 %12 | |
1630 | %define %%XMM2 %13 | |
1631 | %define %%XMM3 %14 | |
1632 | %define %%XMM4 %15 | |
1633 | %define %%XMM5 %16 | |
1634 | %define %%XMM6 %17 | |
1635 | %define %%XMM7 %18 | |
1636 | %define %%XMM8 %19 | |
1637 | %define %%T7 %20 | |
1638 | %define %%loop_idx %21 | |
1639 | %define %%ENC_DEC %22 | |
1640 | %define %%FULL_PARTIAL %23 | |
1641 | ||
1642 | vmovdqa %%T2, %%XMM1 | |
1643 | vmovdqu [rsp + TMP2], %%XMM2 | |
1644 | vmovdqu [rsp + TMP3], %%XMM3 | |
1645 | vmovdqu [rsp + TMP4], %%XMM4 | |
1646 | vmovdqu [rsp + TMP5], %%XMM5 | |
1647 | vmovdqu [rsp + TMP6], %%XMM6 | |
1648 | vmovdqu [rsp + TMP7], %%XMM7 | |
1649 | vmovdqu [rsp + TMP8], %%XMM8 | |
1650 | ||
1651 | %ifidn %%loop_idx, in_order | |
9f95a23c TL |
1652 | vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT |
1653 | vmovdqa %%T5, [rel TWO] | |
11fdf7f2 TL |
1654 | vpaddd %%XMM2, %%CTR, %%T5 |
1655 | vpaddd %%XMM3, %%XMM1, %%T5 | |
1656 | vpaddd %%XMM4, %%XMM2, %%T5 | |
1657 | vpaddd %%XMM5, %%XMM3, %%T5 | |
1658 | vpaddd %%XMM6, %%XMM4, %%T5 | |
1659 | vpaddd %%XMM7, %%XMM5, %%T5 | |
1660 | vpaddd %%XMM8, %%XMM6, %%T5 | |
1661 | vmovdqa %%CTR, %%XMM8 | |
1662 | ||
9f95a23c | 1663 | vmovdqa %%T5, [rel SHUF_MASK] |
11fdf7f2 TL |
1664 | vpshufb %%XMM1, %%T5 ; perform a 16Byte swap |
1665 | vpshufb %%XMM2, %%T5 ; perform a 16Byte swap | |
1666 | vpshufb %%XMM3, %%T5 ; perform a 16Byte swap | |
1667 | vpshufb %%XMM4, %%T5 ; perform a 16Byte swap | |
1668 | vpshufb %%XMM5, %%T5 ; perform a 16Byte swap | |
1669 | vpshufb %%XMM6, %%T5 ; perform a 16Byte swap | |
1670 | vpshufb %%XMM7, %%T5 ; perform a 16Byte swap | |
1671 | vpshufb %%XMM8, %%T5 ; perform a 16Byte swap | |
1672 | %else | |
9f95a23c TL |
1673 | vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT |
1674 | vmovdqa %%T5, [rel TWOf] | |
11fdf7f2 TL |
1675 | vpaddd %%XMM2, %%CTR, %%T5 |
1676 | vpaddd %%XMM3, %%XMM1, %%T5 | |
1677 | vpaddd %%XMM4, %%XMM2, %%T5 | |
1678 | vpaddd %%XMM5, %%XMM3, %%T5 | |
1679 | vpaddd %%XMM6, %%XMM4, %%T5 | |
1680 | vpaddd %%XMM7, %%XMM5, %%T5 | |
1681 | vpaddd %%XMM8, %%XMM6, %%T5 | |
1682 | vmovdqa %%CTR, %%XMM8 | |
1683 | %endif | |
1684 | ||
1685 | ||
1686 | ||
1687 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1688 | ||
1689 | vmovdqu %%T1, [%%GDATA + 16*0] | |
1690 | vpxor %%XMM1, %%XMM1, %%T1 | |
1691 | vpxor %%XMM2, %%XMM2, %%T1 | |
1692 | vpxor %%XMM3, %%XMM3, %%T1 | |
1693 | vpxor %%XMM4, %%XMM4, %%T1 | |
1694 | vpxor %%XMM5, %%XMM5, %%T1 | |
1695 | vpxor %%XMM6, %%XMM6, %%T1 | |
1696 | vpxor %%XMM7, %%XMM7, %%T1 | |
1697 | vpxor %%XMM8, %%XMM8, %%T1 | |
1698 | ||
1699 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1700 | ||
1701 | vmovdqu %%T1, [%%GDATA + 16*1] | |
1702 | vaesenc %%XMM1, %%T1 | |
1703 | vaesenc %%XMM2, %%T1 | |
1704 | vaesenc %%XMM3, %%T1 | |
1705 | vaesenc %%XMM4, %%T1 | |
1706 | vaesenc %%XMM5, %%T1 | |
1707 | vaesenc %%XMM6, %%T1 | |
1708 | vaesenc %%XMM7, %%T1 | |
1709 | vaesenc %%XMM8, %%T1 | |
1710 | ||
1711 | ||
1712 | vmovdqu %%T1, [%%GDATA + 16*2] | |
1713 | vaesenc %%XMM1, %%T1 | |
1714 | vaesenc %%XMM2, %%T1 | |
1715 | vaesenc %%XMM3, %%T1 | |
1716 | vaesenc %%XMM4, %%T1 | |
1717 | vaesenc %%XMM5, %%T1 | |
1718 | vaesenc %%XMM6, %%T1 | |
1719 | vaesenc %%XMM7, %%T1 | |
1720 | vaesenc %%XMM8, %%T1 | |
1721 | ||
1722 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1723 | ||
1724 | vmovdqu %%T5, [%%GDATA + HashKey_8] | |
1725 | vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 | |
1726 | vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 | |
1727 | vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 | |
1728 | vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 | |
1729 | vpxor %%T6, %%T6, %%T5 | |
1730 | ||
1731 | vmovdqu %%T1, [%%GDATA + 16*3] | |
1732 | vaesenc %%XMM1, %%T1 | |
1733 | vaesenc %%XMM2, %%T1 | |
1734 | vaesenc %%XMM3, %%T1 | |
1735 | vaesenc %%XMM4, %%T1 | |
1736 | vaesenc %%XMM5, %%T1 | |
1737 | vaesenc %%XMM6, %%T1 | |
1738 | vaesenc %%XMM7, %%T1 | |
1739 | vaesenc %%XMM8, %%T1 | |
1740 | ||
1741 | vmovdqu %%T1, [rsp + TMP2] | |
1742 | vmovdqu %%T5, [%%GDATA + HashKey_7] | |
1743 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1744 | vpxor %%T4, %%T4, %%T3 | |
1745 | ||
1746 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1747 | vpxor %%T7, %%T7, %%T3 | |
1748 | ||
1749 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1750 | vpxor %%T6, %%T6, %%T3 | |
1751 | ||
1752 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1753 | vpxor %%T6, %%T6, %%T3 | |
1754 | ||
1755 | vmovdqu %%T1, [%%GDATA + 16*4] | |
1756 | vaesenc %%XMM1, %%T1 | |
1757 | vaesenc %%XMM2, %%T1 | |
1758 | vaesenc %%XMM3, %%T1 | |
1759 | vaesenc %%XMM4, %%T1 | |
1760 | vaesenc %%XMM5, %%T1 | |
1761 | vaesenc %%XMM6, %%T1 | |
1762 | vaesenc %%XMM7, %%T1 | |
1763 | vaesenc %%XMM8, %%T1 | |
1764 | ||
1765 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1766 | vmovdqu %%T1, [rsp + TMP3] | |
1767 | vmovdqu %%T5, [%%GDATA + HashKey_6] | |
1768 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1769 | vpxor %%T4, %%T4, %%T3 | |
1770 | ||
1771 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1772 | vpxor %%T7, %%T7, %%T3 | |
1773 | ||
1774 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1775 | vpxor %%T6, %%T6, %%T3 | |
1776 | ||
1777 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1778 | vpxor %%T6, %%T6, %%T3 | |
1779 | ||
1780 | vmovdqu %%T1, [%%GDATA + 16*5] | |
1781 | vaesenc %%XMM1, %%T1 | |
1782 | vaesenc %%XMM2, %%T1 | |
1783 | vaesenc %%XMM3, %%T1 | |
1784 | vaesenc %%XMM4, %%T1 | |
1785 | vaesenc %%XMM5, %%T1 | |
1786 | vaesenc %%XMM6, %%T1 | |
1787 | vaesenc %%XMM7, %%T1 | |
1788 | vaesenc %%XMM8, %%T1 | |
1789 | ||
1790 | ||
1791 | vmovdqu %%T1, [rsp + TMP4] | |
1792 | vmovdqu %%T5, [%%GDATA + HashKey_5] | |
1793 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1794 | vpxor %%T4, %%T4, %%T3 | |
1795 | ||
1796 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1797 | vpxor %%T7, %%T7, %%T3 | |
1798 | ||
1799 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1800 | vpxor %%T6, %%T6, %%T3 | |
1801 | ||
1802 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1803 | vpxor %%T6, %%T6, %%T3 | |
1804 | ||
1805 | vmovdqu %%T1, [%%GDATA + 16*6] | |
1806 | vaesenc %%XMM1, %%T1 | |
1807 | vaesenc %%XMM2, %%T1 | |
1808 | vaesenc %%XMM3, %%T1 | |
1809 | vaesenc %%XMM4, %%T1 | |
1810 | vaesenc %%XMM5, %%T1 | |
1811 | vaesenc %%XMM6, %%T1 | |
1812 | vaesenc %%XMM7, %%T1 | |
1813 | vaesenc %%XMM8, %%T1 | |
1814 | ||
1815 | vmovdqu %%T1, [rsp + TMP5] | |
1816 | vmovdqu %%T5, [%%GDATA + HashKey_4] | |
1817 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1818 | vpxor %%T4, %%T4, %%T3 | |
1819 | ||
1820 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1821 | vpxor %%T7, %%T7, %%T3 | |
1822 | ||
1823 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1824 | vpxor %%T6, %%T6, %%T3 | |
1825 | ||
1826 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1827 | vpxor %%T6, %%T6, %%T3 | |
1828 | ||
1829 | vmovdqu %%T1, [%%GDATA + 16*7] | |
1830 | vaesenc %%XMM1, %%T1 | |
1831 | vaesenc %%XMM2, %%T1 | |
1832 | vaesenc %%XMM3, %%T1 | |
1833 | vaesenc %%XMM4, %%T1 | |
1834 | vaesenc %%XMM5, %%T1 | |
1835 | vaesenc %%XMM6, %%T1 | |
1836 | vaesenc %%XMM7, %%T1 | |
1837 | vaesenc %%XMM8, %%T1 | |
1838 | ||
1839 | vmovdqu %%T1, [rsp + TMP6] | |
1840 | vmovdqu %%T5, [%%GDATA + HashKey_3] | |
1841 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1842 | vpxor %%T4, %%T4, %%T3 | |
1843 | ||
1844 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1845 | vpxor %%T7, %%T7, %%T3 | |
1846 | ||
1847 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1848 | vpxor %%T6, %%T6, %%T3 | |
1849 | ||
1850 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1851 | vpxor %%T6, %%T6, %%T3 | |
1852 | ||
1853 | vmovdqu %%T1, [%%GDATA + 16*8] | |
1854 | vaesenc %%XMM1, %%T1 | |
1855 | vaesenc %%XMM2, %%T1 | |
1856 | vaesenc %%XMM3, %%T1 | |
1857 | vaesenc %%XMM4, %%T1 | |
1858 | vaesenc %%XMM5, %%T1 | |
1859 | vaesenc %%XMM6, %%T1 | |
1860 | vaesenc %%XMM7, %%T1 | |
1861 | vaesenc %%XMM8, %%T1 | |
1862 | ||
1863 | vmovdqu %%T1, [rsp + TMP7] | |
1864 | vmovdqu %%T5, [%%GDATA + HashKey_2] | |
1865 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1866 | vpxor %%T4, %%T4, %%T3 | |
1867 | ||
1868 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1869 | vpxor %%T7, %%T7, %%T3 | |
1870 | ||
1871 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1872 | vpxor %%T6, %%T6, %%T3 | |
1873 | ||
1874 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1875 | vpxor %%T6, %%T6, %%T3 | |
1876 | ||
1877 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1878 | ||
1879 | vmovdqu %%T5, [%%GDATA + 16*9] | |
1880 | vaesenc %%XMM1, %%T5 | |
1881 | vaesenc %%XMM2, %%T5 | |
1882 | vaesenc %%XMM3, %%T5 | |
1883 | vaesenc %%XMM4, %%T5 | |
1884 | vaesenc %%XMM5, %%T5 | |
1885 | vaesenc %%XMM6, %%T5 | |
1886 | vaesenc %%XMM7, %%T5 | |
1887 | vaesenc %%XMM8, %%T5 | |
1888 | ||
1889 | vmovdqu %%T1, [rsp + TMP8] | |
1890 | vmovdqu %%T5, [%%GDATA + HashKey] | |
1891 | ||
1892 | ||
1893 | vpclmulqdq %%T3, %%T1, %%T5, 0x00 | |
1894 | vpxor %%T7, %%T7, %%T3 | |
1895 | ||
1896 | vpclmulqdq %%T3, %%T1, %%T5, 0x01 | |
1897 | vpxor %%T6, %%T6, %%T3 | |
1898 | ||
1899 | vpclmulqdq %%T3, %%T1, %%T5, 0x10 | |
1900 | vpxor %%T6, %%T6, %%T3 | |
1901 | ||
1902 | vpclmulqdq %%T3, %%T1, %%T5, 0x11 | |
1903 | vpxor %%T1, %%T4, %%T3 | |
1904 | ||
1905 | ||
1906 | vmovdqu %%T5, [%%GDATA + 16*10] | |
1907 | %ifndef GCM128_MODE ; GCM192 or GCM256 | |
1908 | vaesenc %%XMM1, %%T5 | |
1909 | vaesenc %%XMM2, %%T5 | |
1910 | vaesenc %%XMM3, %%T5 | |
1911 | vaesenc %%XMM4, %%T5 | |
1912 | vaesenc %%XMM5, %%T5 | |
1913 | vaesenc %%XMM6, %%T5 | |
1914 | vaesenc %%XMM7, %%T5 | |
1915 | vaesenc %%XMM8, %%T5 | |
1916 | ||
1917 | vmovdqu %%T5, [%%GDATA + 16*11] | |
1918 | vaesenc %%XMM1, %%T5 | |
1919 | vaesenc %%XMM2, %%T5 | |
1920 | vaesenc %%XMM3, %%T5 | |
1921 | vaesenc %%XMM4, %%T5 | |
1922 | vaesenc %%XMM5, %%T5 | |
1923 | vaesenc %%XMM6, %%T5 | |
1924 | vaesenc %%XMM7, %%T5 | |
1925 | vaesenc %%XMM8, %%T5 | |
1926 | ||
1927 | vmovdqu %%T5, [%%GDATA + 16*12] | |
1928 | %endif | |
1929 | %ifdef GCM256_MODE | |
1930 | vaesenc %%XMM1, %%T5 | |
1931 | vaesenc %%XMM2, %%T5 | |
1932 | vaesenc %%XMM3, %%T5 | |
1933 | vaesenc %%XMM4, %%T5 | |
1934 | vaesenc %%XMM5, %%T5 | |
1935 | vaesenc %%XMM6, %%T5 | |
1936 | vaesenc %%XMM7, %%T5 | |
1937 | vaesenc %%XMM8, %%T5 | |
1938 | ||
1939 | vmovdqu %%T5, [%%GDATA + 16*13] | |
1940 | vaesenc %%XMM1, %%T5 | |
1941 | vaesenc %%XMM2, %%T5 | |
1942 | vaesenc %%XMM3, %%T5 | |
1943 | vaesenc %%XMM4, %%T5 | |
1944 | vaesenc %%XMM5, %%T5 | |
1945 | vaesenc %%XMM6, %%T5 | |
1946 | vaesenc %%XMM7, %%T5 | |
1947 | vaesenc %%XMM8, %%T5 | |
1948 | ||
1949 | vmovdqu %%T5, [%%GDATA + 16*14] | |
1950 | %endif ; GCM256 | |
1951 | ||
1952 | %assign i 0 | |
1953 | %assign j 1 | |
1954 | %rep 8 | |
1955 | ||
1956 | ;; SNP TBD: This is pretty ugly - consider whether just XORing the | |
1957 | ;; data in after vaesenclast is simpler and performant. Would | |
1958 | ;; also have to ripple it through partial block and ghash_mul_8. | |
1959 | %ifidn %%FULL_PARTIAL, full | |
1960 | %ifdef NT_LD | |
1961 | VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] | |
1962 | vpxor %%T2, %%T2, %%T5 | |
1963 | %else | |
1964 | vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] | |
1965 | %endif | |
1966 | ||
1967 | %ifidn %%ENC_DEC, ENC | |
1968 | vaesenclast reg(j), reg(j), %%T2 | |
1969 | %else | |
1970 | vaesenclast %%T3, reg(j), %%T2 | |
1971 | vpxor reg(j), %%T2, %%T5 | |
1972 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 | |
1973 | %endif | |
1974 | ||
1975 | %else | |
1976 | ; Don't read the final data during partial block processing | |
1977 | %ifdef NT_LD | |
1978 | %if (i<7) | |
1979 | VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] | |
1980 | vpxor %%T2, %%T2, %%T5 | |
1981 | %else | |
1982 | ;; Stage the key directly in T2 rather than hash it with plaintext | |
1983 | vmovdqu %%T2, %%T5 | |
1984 | %endif | |
1985 | %else | |
1986 | %if (i<7) | |
1987 | vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] | |
1988 | %else | |
1989 | ;; Stage the key directly in T2 rather than hash it with plaintext | |
1990 | vmovdqu %%T2, %%T5 | |
1991 | %endif | |
1992 | %endif | |
1993 | ||
1994 | %ifidn %%ENC_DEC, ENC | |
1995 | vaesenclast reg(j), reg(j), %%T2 | |
1996 | %else | |
1997 | %if (i<7) | |
1998 | vaesenclast %%T3, reg(j), %%T2 | |
1999 | vpxor reg(j), %%T2, %%T5 | |
2000 | ;; Do not read the data since it could fault | |
2001 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 | |
2002 | %else | |
2003 | vaesenclast reg(j), reg(j), %%T2 | |
2004 | %endif | |
2005 | %endif | |
2006 | %endif | |
2007 | ||
2008 | %assign i (i+1) | |
2009 | %assign j (j+1) | |
2010 | %endrep | |
2011 | ||
2012 | ||
2013 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2014 | ||
2015 | ||
2016 | vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs | |
2017 | vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs | |
2018 | vpxor %%T7, %%T7, %%T3 | |
2019 | vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 | |
2020 | ||
2021 | ||
2022 | ||
2023 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2024 | ;first phase of the reduction | |
9f95a23c | 2025 | vmovdqa %%T3, [rel POLY2] |
11fdf7f2 TL |
2026 | |
2027 | vpclmulqdq %%T2, %%T3, %%T7, 0x01 | |
2028 | vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs | |
2029 | ||
2030 | vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete | |
2031 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2032 | ||
2033 | %ifidn %%ENC_DEC, ENC | |
2034 | ; Write to the Ciphertext buffer | |
2035 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 | |
2036 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 | |
2037 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 | |
2038 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 | |
2039 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 | |
2040 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 | |
2041 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 | |
2042 | %ifidn %%FULL_PARTIAL, full | |
2043 | ;; Avoid writing past the buffer if handling a partial block | |
2044 | VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 | |
2045 | %endif | |
2046 | %endif | |
2047 | ||
2048 | ||
2049 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2050 | ;second phase of the reduction | |
2051 | vpclmulqdq %%T2, %%T3, %%T7, 0x00 | |
2052 | vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
2053 | ||
2054 | vpclmulqdq %%T4, %%T3, %%T7, 0x10 | |
2055 | vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
2056 | ||
2057 | vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete | |
2058 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2059 | vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 | |
2060 | ||
9f95a23c TL |
2061 | vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap |
2062 | vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap | |
2063 | vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap | |
2064 | vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap | |
2065 | vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap | |
2066 | vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap | |
2067 | vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap | |
2068 | vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap | |
11fdf7f2 TL |
2069 | |
2070 | ||
2071 | vpxor %%XMM1, %%T1 | |
2072 | ||
2073 | ||
2074 | %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL | |
2075 | ||
2076 | ||
; GHASH the last 8 ciphertext blocks (XMM1..XMM8, newest in XMM8).
; Uses the Karatsuba method: per block, two vpclmulqdq for high/low
; products plus one on the folded halves for the middle term, against
; HashKey^8 .. HashKey^1 respectively.  Result is reduced modulo the
; GHASH polynomial and left in %%T6.
%macro  GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16

        ;; Karatsuba Method

        ;; Block 1 * HashKey^8: initializes the accumulators
        ;; %%T6 (high), %%T7 (low), %%XMM1 (middle).
        vmovdqu         %%T5, [%%GDATA + HashKey_8]

        vpshufd         %%T2, %%XMM1, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM1
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00

        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 2 * HashKey^7
        vmovdqu         %%T5, [%%GDATA + HashKey_7]
        vpshufd         %%T2, %%XMM2, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM2
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 3 * HashKey^6
        vmovdqu         %%T5, [%%GDATA + HashKey_6]
        vpshufd         %%T2, %%XMM3, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM3
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 4 * HashKey^5
        vmovdqu         %%T5, [%%GDATA + HashKey_5]
        vpshufd         %%T2, %%XMM4, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM4
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 5 * HashKey^4
        vmovdqu         %%T5, [%%GDATA + HashKey_4]
        vpshufd         %%T2, %%XMM5, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM5
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 6 * HashKey^3
        vmovdqu         %%T5, [%%GDATA + HashKey_3]
        vpshufd         %%T2, %%XMM6, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM6
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 7 * HashKey^2
        vmovdqu         %%T5, [%%GDATA + HashKey_2]
        vpshufd         %%T2, %%XMM7, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM7
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 8 * HashKey^1
        vmovdqu         %%T5, [%%GDATA + HashKey]
        vpshufd         %%T2, %%XMM8, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM8
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        ;; Karatsuba fix-up: middle = middle ^ high ^ low
        vpxor           %%XMM1, %%XMM1, %%T2
        vpxor           %%XMM1, %%XMM1, %%T6
        vpxor           %%T2, %%XMM1, %%T7

        ;; Spread the middle term across the 256-bit result <%%T6:%%T7>
        vpslldq %%T4, %%T2, 8
        vpsrldq %%T2, %%T2, 8

        vpxor   %%T7, %%T7, %%T4
        vpxor   %%T6, %%T6, %%T2        ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; first phase of the reduction
        vmovdqa %%T3, [rel POLY2]

        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8   ; shift-L xmm2 2 DWs

        vpxor           %%T7, %%T7, %%T2        ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;; second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4   ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%T4, %%T4, %%T2        ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T6, %%T6, %%T4        ; the result is in %%T6
%endmacro
2269 | ||
2270 | ||
; GHASH the last 7 ciphertext blocks (XMM1..XMM7, newest in XMM7).
; Same Karatsuba structure as GHASH_LAST_8 but starting from HashKey^7
; and ending at HashKey^1.  Reduced result is left in %%T6.
%macro  GHASH_LAST_7 15
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15

        ;; Karatsuba Method

        ;; Block 1 * HashKey^7: initializes the accumulators
        ;; %%T6 (high), %%T7 (low), %%XMM1 (middle).
        vmovdqu         %%T5, [%%GDATA + HashKey_7]

        vpshufd         %%T2, %%XMM1, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM1
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00

        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 2 * HashKey^6
        vmovdqu         %%T5, [%%GDATA + HashKey_6]
        vpshufd         %%T2, %%XMM2, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM2
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 3 * HashKey^5
        vmovdqu         %%T5, [%%GDATA + HashKey_5]
        vpshufd         %%T2, %%XMM3, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM3
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 4 * HashKey^4
        vmovdqu         %%T5, [%%GDATA + HashKey_4]
        vpshufd         %%T2, %%XMM4, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM4
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 5 * HashKey^3
        vmovdqu         %%T5, [%%GDATA + HashKey_3]
        vpshufd         %%T2, %%XMM5, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM5
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 6 * HashKey^2
        vmovdqu         %%T5, [%%GDATA + HashKey_2]
        vpshufd         %%T2, %%XMM6, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM6
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Block 7 * HashKey^1
        vmovdqu         %%T5, [%%GDATA + HashKey_1]
        vpshufd         %%T2, %%XMM7, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM7
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Karatsuba fix-up: middle = middle ^ high ^ low
        vpxor           %%XMM1, %%XMM1, %%T6
        vpxor           %%T2, %%XMM1, %%T7

        ;; Spread the middle term across the 256-bit result <%%T6:%%T7>
        vpslldq %%T4, %%T2, 8
        vpsrldq %%T2, %%T2, 8

        vpxor   %%T7, %%T7, %%T4
        vpxor   %%T6, %%T6, %%T2        ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; first phase of the reduction
        vmovdqa %%T3, [rel POLY2]

        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8   ; shift-L xmm2 2 DWs

        vpxor           %%T7, %%T7, %%T2        ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;; second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4   ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%T4, %%T4, %%T2        ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T6, %%T6, %%T4        ; the result is in %%T6
%endmacro
2447 | ||
2448 | ||
2449 | ||
;;; Handle encryption of the final partial block
;;; IN:
;;;   r13  - Number of bytes to read
;;; MODIFIES:
;;;   KEY  - Key for encrypting the partial block (on entry: E(K, Yn);
;;;          on exit: the masked partial cipher/plain text)
;;;   HASH - Current hash value
;;; SMASHES:
;;;   r10, r12, r15, rax
;;;   T1, T2
;;; Note:
;;;   PLAIN_CYPH_LEN, %7, is passed only to determine
;;;   if buffer is big enough to do a 16 byte read & shift.
;;;     'LT16' is passed here only if buffer is known to be smaller
;;;     than 16 bytes.
;;;     Any other value passed here will result in 16 byte read
;;;     code path.
;;; TBD: Remove HASH from the instantiation
%macro  ENCRYPT_FINAL_PARTIAL_BLOCK 8
%define %%KEY             %1
%define %%T1              %2
%define %%T2              %3
%define %%CYPH_PLAIN_OUT  %4
%define %%PLAIN_CYPH_IN   %5
%define %%PLAIN_CYPH_LEN  %6
%define %%ENC_DEC         %7
%define %%DATA_OFFSET     %8

        ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting
%ifidn %%PLAIN_CYPH_LEN, LT16
        ;; Handle the case where the message is < 16 bytes:
        ;; read byte-by-byte, cannot over-read the buffer.
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]

        ;; T1            - packed output
        ;; r10           - input data address
        ;; r13           - input data length
        ;; r12, r15, rax - temp registers
        READ_SMALL_DATA_INPUT   %%T1, r10, r13, r12, r15, rax

        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13
%else
        ;; Handle the case where the message is >= 16 bytes:
        ;; safe to do a (possibly overlapping) 16-byte load ending
        ;; at the last input byte, then shift the data into place.
        sub     %%DATA_OFFSET, 16
        add     %%DATA_OFFSET, r13
        ;; Receive the last <16 Byte block
        vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
        sub     %%DATA_OFFSET, r13
        add     %%DATA_OFFSET, 16

        lea     r12, [SHIFT_MASK + 16]
        ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        ;; (r13 is the number of bytes in plaintext mod 16)
        sub     r12, r13
        ;; Get the appropriate shuffle mask
        vmovdqu %%T2, [r12]
        ;; shift right 16-r13 bytes
        vpshufb %%T1, %%T2
%endif                         ; %%PLAIN_CYPH_LEN, LT16

        ;; At this point T1 contains the partial block data
%ifidn  %%ENC_DEC, DEC
        ;; Plaintext XOR E(K, Yn)
        ;; Set aside the ciphertext
        vmovdqa %%T2, %%T1
        vpxor   %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
        vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of ciphertext
        vpand   %%KEY, %%KEY, %%T1

        ;; Prepare the ciphertext for the hash
        ;; mask out top 16-r13 bytes of the plaintext
        vpand   %%T2, %%T2, %%T1
%else
        ;; Plaintext XOR E(K, Yn)
        vpxor   %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
        vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of %%KEY
        vpand   %%KEY, %%KEY, %%T1
%endif

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Output r13 Bytes: first up to 8 bytes via rax, then the
        ;; remainder one byte at a time (never writes past the buffer).
        vmovq   rax, %%KEY
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        vpsrldq %%T1, %%KEY, 8
        vmovq   rax, %%T1
        sub     r13, 8

%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn  %%ENC_DEC, DEC
        ;; If decrypt, restore the ciphertext into %%KEY
        vmovdqu %%KEY, %%T2
%endif
%endmacro                       ; ENCRYPT_FINAL_PARTIAL_BLOCK
2557 | ||
2558 | ||
2559 | ||
; Encryption of a single block: full AES on %%XMM0 in place using the
; expanded key schedule at %%GDATA (round count comes from NROUNDS).
%macro  ENCRYPT_SINGLE_BLOCK 2
%define %%GDATA %1
%define %%XMM0  %2

                ;; round 0: whitening
                vpxor    %%XMM0, %%XMM0, [%%GDATA+16*0]
%assign i 1
%rep NROUNDS
                vaesenc  %%XMM0, [%%GDATA+16*i]
%assign i (i+1)
%endrep
                ;; final round
                vaesenclast      %%XMM0, [%%GDATA+16*i]
%endmacro
2573 | ||
2574 | ||
2575 | ;; Start of Stack Setup | |
2576 | ||
%macro FUNC_SAVE 0
        ;; Prologue shared by the Update/GMC_ENC entry points.
        ;; Saves the callee-saved GPRs the GCM macros use, keeps the original
        ;; rsp in r14 (FUNC_RESTORE undoes the frame from there), then carves
        ;; out VARIABLE_OFFSET bytes of scratch and aligns rsp to 64 bytes.
        ;; NOTE: the number of pushes must equal STACK_OFFSET.
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r14, rsp                ; remember unaligned rsp for restore

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                ; 64-byte align the scratch area

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 are non-volatile on Windows and must be preserved
        vmovdqu [rsp + LOCAL_STORAGE + 0*16], xmm6
        vmovdqu [rsp + LOCAL_STORAGE + 1*16], xmm7
        vmovdqu [rsp + LOCAL_STORAGE + 2*16], xmm8
        vmovdqu [rsp + LOCAL_STORAGE + 3*16], xmm9
        vmovdqu [rsp + LOCAL_STORAGE + 4*16], xmm10
        vmovdqu [rsp + LOCAL_STORAGE + 5*16], xmm11
        vmovdqu [rsp + LOCAL_STORAGE + 6*16], xmm12
        vmovdqu [rsp + LOCAL_STORAGE + 7*16], xmm13
        vmovdqu [rsp + LOCAL_STORAGE + 8*16], xmm14
        vmovdqu [rsp + LOCAL_STORAGE + 9*16], xmm15
%endif
%endmacro
2603 | ||
2604 | ||
%macro FUNC_RESTORE 0
        ;; Epilogue matching FUNC_SAVE: restores the Windows non-volatile
        ;; XMM registers (reverse order), then the saved rsp from r14 and
        ;; the callee-saved GPRs.

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
        vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
        vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
        vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
        vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
        vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
        vmovdqu xmm9,  [rsp + LOCAL_STORAGE + 3*16]
        vmovdqu xmm8,  [rsp + LOCAL_STORAGE + 2*16]
        vmovdqu xmm7,  [rsp + LOCAL_STORAGE + 1*16]
        vmovdqu xmm6,  [rsp + LOCAL_STORAGE + 0*16]
%endif

        ;; Required for Update/GMC_ENC: undo the aligned scratch frame
        mov     rsp, r14
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro
2627 | ||
2628 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
; Clobbers rax, r10-r13, xmm0-xmm6 and xmm14 (the AAD hash register)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm14

        ;; Hash the AAD (if any) into %%AAD_HASH
        mov     r10, %%A_LEN
        cmp     r10, 0
        je      %%_aad_is_zero

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        jmp     %%_after_aad

%%_aad_is_zero:
        ;; no AAD: the running hash starts at zero
        vpxor   %%AAD_HASH, %%AAD_HASH

%%_after_aad:
        mov     r10, %%A_LEN
        ;; NOTE(review): xmm2/xmm3 hold CALC_AAD_HASH scratch here (or stale
        ;; values when A_LEN == 0), so the PBlockEncKey store below is not
        ;; literally zero.  Harmless because PBlockLen is set to 0 — confirm.
        vpxor   xmm2, xmm3

        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key (don't-care while PBlockLen == 0)
        mov     r10, %%IV
        vmovdqa xmm2, [rel ONEf]                ; read 12 IV bytes and pad with 0x00000001
        vpinsrq xmm2, [r10], 0                  ; IV bytes 0..7
        vpinsrd xmm2, [r10+8], 2                ; IV bytes 8..11
        vmovdqu [%%GDATA_CTX + OrigIV], xmm2    ; ctx_data.orig_IV = iv

        ;; byte-swap the counter block into big-endian GCM order
        vpshufb xmm2, [rel SHUF_MASK]

        vmovdqu [%%GDATA_CTX + CurCount], xmm2  ; ctx_data.current_counter = iv
%endmacro
2675 | ||
; GCM_ENC_DEC_SMALL: small-message path (< 128 bytes of remaining data).
; Dispatches on %%NUM_BLOCKS (1..8, where a trailing partial block counts as
; one) to a fixed-count INITIAL_BLOCKS_PARTIAL expansion which performs the
; AES-CTR work and the GHASH for the whole message in one shot.
%macro  GCM_ENC_DEC_SMALL   12
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%DATA_OFFSET       %7
%define %%LENGTH            %8      ; NOTE: the calls below use r13/xmm regs directly
%define %%NUM_BLOCKS        %9
%define %%CTR               %10
%define %%HASH              %11
%define %%INSTANCE_TYPE     %12

        ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
        ;; cmp %%NUM_BLOCKS, 0
        ;; je  %%_small_initial_blocks_encrypted
        cmp     %%NUM_BLOCKS, 8
        je      %%_small_initial_num_blocks_is_8
        cmp     %%NUM_BLOCKS, 7
        je      %%_small_initial_num_blocks_is_7
        cmp     %%NUM_BLOCKS, 6
        je      %%_small_initial_num_blocks_is_6
        cmp     %%NUM_BLOCKS, 5
        je      %%_small_initial_num_blocks_is_5
        cmp     %%NUM_BLOCKS, 4
        je      %%_small_initial_num_blocks_is_4
        cmp     %%NUM_BLOCKS, 3
        je      %%_small_initial_num_blocks_is_3
        cmp     %%NUM_BLOCKS, 2
        je      %%_small_initial_num_blocks_is_2

        jmp     %%_small_initial_num_blocks_is_1


%%_small_initial_num_blocks_is_8:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_7:
        ;; Register mapping shared by all calls below:
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_6:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_5:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_4:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_3:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_2:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_1:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE

        ;; Note: zero initial blocks not allowed.

%%_small_initial_blocks_encrypted:

%endmacro                       ; GCM_ENC_DEC_SMALL
2764 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC         7
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%INSTANCE_TYPE     %7
%define %%DATA_OFFSET       r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_enc_dec_done

        xor     %%DATA_OFFSET, %%DATA_OFFSET
        ;; Update length of data processed
%ifidn __OUTPUT_FORMAT__, win64
        ;; on win64 %%PLAIN_CYPH_LEN may be a memory operand, stage through rax
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + InLen], rax
%else
        add     [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
%endif
        vmovdqu xmm13, [%%GDATA_KEY + HashKey]          ; xmm13 = H
        vmovdqu xmm8, [%%GDATA_CTX + AadHash]           ; xmm8  = running GHASH

%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: partial block processing makes only sense for multi_call here.
        ;; Used for the update flow - if there was a previous partial
        ;; block fill the remaining bytes here.
        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
%endif

        ;; lift CTR set from initial_blocks to here
%ifidn %%INSTANCE_TYPE, single_call
        vmovdqu xmm9, xmm2              ; counter prepared by GCM_INIT still in xmm2
%else
        vmovdqu xmm9, [%%GDATA_CTX + CurCount]
%endif

        ;; Save the amount of data left to process in r13
        mov     r13, %%PLAIN_CYPH_LEN
%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: %%DATA_OFFSET is zero in single_call case.
        ;; Consequently PLAIN_CYPH_LEN will never be zero after
        ;; %%DATA_OFFSET subtraction below.
        sub     r13, %%DATA_OFFSET

        ;; There may be no more data if it was consumed in the partial block.
        cmp     r13, 0
        je      %%_enc_dec_done
%endif                          ; %%INSTANCE_TYPE, multi_call
        mov     r10, r13

        ;; Determine how many blocks to process in INITIAL
        mov     r12, r13
        shr     r12, 4
        and     r12, 7

        ;; Process one additional block in INITIAL if there is a partial block
        and     r10, 0xf
        blsmsk  r10, r10        ; Set CF if zero
        cmc                     ; Flip CF
        adc     r12, 0x0        ; Process an additional INITIAL block if CF set

        ;; Messages of fewer than 128 bytes are handled by the small message
        ;; code, which can process up to 8 blocks (7 full plus a partial).
        cmp     r13, 128
        jge     %%_large_message_path

        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
        jmp     %%_ghash_done

%%_large_message_path:
        and     r12, 0x7        ; Still, don't allow 8 INITIAL blocks since this will
                                ; can be handled by the x8 partial loop.

        cmp     r12, 0
        je      %%_initial_num_blocks_is_0
        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        ;; Register mapping shared by all INITIAL_BLOCKS calls below:
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC


%%_initial_blocks_encrypted:
        ;; The entire message was encrypted processed in initial and now need to be hashed
        cmp     r13, 0
        je      %%_encrypt_done

        ;; Encrypt the final <16 byte (partial) block, then hash
        cmp     r13, 16
        jl      %%_encrypt_final_partial

        ;; Process 7 full blocks plus a partial block
        cmp     r13, 128
        jl      %%_encrypt_by_8_partial


%%_encrypt_by_8_parallel:
        ;; in_order vs. out_order is an optimization to increment the counter without shuffling
        ;; it back into little endian. r15d keeps track of when we need to increment in order so
        ;; that the carry is handled correctly.
        vmovd   r15d, xmm9
        and     r15d, 255
        vpshufb xmm9, [rel SHUF_MASK]


%%_encrypt_by_8_new:
        ;; if the low counter byte is about to wrap, take the slower in_order path
        cmp     r15d, 255-8
        jg      %%_encrypt_by_8

        ;; Register mapping for GHASH_8_ENCRYPT_8_PARALLEL:
        ;; xmm0  - T1
        ;; xmm10 - T2
        ;; xmm11 - T3
        ;; xmm12 - T4
        ;; xmm13 - T5
        ;; xmm14 - T6
        ;; xmm9  - CTR
        ;; xmm1  - XMM1
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8
        ;; xmm15 - T7
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        cmp     r13, 128
        jge     %%_encrypt_by_8_new

        vpshufb xmm9, [rel SHUF_MASK]
        jmp     %%_encrypt_by_8_parallel_done

%%_encrypt_by_8:
        ;; slow path: shuffle counter to little endian so the carry
        ;; propagates when incrementing in order
        vpshufb xmm9, [rel SHUF_MASK]
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
        vpshufb xmm9, [rel SHUF_MASK]
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        cmp     r13, 128
        jge     %%_encrypt_by_8_new
        vpshufb xmm9, [rel SHUF_MASK]


%%_encrypt_by_8_parallel_done:
        ;; Test to see if we need a by 8 with partial block. At this point
        ;; bytes remaining should be either zero or between 113-127.
        cmp     r13, 0
        je      %%_encrypt_done

%%_encrypt_by_8_partial:
        ;; Shuffle needed to align key for partial block xor. out_order
        ;; is a little faster because it avoids extra shuffles.
        ;; TBD: Might need to account for when we don't have room to increment the counter.

        ;; Process parallel buffers with a final partial block.
        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial

        add     %%DATA_OFFSET, 128-16
        sub     r13, 128-16

%%_encrypt_final_partial:

        vpshufb xmm8, [rel SHUF_MASK]
        mov     [%%GDATA_CTX + PBlockLen], r13
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8

        ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
        ;;                            GDATA,  KEY,   T1,    T2
        ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET

        vpshufb xmm8, [rel SHUF_MASK]


%%_encrypt_done:

        ;; Mapping to macro parameters
        ;; IN:
        ;;   xmm9 contains the counter
        ;;   xmm1-xmm8 contain the xor'd ciphertext
        ;; OUT:
        ;;   xmm14 contains the final hash
        ;;             GDATA,   T1,    T2,    T3,    T4,    T5,    T6,    T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
%ifidn %%INSTANCE_TYPE, multi_call
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        jz      %%_hash_last_8
        GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
        ;; XOR the partial word into the hash
        vpxor   xmm14, xmm14, xmm8
        jmp     %%_ghash_done
%endif
%%_hash_last_8:
        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

%%_ghash_done:
        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
        vmovdqu [%%GDATA_CTX + AadHash], xmm14  ; my_ctx_data.aad hash = xmm14

%%_enc_dec_done:


%endmacro
3049 | ||
3050 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm13 (HashKey
; load), xmm14, xmm15; xmm10 is also passed to GHASH_MUL as scratch — verify
; against the GHASH_MUL definition.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_COMPLETE            6
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%AUTH_TAG              %3
%define %%AUTH_TAG_LEN          %4
%define %%ENC_DEC               %5
%define %%INSTANCE_TYPE         %6
%define %%PLAIN_CYPH_LEN        rax

        vmovdqu xmm13, [%%GDATA_KEY + HashKey]          ; xmm13 = H
        ;; Start AES as early as possible
        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0
        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9          ; E(K, Y0)

%ifidn %%INSTANCE_TYPE, multi_call
        ;; If the GCM function is called as a single function call rather
        ;; than invoking the individual parts (init, update, finalize) we
        ;; can remove a write to read dependency on AadHash.
        ;; (single_call keeps the running hash live in xmm14)
        vmovdqu xmm14, [%%GDATA_CTX + AadHash]

        ;; Encrypt the final partial block. If we did this as a single call then
        ;; the partial block was handled in the main GCM_ENC_DEC macro.
        mov     r12, [%%GDATA_CTX + PBlockLen]
        cmp     r12, 0

        je      %%_partial_done

        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; GHASH computation for the last <16 Byte block
        vmovdqu [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

%endif

        ;; Build the length block len(A)||len(C) (both in bits) and fold it in
        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]

        shl     r12, 3                          ; convert into number of bits
        vmovd   xmm15, r12d                     ; len(A) in xmm15

        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*128)
        vmovq   xmm1, %%PLAIN_CYPH_LEN
        vpslldq xmm15, xmm15, 8                 ; xmm15 = len(A)|| 0x0000000000000000
        vpxor   xmm15, xmm15, xmm1              ; xmm15 = len(A)||len(C)

        vpxor   xmm14, xmm15
        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
        vpshufb xmm14, [rel SHUF_MASK]          ; perform a 16Byte swap

        ;; T = E(K, Y0) XOR GHASH
        vpxor   xmm9, xmm9, xmm14


%%_return_T:
        mov     r10, %%AUTH_TAG                 ; r10 = authTag
        mov     r11, %%AUTH_TAG_LEN             ; r11 = auth_tag_len

        ;; fast paths for the common 16/12/8 byte tags, byte-wise store otherwise
        cmp     r11, 16
        je      %%_T_16

        cmp     r11, 12
        je      %%_T_12

        cmp     r11, 8
        je      %%_T_8

        simd_store_avx r10, xmm9, r11, r12, rax
        jmp     %%_return_T_done
%%_T_8:
        vmovq   rax, xmm9
        mov     [r10], rax
        jmp     %%_return_T_done
%%_T_12:
        vmovq   rax, xmm9
        mov     [r10], rax
        vpsrldq xmm9, xmm9, 8
        vmovd   eax, xmm9
        mov     [r10 + 8], eax
        jmp     %%_return_T_done

%%_T_16:
        vmovdqu [r10], xmm9

%%_return_T_done:
%endmacro ; GCM_COMPLETE
3140 | ||
3141 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_precomp_128_avx_gen4 /
;       aes_gcm_precomp_192_avx_gen4 /
;       aes_gcm_precomp_256_avx_gen4
;       (struct gcm_key_data *key_data)
;
; Computes the GHASH key H = E(K, 0^128), reduces it to H<<1 mod poly and
; stores it plus the power-of-H table (via PRECOMPUTE) into key_data.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(precomp,_),function,)
FN_NAME(precomp,_):
        push    r12
        push    r13
        push    r14
        push    r15

        mov     r14, rsp                ; save rsp before alignment

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                ; align rsp to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; only xmm6 needs to be maintained
        vmovdqu [rsp + LOCAL_STORAGE + 0*16], xmm6
%endif

        vpxor   xmm6, xmm6
        ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey = E(K, 0^128)

        vpshufb xmm6, [rel SHUF_MASK]   ; byte-swap into GHASH bit order
        ;;;;;;;;;;;;;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey  ;;;;;;;;;;;;;;;
        ;; 128-bit left shift by 1: shift both qwords, then OR in the bit
        ;; carried from the low qword into the high qword
        vmovdqa xmm2, xmm6
        vpsllq  xmm6, xmm6, 1
        vpsrlq  xmm2, xmm2, 63
        vmovdqa xmm1, xmm2
        vpslldq xmm2, xmm2, 8
        vpsrldq xmm1, xmm1, 8
        vpor    xmm6, xmm6, xmm2
        ;; reduction: if the bit shifted out of the top was set, XOR in POLY
        vpshufd xmm2, xmm1, 00100100b
        vpcmpeqd xmm2, [rel TWOONE]
        vpand   xmm2, xmm2, [rel POLY]
        vpxor   xmm6, xmm6, xmm2        ; xmm6 holds the HashKey<<1 mod poly
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqu [arg1 + HashKey], xmm6  ; store HashKey<<1 mod poly

        ;; build the table of H^2..H^8 (and friends) used by the x8 loops
        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif
        mov     rsp, r14

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
3200 | ||
3201 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(init,_),function,)
FN_NAME(init,_):
        push    r12
        push    r13
%ifidn __OUTPUT_FORMAT__, win64
        push    r14
        push    r15
        mov     r14, rsp
        ;; xmm6:xmm15 are non-volatile on Windows.  GCM_INIT clobbers
        ;; xmm0-xmm6 and xmm14 (%%AAD_HASH is xmm14), so both xmm6 and
        ;; xmm14 must be preserved here.  Previously only xmm6 was saved,
        ;; which corrupted the caller's xmm14.
        sub     rsp, 2*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm14
%endif

        GCM_INIT arg1, arg2, arg3, arg4, arg5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm14, [rsp + 1*16]
        movdqu  xmm6, [rsp + 0*16]
        mov     rsp, r14
        pop     r15
        pop     r14
%endif
        pop     r13
        pop     r12
        ret
3234 | ||
3235 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
;       aes_gcm_enc_256_update_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;
; Streaming (multi_call) encryption update: thin wrapper that sets up the
; stack frame and dispatches to GCM_ENC_DEC in ENC mode.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_update_),function,)
FN_NAME(enc,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call

        FUNC_RESTORE

        ret
3255 | ||
3256 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
;       aes_gcm_dec_256_update_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;
; Streaming (multi_call) decryption update: thin wrapper that sets up the
; stack frame and dispatches to GCM_ENC_DEC in DEC mode.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_update_),function,)
FN_NAME(dec,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call

        FUNC_RESTORE

        ret
3276 | ||
3277 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
;       aes_gcm_enc_256_finalize_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_finalize_),function,)
FN_NAME(enc,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ;; xmm6:xmm15 are non-volatile on Windows.  GCM_COMPLETE also writes
        ;; xmm13 (HashKey load) and xmm10 (GHASH_MUL scratch), which the
        ;; previous code failed to preserve — save them alongside the rest.
        sub     rsp, 7*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm10
        vmovdqu [rsp + 3*16], xmm11
        vmovdqu [rsp + 4*16], xmm13
        vmovdqu [rsp + 5*16], xmm14
        vmovdqu [rsp + 6*16], xmm15
%endif
        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 6*16]
        vmovdqu xmm14, [rsp + 5*16]
        vmovdqu xmm13, [rsp + 4*16]
        vmovdqu xmm11, [rsp + 3*16]
        vmovdqu xmm10, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
        add     rsp, 7*16
%endif

        pop     r12
        ret
3313 | ||
3314 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4
;       aes_gcm_dec_256_finalize_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_finalize_),function,)
FN_NAME(dec,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ;; xmm6:xmm15 are non-volatile on Windows.  GCM_COMPLETE also writes
        ;; xmm13 (HashKey load) and xmm10 (GHASH_MUL scratch), which the
        ;; previous code failed to preserve — save them alongside the rest.
        sub     rsp, 7*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm10
        vmovdqu [rsp + 3*16], xmm11
        vmovdqu [rsp + 4*16], xmm13
        vmovdqu [rsp + 5*16], xmm14
        vmovdqu [rsp + 6*16], xmm15
%endif
        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 6*16]
        vmovdqu xmm14, [rsp + 5*16]
        vmovdqu xmm13, [rsp + 4*16]
        vmovdqu xmm11, [rsp + 3*16]
        vmovdqu xmm10, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
        add     rsp, 7*16
%endif

        pop     r12
        ret
3350 | ||
3351 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(enc,_),function,)
FN_NAME(enc,_):

        ; One-shot AES-GCM encrypt: init (IV + AAD), encrypt the whole
        ; buffer, then emit the tag, all in a single call (single_call
        ; lets the macros skip the cross-call state bookkeeping).
        FUNC_SAVE

        GCM_INIT        arg1, arg2, arg6, arg7, arg8            ; key, ctx, iv, aad, aad_len

        GCM_ENC_DEC     arg1, arg2, arg3, arg4, arg5, ENC, single_call

        GCM_COMPLETE    arg1, arg2, arg9, arg10, ENC, single_call ; auth_tag, auth_tag_len

        FUNC_RESTORE

        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(dec,_),function,)
FN_NAME(dec,_):

        ; One-shot AES-GCM decrypt: init (IV + AAD), decrypt the whole
        ; buffer, then emit the computed tag. Identical structure to the
        ; encrypt entry point with DEC selected in the macros.
        FUNC_SAVE

        GCM_INIT        arg1, arg2, arg6, arg7, arg8            ; key, ctx, iv, aad, aad_len

        GCM_ENC_DEC     arg1, arg2, arg3, arg4, arg5, DEC, single_call

        GCM_COMPLETE    arg1, arg2, arg9, arg10, DEC, single_call ; auth_tag, auth_tag_len

        FUNC_RESTORE

        ret

%ifdef LINUX
        ;; Emit an empty .note.GNU-stack section so GNU ld marks the
        ;; resulting object as not requiring an executable stack.
section .note.GNU-stack noalloc noexec nowrite progbits
%endif