]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | %include "os.asm" | |
29 | %include "memcpy.asm" | |
30 | ||
31 | ; routine to do AES256 CNTR enc/decrypt "by4" | |
32 | ; XMM registers are clobbered. Saving/restoring must be done at a higher level | |
33 | ||
9f95a23c TL |
;; Name under which the routine in this file is assembled.
;; If AES_CNTR_256 is already defined when this point is reached, that
;; definition is kept; otherwise the default SSE name is used.
%ifndef AES_CNTR_256
%define AES_CNTR_256 aes_cntr_256_sse
%endif
37 | ||
11fdf7f2 TL |
38 | extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 |
39 | ||
;; ---------------------------------------------------------------------
;; Preprocessor helpers
;; ---------------------------------------------------------------------
; CONCAT(a,b) pastes two tokens together, e.g. CONCAT(xdata,2) -> xdata2
%define CONCAT(a,b) a %+ b
; Move instruction used for the caller's input/output data
%define MOVDQ movdqu

;; ---------------------------------------------------------------------
;; XMM register roles
;; ---------------------------------------------------------------------
; xdata0..xdata7: per-block working registers (counter block -> keystream
; -> output)
%define xdata0 xmm0
%define xdata1 xmm1
%define xdata2 xmm2
%define xdata3 xmm3
%define xdata4 xmm4
%define xdata5 xmm5
%define xdata6 xmm6
%define xdata7 xmm7
; running counter block and the pshufb mask loaded from byteswap_const
%define xcounter xmm8
%define xbyteswap xmm9
; round keys 0, 4, 8 and 12, kept resident across do_aes invocations
%define xkey0 xmm10
%define xkey4 xmm11
%define xkey8 xmm12
%define xkey12 xmm13
; two rotating temporaries for the remaining round keys
%define xkeyA xmm14
%define xkeyB xmm15

;; ---------------------------------------------------------------------
;; Argument locations: (in, IV, keys, out, num_bytes, iv_len)
;; ---------------------------------------------------------------------
%ifdef LINUX
%define p_in rdi
%define p_IV rsi
%define p_keys rdx
%define p_out rcx
%define num_bytes r8
%define p_ivlen r9
%else
%define p_in rcx
%define p_IV rdx
%define p_keys r8
%define p_out r9
; num_bytes is copied from the stack into r10 at function entry
%define num_bytes r10
; iv_len is read straight from its stack slot (valid while rsp is unchanged)
%define p_ivlen qword [rsp + 8*6]
%endif

;; ---------------------------------------------------------------------
;; Scratch
;; ---------------------------------------------------------------------
%define tmp r11
; address expression of the aligned scratch buffer inside the STACK frame
%define p_tmp rsp + _buffer
78 | ||
;; do_aes_load <num_in_par>
;; Encrypts <num_in_par> counter blocks via do_aes with load_keys = 1, so
;; xkey0/xkey4/xkey8/xkey12 are (re)loaded from p_keys as part of the call.
%macro do_aes_load 1
	do_aes	%1, 1
%endmacro
82 | ||
;; do_aes_noload <num_in_par>
;; Encrypts <num_in_par> counter blocks via do_aes with load_keys = 0; the
;; caller must already have round keys 0, 4, 8 and 12 in
;; xkey0/xkey4/xkey8/xkey12.
%macro do_aes_noload 1
	do_aes	%1, 0
%endmacro
86 | ||
87 | ||
;; for_each_xdata <op>, <count>, <src>
;; Emits "<op> xdata<k>, <src>" for k = 0 .. <count>-1, i.e. applies one
;; instruction to every block register in flight.
;; Side effect: the assemble-time counter i is left equal to <count>.
%macro for_each_xdata 3
%assign i 0
%rep %2
	%1	CONCAT(xdata,i), %3
%assign i (i + 1)
%endrep
%endmacro

;; do_aes <num_in_par>, <load_keys>
;;
;; Processes <num_in_par> 16-byte blocks in parallel:
;;   - builds the counter blocks from xcounter (+1, +2, ... via ddq_add_<k>)
;;     and shuffles each with xbyteswap,
;;   - runs them through the 14 rounds keyed from [p_keys + 0*16 .. 14*16],
;;   - XORs the result with the data at p_in and stores it at p_out.
;;
;; <load_keys> != 0 : round keys 0/4/8/12 are loaded into
;;                    xkey0/xkey4/xkey8/xkey12 along the way.
;; <load_keys> == 0 : those four registers must already hold the keys.
;;
;; Effects: xcounter is advanced with paddd by ddq_add_<num_in_par>;
;;          p_in is advanced by 16*<num_in_par>; p_out is NOT advanced.
;; Clobbers: xdata0 .. xdata<num_in_par-1>, xkeyA, xkeyB, flags, and the
;;           assemble-time counters i and j.
;; Round-key loads are deliberately interleaved with the aesenc rounds;
;; keep the statement order when editing.
%macro do_aes 2
%define %%by %1
%define %%load_keys %2

%if (%%load_keys)
	movdqa	xkey0, [p_keys + 0*16]
%endif

	; xdata0 = shuffled copy of the current counter
	movdqa	xdata0, xcounter
	pshufb	xdata0, xbyteswap
	; xdata<k> = shuffled (counter + ddq_add_<k>) for the remaining blocks
%assign i 1
%rep (%%by - 1)
	movdqa	CONCAT(xdata,i), xcounter
	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
	pshufb	CONCAT(xdata,i), xbyteswap
%assign i (i + 1)
%endrep

	movdqa	xkeyA, [p_keys + 1*16]

	; round 0 (key whitening); the counter for the next call is bumped here
	pxor	xdata0, xkey0
	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]
%assign i 1
%rep (%%by - 1)
	pxor	CONCAT(xdata,i), xkey0
%assign i (i + 1)
%endrep

	movdqa	xkeyB, [p_keys + 2*16]
	for_each_xdata	aesenc, %%by, xkeyA		; key 1

	movdqa	xkeyA, [p_keys + 3*16]
	for_each_xdata	aesenc, %%by, xkeyB		; key 2

	; from here on the input is addressed relative to the advanced p_in
	add	p_in, 16*%%by

%if (%%load_keys)
	movdqa	xkey4, [p_keys + 4*16]
%endif
	for_each_xdata	aesenc, %%by, xkeyA		; key 3

	movdqa	xkeyA, [p_keys + 5*16]
	for_each_xdata	aesenc, %%by, xkey4		; key 4

	movdqa	xkeyB, [p_keys + 6*16]
	for_each_xdata	aesenc, %%by, xkeyA		; key 5

	movdqa	xkeyA, [p_keys + 7*16]
	for_each_xdata	aesenc, %%by, xkeyB		; key 6

%if (%%load_keys)
	movdqa	xkey8, [p_keys + 8*16]
%endif
	for_each_xdata	aesenc, %%by, xkeyA		; key 7

	movdqa	xkeyA, [p_keys + 9*16]
	for_each_xdata	aesenc, %%by, xkey8		; key 8

	movdqa	xkeyB, [p_keys + 10*16]
	for_each_xdata	aesenc, %%by, xkeyA		; key 9

	movdqa	xkeyA, [p_keys + 11*16]
	for_each_xdata	aesenc, %%by, xkeyB		; key 10

%if (%%load_keys)
	movdqa	xkey12, [p_keys + 12*16]
%endif
	for_each_xdata	aesenc, %%by, xkeyA		; key 11

	movdqa	xkeyA, [p_keys + 13*16]
	for_each_xdata	aesenc, %%by, xkey12		; key 12

	movdqa	xkeyB, [p_keys + 14*16]
	for_each_xdata	aesenc, %%by, xkeyA		; key 13

	for_each_xdata	aesenclast, %%by, xkeyB		; key 14

	; XOR the keystream with the input, two blocks per step; xkeyA/xkeyB
	; are free again and serve as load temporaries. p_in was already
	; advanced, hence the "- 16*by" displacement.
%assign i 0
%rep (%%by / 2)
%assign j (i + 1)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
	pxor	CONCAT(xdata,i), xkeyA
	pxor	CONCAT(xdata,j), xkeyB
%assign i (i + 2)
%endrep
	; odd block count: one block is still left to combine
%if (i < %%by)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	pxor	CONCAT(xdata,i), xkeyA
%endif

	; write the finished blocks; the caller advances p_out afterwards
%assign i 0
%rep %%by
	MOVDQ	[p_out + i*16], CONCAT(xdata,i)
%assign i (i + 1)
%endrep
%endmacro
243 | ||
;; Stack frame layout addressed through rsp (see p_tmp and _rsp_save uses).
struc STACK
_buffer:	resb	16	; one 16-byte scratch block, reached via p_tmp
_rsp_save:	resq	1	; slot holding the stack pointer value to restore
endstruc
248 | ||
249 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
250 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
251 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
252 | section .text | |
253 | ||
;;----------------------------------------------------------------------
;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out,
;;                  UINT64 num_bytes, UINT64 iv_len)
;;
;; AES-256 counter-mode encrypt/decrypt of num_bytes bytes from in to out.
;;
;; In:    p_in, p_IV, p_keys, p_out, num_bytes, p_ivlen (see the register
;;        mapping %defines; on non-LINUX builds num_bytes and iv_len are
;;        read from the stack).
;;        iv_len with bit 4 set (16): the IV is used as a full 16-byte
;;        counter block. Otherwise 12 bytes are read and the last dword is
;;        set to a big-endian block counter of 1.
;;        Round keys are read with aligned accesses (movdqa / memory
;;        operands), so p_keys must be 16-byte aligned.
;; Out:   data written at out; the advanced counter is deliberately NOT
;;        written back to IV. No return value is produced (rax is only
;;        used as scratch on the partial-block path).
;; Clobb: tmp (r11), rax (partial-block path), flags, p_in/p_out/num_bytes
;;        registers, XMM registers (no save/restore is done here).
;; Stack: the partial-block path temporarily carves a 16-byte aligned
;;        STACK frame below rsp and restores rsp before returning.
;;----------------------------------------------------------------------
align 32
MKGLOBAL(AES_CNTR_256,function,internal)
AES_CNTR_256:

%ifndef LINUX
	; fetch the 5th argument from its stack slot
	mov	num_bytes, [rsp + 8*5]
%endif

	movdqa	xbyteswap, [rel byteswap_const]

	; ---- build the initial counter block --------------------------------
	test	p_ivlen, 16
	jz	.iv_is_12_bytes

	; 16-byte IV: Nonce + ESP IV + block counter (BE), taken as supplied
	movdqu	xcounter, [p_IV]
	jmp	.bswap_iv

.iv_is_12_bytes:
	; 12-byte IV: Nonce + ESP IV, padded with block counter 0x00000001 (BE)
	mov	DWORD(tmp), 0x01000000
	pinsrq	xcounter, [p_IV], 0
	pinsrd	xcounter, [p_IV + 8], 2
	pinsrd	xcounter, DWORD(tmp), 3

.bswap_iv:
	; keep the counter in shuffled form so do_aes can bump it with paddd
	pshufb	xcounter, xbyteswap

	; ---- peel off the leading 1..3 whole blocks -------------------------
	; tmp = num_bytes & 0x30, i.e. 0, 16, 32 or 48 bytes
	mov	tmp, num_bytes
	and	tmp, 3*16
	jz	.chk			; whole-block count is already a multiple of 4

	cmp	tmp, 2*16
	je	.eq2
	ja	.eq3

	; tmp == 16: one block
	do_aes_load	1
	add	p_out, 1*16
	jmp	.chk

.eq2:
	do_aes_load	2
	add	p_out, 2*16
	jmp	.chk

.eq3:
	do_aes_load	3
	add	p_out, 3*16
	; fall through to .chk

.chk:
	; drop the bytes just handled; what remains is 64*k + (0..15)
	and	num_bytes, ~(3*16)
	jz	.done
	cmp	num_bytes, 16
	jb	.last			; only a partial block is left

	; ---- bulk: groups of 4 blocks ---------------------------------------
	; do_aes_noload expects these four round keys to be resident
	movdqa	xkey0, [p_keys + 0*16]
	movdqa	xkey4, [p_keys + 4*16]
	movdqa	xkey8, [p_keys + 8*16]
	movdqa	xkey12, [p_keys + 12*16]
	jmp	.main_loop

align 32
.main_loop:
	; here num_bytes >= 64: at least one full group of 4 blocks remains
	do_aes_noload	4
	add	p_out, 4*16
	sub	num_bytes, 4*16
	cmp	num_bytes, 4*16
	jae	.main_loop

	test	num_bytes, 15		; partial bytes to be processed?
	jnz	.last

.done:
	; the updated IV is intentionally not returned to the caller
	ret

.last:
	;; ---- trailing partial block (1..15 bytes) --------------------------
	; reserve a 16-byte aligned STACK frame and remember the original rsp
	mov	rax, rsp
	sub	rsp, STACK_size
	and	rsp, -16
	mov	[rsp + _rsp_save], rax

	; stage the remaining input bytes in the aligned scratch buffer
	; (tmp and rax are handed to the copy macro as its scratch registers)
	memcpy_sse_16_1	p_tmp, p_in, num_bytes, tmp, rax

	; shuffle the counter back and encrypt it to get one keystream block
	pshufb	xcounter, xbyteswap
	movdqa	xdata0, xcounter
	pxor	xdata0, [p_keys + 16*0]
%assign i 1
%rep 13
	aesenc	xdata0, [p_keys + 16*i]
%assign i (i + 1)
%endrep
	aesenclast xdata0, [p_keys + 16*14]

	; keystream XOR message, done in place in the scratch buffer
	pxor	xdata0, [p_tmp]
	movdqa	[p_tmp], xdata0

	; copy only num_bytes bytes of the result to the output
	memcpy_sse_16_1	p_out, p_tmp, num_bytes, tmp, rax

	; tear down the frame and return
	mov	rsp, [rsp + _rsp_save]
	ret
361 | ||
%ifdef LINUX
;; Emit an empty, non-executable .note.GNU-stack section for LINUX builds
;; (declares that this object does not need an executable stack).
section .note.GNU-stack noalloc noexec nowrite progbits
%endif