]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | ; routine to do AES256 CBC decrypt "by8" | |
29 | ||
30 | ; XMM registers are clobbered. Saving/restoring must be done at a higher level | |
f67539c2 | 31 | %include "include/os.asm" |
11fdf7f2 TL |
32 | |
; Token pasting helper: CONCAT(xdata,3) -> xdata3 -> xmm3
%define CONCAT(a,b)	a %+ b

; Move used for the data blocks and the IV (no alignment requirement)
%define VMOVDQ		vmovdqu

; xdata0..xdata7: the blocks being decrypted in parallel (at most eight)
%define xdata0		xmm0
%define xdata1		xmm1
%define xdata2		xmm2
%define xdata3		xmm3
%define xdata4		xmm4
%define xdata5		xmm5
%define xdata6		xmm6
%define xdata7		xmm7

; CBC chaining value: the IV on entry, afterwards the last ciphertext block
; of the batch that do_aes has just processed
%define xIV		xmm8

; Round keys 0, 3, 6, 9 and 12.  do_aes loads these from p_keys only when
; its load_keys argument is non-zero.
%define xkey0		xmm9
%define xkey3		xmm10
%define xkey6		xmm11
%define xkey9		xmm12
%define xkey12		xmm13

; Scratch registers for the remaining round keys; do_aes reloads them from
; p_keys on every expansion.
%define xkeyA		xmm14
%define xkeyB		xmm15

; Argument registers for
;   aes_cbc_dec_256_avx(in, IV, keys, out, num_bytes)
%ifdef LINUX
; System V AMD64: all five arguments arrive in registers
%define p_in		rdi
%define p_IV		rsi
%define p_keys		rdx
%define p_out		rcx
%define num_bytes	r8
%else
; Microsoft x64: four register arguments; the function itself fetches
; num_bytes from the stack into rax
%define p_in		rcx
%define p_IV		rdx
%define p_keys		r8
%define p_out		r9
%define num_bytes	rax
%endif

; General purpose scratch register
%define tmp		r10
68 | ||
; do_aes_load num_in_par
; Expands do_aes for num_in_par blocks with load_keys = 1, i.e. xkey0,
; xkey3, xkey6, xkey9 and xkey12 are (re)loaded from p_keys.
%macro do_aes_load 1
	do_aes	%1, 1
%endmacro
72 | ||
; do_aes_noload num_in_par
; Expands do_aes for num_in_par blocks with load_keys = 0, i.e. xkey0,
; xkey3, xkey6, xkey9 and xkey12 must already hold their round keys.
%macro do_aes_noload 1
	do_aes	%1, 0
%endmacro
76 | ||
; aes_xdata_op op, count, src
; Emits "op xdataN, xdataN, src" for N = 0 .. count-1.
; Uses the assembly-time counter i.
%macro aes_xdata_op 3
%assign i 0
%rep %2
	%1	CONCAT(xdata,i), CONCAT(xdata,i), %3
%assign i (i+1)
%endrep
%endmacro

; do_aes num_in_par, load_keys
;
; Decrypts num_in_par (1..8) consecutive 16-byte blocks in CBC mode.
;   - reads the blocks from [p_in] and advances p_in past them
;   - writes the result to [p_out]; p_out is NOT advanced
;   - xIV supplies the chaining value on entry and holds the last
;     ciphertext block of this batch on exit
;   - when load_keys is non-zero, xkey0/xkey3/xkey6/xkey9/xkey12 are loaded
;     from p_keys; otherwise they are used as they are
;   - xkeyA and xkeyB are always overwritten
; Round key loads are interleaved with the AES rounds of the previous key.
%macro do_aes 2
%define %%nblk		%1
%define %%reload	%2

%if (%%reload)
	vmovdqa	xkey0, [p_keys + 0*16]
%endif

	; Fetch the ciphertext blocks
%assign i 0
%rep %%nblk
	VMOVDQ	CONCAT(xdata,i), [p_in + i*16]
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 1*16]

	; Round 0: xor with the first round key
	aes_xdata_op	vpxor, %%nblk, xkey0

	vmovdqa	xkeyB, [p_keys + 2*16]

	; Round 1
	aes_xdata_op	vaesdec, %%nblk, xkeyA

	; From here on the ciphertext of this batch sits below p_in
	add	p_in, 16*%%nblk

%if (%%reload)
	vmovdqa	xkey3, [p_keys + 3*16]
%endif

	; Round 2
	aes_xdata_op	vaesdec, %%nblk, xkeyB

	vmovdqa	xkeyA, [p_keys + 4*16]

	; Round 3
	aes_xdata_op	vaesdec, %%nblk, xkey3

	vmovdqa	xkeyB, [p_keys + 5*16]

	; Round 4
	aes_xdata_op	vaesdec, %%nblk, xkeyA

%if (%%reload)
	vmovdqa	xkey6, [p_keys + 6*16]
%endif

	; Round 5
	aes_xdata_op	vaesdec, %%nblk, xkeyB

	vmovdqa	xkeyA, [p_keys + 7*16]

	; Round 6
	aes_xdata_op	vaesdec, %%nblk, xkey6

	vmovdqa	xkeyB, [p_keys + 8*16]

	; Round 7
	aes_xdata_op	vaesdec, %%nblk, xkeyA

%if (%%reload)
	vmovdqa	xkey9, [p_keys + 9*16]
%endif

	; Round 8
	aes_xdata_op	vaesdec, %%nblk, xkeyB

	vmovdqa	xkeyA, [p_keys + 10*16]

	; Round 9
	aes_xdata_op	vaesdec, %%nblk, xkey9

	vmovdqa	xkeyB, [p_keys + 11*16]

	; Round 10
	aes_xdata_op	vaesdec, %%nblk, xkeyA

%if (%%reload)
	vmovdqa	xkey12, [p_keys + 12*16]
%endif

	; Round 11
	aes_xdata_op	vaesdec, %%nblk, xkeyB

	vmovdqa	xkeyA, [p_keys + 13*16]

	; Round 12
	aes_xdata_op	vaesdec, %%nblk, xkey12

	vmovdqa	xkeyB, [p_keys + 14*16]

	; Round 13
	aes_xdata_op	vaesdec, %%nblk, xkeyA

	; Round 14 (final)
	aes_xdata_op	vaesdeclast, %%nblk, xkeyB

	; CBC chaining: block 0 is xored with the incoming chaining value,
	; block i (i > 0) with ciphertext block i-1, which is re-read from
	; memory at a negative offset because p_in was already advanced.
	vpxor	xdata0, xdata0, xIV
%assign i 1
%if (%%nblk > 1)
%rep (%%nblk - 1)
	VMOVDQ	xIV, [p_in + (i - 1 - %%nblk)*16]
	vpxor	CONCAT(xdata,i), CONCAT(xdata,i), xIV
%assign i (i+1)
%endrep
%endif
	; The last ciphertext block of this batch chains into the next one
	VMOVDQ	xIV, [p_in - 16]

	; Store the results; this happens only after every ciphertext block
	; needed above has been read.
%assign i 0
%rep %%nblk
	VMOVDQ	[p_out + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro
238 | ||
239 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
240 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
241 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
242 | ||
section .text

;-----------------------------------------------------------------------
; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out,
;                     UINT64 num_bytes)
;
; AES-256 CBC decrypt, up to eight blocks per pass.
;
; In:    p_in      = ciphertext
;        p_IV      = 16-byte IV; read only, never written back
;        p_keys    = round keys 0..14, 16 bytes each; must be 16-byte
;                    aligned because they are read with vmovdqa
;        p_out     = destination for the decrypted blocks
;        num_bytes = length in bytes; only whole 16-byte blocks are
;                    processed, a trailing partial block is ignored and
;                    a length below 16 makes the function a no-op
; Clobb: p_in, p_out, num_bytes, tmp, xmm0-xmm15, flags.
;        XMM registers are not saved here; that is the caller's job.
;-----------------------------------------------------------------------
MKGLOBAL(aes_cbc_dec_256_avx,function,internal)
aes_cbc_dec_256_avx:

%ifndef LINUX
	; Microsoft x64: the fifth argument is passed on the stack
	mov	num_bytes, [rsp + 8*5]
%endif

	; Keep whole blocks only and leave when nothing is left.  Without
	; this guard a length of 0 (or one whose low 7 bits are below 16)
	; would enter the 8-block loop, whose counter then never hits zero.
	and	num_bytes, ~15
	jz	.done

	vmovdqu	xIV, [p_IV]

	; tmp = 16 * (number of blocks mod 8)
	mov	tmp, num_bytes
	and	tmp, 7*16
	jz	.mult_of_8_blks

	; 1 <= tmp/16 <= 7: handle the odd blocks first so that the main
	; loop only ever sees multiples of eight blocks.
	cmp	tmp, 4*16
	ja	.gt4
	je	.eq4

	; 1..3 blocks
	cmp	tmp, 2*16
	ja	.eq3
	je	.eq2

.eq1:
	do_aes_load	1
	jmp	.tail_done

.eq2:
	do_aes_load	2
	jmp	.tail_done

.eq3:
	do_aes_load	3
	jmp	.tail_done

.eq4:
	do_aes_load	4
	jmp	.tail_done

.gt4:
	; 5..7 blocks
	cmp	tmp, 6*16
	ja	.eq7
	je	.eq6

.eq5:
	do_aes_load	5
	jmp	.tail_done

.eq6:
	do_aes_load	6
	jmp	.tail_done

.eq7:
	do_aes_load	7
	; fall through

.tail_done:
	; do_aes advanced p_in but not p_out; tmp still holds the number of
	; bytes just produced.
	add	p_out, tmp
	; Drop the blocks handled above; what remains is a multiple of 8*16
	and	num_bytes, ~(8*16 - 1)
	jz	.done
	; The do_aes_load above left xkey0/3/6/9/12 loaded
	jmp	.main_loop

.mult_of_8_blks:
	; No tail pass ran, so load the cached round keys here
	vmovdqa	xkey0, [p_keys + 0*16]
	vmovdqa	xkey3, [p_keys + 3*16]
	vmovdqa	xkey6, [p_keys + 6*16]
	vmovdqa	xkey9, [p_keys + 9*16]
	vmovdqa	xkey12, [p_keys + 12*16]

.main_loop:
	; num_bytes is a non-zero multiple of 8*16 here
	do_aes_noload	8
	add	p_out, 8*16
	sub	num_bytes, 8*16
	jne	.main_loop

.done:
	; Don't write back IV
	;	vmovdqu	[p_IV], xIV

	ret
341 | ||
%ifdef LINUX
; Empty .note.GNU-stack section: tells the linker that this object does not
; need an executable stack.
section .note.GNU-stack noalloc noexec nowrite progbits
%endif