]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx / aes_cbc_enc_192_x8.asm
1 ;;
2 ;; Copyright (c) 2012-2018, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 ;;; routine to do a 192 bit CBC AES encrypt
29
30 ;; clobbers all registers except for ARG1 and rbp
31
32 %include "include/os.asm"
33 %include "mb_mgr_datastruct.asm"
34
;; All plaintext loads and ciphertext stores go through VMOVDQ.  It maps to
;; the unaligned move because the data buffers are not assumed to be aligned.
%define VMOVDQ          vmovdqu

;; VPXOR2 dst, src
;;   Two-operand convenience form of the three-operand AVX vpxor:
;;   dst = dst XOR src.  %1 must be an xmm register; %2 may be an xmm
;;   register or a memory operand.
%macro VPXOR2 2
        vpxor   %1, %1, %2
%endmacro
40
41 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
42 ;; struct AES_ARGS {
43 ;; void* in[8];
44 ;; void* out[8];
45 ;; UINT128* keys[8];
46 ;; UINT128 IV[8];
47 ;; }
48 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49 ;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
50 ;; arg 1: ARG : addr of AES_ARGS structure
51 ;; arg 2: LEN : len (in units of bytes)
52
;; Local stack frame of aes_cbc_enc_192_x8: two quadword slots.
struc STACK
_gpr_save:      resq    1               ; slot used to save/restore rbp
_len:           resq    1               ; slot holding a copy of the len argument
endstruc

;; rsp-relative addresses of the two slots above
%define GPR_SAVE_AREA   rsp + _gpr_save
%define LEN_AREA        rsp + _len
60
;; ---- argument registers and two scratch GPRs, per build target ----------
%ifdef LINUX
%define ARG             rdi             ; arg 1: AES_ARGS *
%define LEN             rsi             ; arg 2: length in bytes
%define REG3            rcx
%define REG4            rdx
%else
%define ARG             rcx             ; arg 1: AES_ARGS *
%define LEN             rdx             ; arg 2: length in bytes
%define REG3            rsi
%define REG4            rdi
%endif

;; ---- loop index and general temporary ------------------------------------
%define IDX             rax             ; byte offset of the current block
%define TMP             rbx             ; scratch pointer

;; ---- per-lane pointers to the expanded key schedules ---------------------
%define KEYS0           REG3
%define KEYS1           REG4
%define KEYS2           rbp             ; rbp is saved/restored by the function
%define KEYS3           r8
%define KEYS4           r9
%define KEYS5           r10
%define KEYS6           r11
%define KEYS7           r12

;; ---- input pointers kept in registers (even lanes only) ------------------
%define IN0             r13
%define IN2             r14
%define IN4             r15
%define IN6             LEN             ; shares the LEN register: the function
                                        ; copies len to the stack before loading it

;; ---- one data/state register per lane -------------------------------------
%define XDATA0          xmm0
%define XDATA1          xmm1
%define XDATA2          xmm2
%define XDATA3          xmm3
%define XDATA4          xmm4
%define XDATA5          xmm5
%define XDATA6          xmm6
%define XDATA7          xmm7

;; ---- cached round keys: XKEY<lane>_<round> --------------------------------
%define XKEY0_3         xmm8            ; lane 0, round 3
%define XKEY1_4         xmm9            ; lane 1, round 4
%define XKEY2_5         xmm10           ; lane 2, round 5
%define XKEY3_6         xmm11           ; lane 3, round 6
%define XKEY4_7         xmm12           ; lane 4, round 7
%define XKEY5_8         xmm13           ; lane 5, round 8
%define XKEY6_9         xmm14           ; lane 6, round 9
%define XTMP            xmm15           ; spare vector temporary
107
section .text

;;; ------------------------------------------------------------------------
;;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
;;;
;;; AES-192 CBC encryption of eight independent buffers ("lanes") in
;;; lockstep: every pass encrypts one 16-byte block of each lane using
;;; round keys 0..12 of that lane's key schedule (11 x vaesenc followed by
;;; vaesenclast).
;;;
;;; In:   ARG = args, LEN = len (bytes to process per lane).
;;;       The first block of every lane is processed before len is looked
;;;       at, and the loop only stops when the byte index (16, 32, ...)
;;;       equals len, so the code expects len to be a non-zero multiple
;;;       of 16.
;;; Out:  args->IV[i]  = last ciphertext block of lane i
;;;       args->in[i]  += len
;;;       args->out[i] += len
;;;
;;; Register use:
;;;   - IN0/IN2/IN4/IN6 hold the even-lane input pointers; odd-lane input
;;;     pointers and all output pointers are re-read from args through TMP.
;;;   - KEYS0..KEYS7 hold the per-lane key schedule pointers.
;;;   - XKEY0_3 .. XKEY6_9 cache one round key each (lane n, round n+3).
;;;     They are loaded while the first block is encrypted and reused by
;;;     every pass of main_loop.
;;;
;;; Clobbers: IDX (rax), TMP (rbx), REG3, REG4, r8-r15, LEN, xmm0-xmm14 and
;;;   flags.  Only rbp is saved and restored; ARG is left unchanged.
;;;   rbx and r12-r15 are overwritten without being saved, so callers must
;;;   not rely on them (XMMs are saved at a higher level).
;;;
;;; Alignment: the vmovdqa instructions require 16-byte alignment of the
;;;   cached round keys ([KEYSn + 16*(n+3)]) and of the IV, in[] and out[]
;;;   arrays inside *args.  Data buffers are accessed with VMOVDQ
;;;   (unaligned).
;;; ------------------------------------------------------------------------
MKGLOBAL(aes_cbc_enc_192_x8,function,internal)
aes_cbc_enc_192_x8:

        sub     rsp, STACK_size
        mov     [GPR_SAVE_AREA + 8*0], rbp      ; rbp is used as KEYS2

        mov     IDX, 16                         ; offset of the second block
        mov     [LEN_AREA], LEN                 ; LEN's register becomes IN6 below

        mov     IN0, [ARG + _aesarg_in + 8*0]
        mov     IN2, [ARG + _aesarg_in + 8*2]
        mov     IN4, [ARG + _aesarg_in + 8*4]
        mov     IN6, [ARG + _aesarg_in + 8*6]   ; overwrites LEN (already saved)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; First block of every lane: C0 = E(P0 xor IV)

        mov     TMP, [ARG + _aesarg_in + 8*1]
        VMOVDQ  XDATA0, [IN0]                   ; load first block of plain text
        VMOVDQ  XDATA1, [TMP]                   ; load first block of plain text
        mov     TMP, [ARG + _aesarg_in + 8*3]
        VMOVDQ  XDATA2, [IN2]                   ; load first block of plain text
        VMOVDQ  XDATA3, [TMP]                   ; load first block of plain text
        mov     TMP, [ARG + _aesarg_in + 8*5]
        VMOVDQ  XDATA4, [IN4]                   ; load first block of plain text
        VMOVDQ  XDATA5, [TMP]                   ; load first block of plain text
        mov     TMP, [ARG + _aesarg_in + 8*7]
        VMOVDQ  XDATA6, [IN6]                   ; load first block of plain text
        VMOVDQ  XDATA7, [TMP]                   ; load first block of plain text

        VPXOR2  XDATA0, [ARG + _aesarg_IV + 16*0]       ; plaintext XOR IV
        VPXOR2  XDATA1, [ARG + _aesarg_IV + 16*1]       ; plaintext XOR IV
        VPXOR2  XDATA2, [ARG + _aesarg_IV + 16*2]       ; plaintext XOR IV
        VPXOR2  XDATA3, [ARG + _aesarg_IV + 16*3]       ; plaintext XOR IV
        VPXOR2  XDATA4, [ARG + _aesarg_IV + 16*4]       ; plaintext XOR IV
        VPXOR2  XDATA5, [ARG + _aesarg_IV + 16*5]       ; plaintext XOR IV
        VPXOR2  XDATA6, [ARG + _aesarg_IV + 16*6]       ; plaintext XOR IV
        VPXOR2  XDATA7, [ARG + _aesarg_IV + 16*7]       ; plaintext XOR IV

        mov     KEYS0, [ARG + _aesarg_keys + 8*0]
        mov     KEYS1, [ARG + _aesarg_keys + 8*1]
        mov     KEYS2, [ARG + _aesarg_keys + 8*2]
        mov     KEYS3, [ARG + _aesarg_keys + 8*3]
        mov     KEYS4, [ARG + _aesarg_keys + 8*4]
        mov     KEYS5, [ARG + _aesarg_keys + 8*5]
        mov     KEYS6, [ARG + _aesarg_keys + 8*6]
        mov     KEYS7, [ARG + _aesarg_keys + 8*7]

        VPXOR2  XDATA0, [KEYS0 + 16*0]          ; 0. ARK
        VPXOR2  XDATA1, [KEYS1 + 16*0]          ; 0. ARK
        VPXOR2  XDATA2, [KEYS2 + 16*0]          ; 0. ARK
        VPXOR2  XDATA3, [KEYS3 + 16*0]          ; 0. ARK
        VPXOR2  XDATA4, [KEYS4 + 16*0]          ; 0. ARK
        VPXOR2  XDATA5, [KEYS5 + 16*0]          ; 0. ARK
        VPXOR2  XDATA6, [KEYS6 + 16*0]          ; 0. ARK
        VPXOR2  XDATA7, [KEYS7 + 16*0]          ; 0. ARK

        vaesenc XDATA0, [KEYS0 + 16*1]          ; 1. ENC
        vaesenc XDATA1, [KEYS1 + 16*1]          ; 1. ENC
        vaesenc XDATA2, [KEYS2 + 16*1]          ; 1. ENC
        vaesenc XDATA3, [KEYS3 + 16*1]          ; 1. ENC
        vaesenc XDATA4, [KEYS4 + 16*1]          ; 1. ENC
        vaesenc XDATA5, [KEYS5 + 16*1]          ; 1. ENC
        vaesenc XDATA6, [KEYS6 + 16*1]          ; 1. ENC
        vaesenc XDATA7, [KEYS7 + 16*1]          ; 1. ENC

        vmovdqa XKEY0_3, [KEYS0 + 16*3]         ; load round 3 key

        vaesenc XDATA0, [KEYS0 + 16*2]          ; 2. ENC
        vaesenc XDATA1, [KEYS1 + 16*2]          ; 2. ENC
        vaesenc XDATA2, [KEYS2 + 16*2]          ; 2. ENC
        vaesenc XDATA3, [KEYS3 + 16*2]          ; 2. ENC
        vaesenc XDATA4, [KEYS4 + 16*2]          ; 2. ENC
        vaesenc XDATA5, [KEYS5 + 16*2]          ; 2. ENC
        vaesenc XDATA6, [KEYS6 + 16*2]          ; 2. ENC
        vaesenc XDATA7, [KEYS7 + 16*2]          ; 2. ENC

        vmovdqa XKEY1_4, [KEYS1 + 16*4]         ; load round 4 key

        vaesenc XDATA0, XKEY0_3                 ; 3. ENC
        vaesenc XDATA1, [KEYS1 + 16*3]          ; 3. ENC
        vaesenc XDATA2, [KEYS2 + 16*3]          ; 3. ENC
        vaesenc XDATA3, [KEYS3 + 16*3]          ; 3. ENC
        vaesenc XDATA4, [KEYS4 + 16*3]          ; 3. ENC
        vaesenc XDATA5, [KEYS5 + 16*3]          ; 3. ENC
        vaesenc XDATA6, [KEYS6 + 16*3]          ; 3. ENC
        vaesenc XDATA7, [KEYS7 + 16*3]          ; 3. ENC

        vaesenc XDATA0, [KEYS0 + 16*4]          ; 4. ENC
        vmovdqa XKEY2_5, [KEYS2 + 16*5]         ; load round 5 key
        vaesenc XDATA1, XKEY1_4                 ; 4. ENC
        vaesenc XDATA2, [KEYS2 + 16*4]          ; 4. ENC
        vaesenc XDATA3, [KEYS3 + 16*4]          ; 4. ENC
        vaesenc XDATA4, [KEYS4 + 16*4]          ; 4. ENC
        vaesenc XDATA5, [KEYS5 + 16*4]          ; 4. ENC
        vaesenc XDATA6, [KEYS6 + 16*4]          ; 4. ENC
        vaesenc XDATA7, [KEYS7 + 16*4]          ; 4. ENC

        vaesenc XDATA0, [KEYS0 + 16*5]          ; 5. ENC
        vaesenc XDATA1, [KEYS1 + 16*5]          ; 5. ENC
        vmovdqa XKEY3_6, [KEYS3 + 16*6]         ; load round 6 key
        vaesenc XDATA2, XKEY2_5                 ; 5. ENC
        vaesenc XDATA3, [KEYS3 + 16*5]          ; 5. ENC
        vaesenc XDATA4, [KEYS4 + 16*5]          ; 5. ENC
        vaesenc XDATA5, [KEYS5 + 16*5]          ; 5. ENC
        vaesenc XDATA6, [KEYS6 + 16*5]          ; 5. ENC
        vaesenc XDATA7, [KEYS7 + 16*5]          ; 5. ENC

        vaesenc XDATA0, [KEYS0 + 16*6]          ; 6. ENC
        vaesenc XDATA1, [KEYS1 + 16*6]          ; 6. ENC
        vaesenc XDATA2, [KEYS2 + 16*6]          ; 6. ENC
        vmovdqa XKEY4_7, [KEYS4 + 16*7]         ; load round 7 key
        vaesenc XDATA3, XKEY3_6                 ; 6. ENC
        vaesenc XDATA4, [KEYS4 + 16*6]          ; 6. ENC
        vaesenc XDATA5, [KEYS5 + 16*6]          ; 6. ENC
        vaesenc XDATA6, [KEYS6 + 16*6]          ; 6. ENC
        vaesenc XDATA7, [KEYS7 + 16*6]          ; 6. ENC

        vaesenc XDATA0, [KEYS0 + 16*7]          ; 7. ENC
        vaesenc XDATA1, [KEYS1 + 16*7]          ; 7. ENC
        vaesenc XDATA2, [KEYS2 + 16*7]          ; 7. ENC
        vaesenc XDATA3, [KEYS3 + 16*7]          ; 7. ENC
        vmovdqa XKEY5_8, [KEYS5 + 16*8]         ; load round 8 key
        vaesenc XDATA4, XKEY4_7                 ; 7. ENC
        vaesenc XDATA5, [KEYS5 + 16*7]          ; 7. ENC
        vaesenc XDATA6, [KEYS6 + 16*7]          ; 7. ENC
        vaesenc XDATA7, [KEYS7 + 16*7]          ; 7. ENC

        vaesenc XDATA0, [KEYS0 + 16*8]          ; 8. ENC
        vaesenc XDATA1, [KEYS1 + 16*8]          ; 8. ENC
        vaesenc XDATA2, [KEYS2 + 16*8]          ; 8. ENC
        vaesenc XDATA3, [KEYS3 + 16*8]          ; 8. ENC
        vaesenc XDATA4, [KEYS4 + 16*8]          ; 8. ENC
        vmovdqa XKEY6_9, [KEYS6 + 16*9]         ; load round 9 key
        vaesenc XDATA5, XKEY5_8                 ; 8. ENC
        vaesenc XDATA6, [KEYS6 + 16*8]          ; 8. ENC
        vaesenc XDATA7, [KEYS7 + 16*8]          ; 8. ENC

        vaesenc XDATA0, [KEYS0 + 16*9]          ; 9. ENC
        vaesenc XDATA1, [KEYS1 + 16*9]          ; 9. ENC
        vaesenc XDATA2, [KEYS2 + 16*9]          ; 9. ENC
        vaesenc XDATA3, [KEYS3 + 16*9]          ; 9. ENC
        vaesenc XDATA4, [KEYS4 + 16*9]          ; 9. ENC
        vaesenc XDATA5, [KEYS5 + 16*9]          ; 9. ENC
        mov     TMP, [ARG + _aesarg_out + 8*0]  ; lane 0 output pointer, used after round 12
        vaesenc XDATA6, XKEY6_9                 ; 9. ENC
        vaesenc XDATA7, [KEYS7 + 16*9]          ; 9. ENC

        vaesenc XDATA0, [KEYS0 + 16*10]         ; 10. ENC
        vaesenc XDATA1, [KEYS1 + 16*10]         ; 10. ENC
        vaesenc XDATA2, [KEYS2 + 16*10]         ; 10. ENC
        vaesenc XDATA3, [KEYS3 + 16*10]         ; 10. ENC
        vaesenc XDATA4, [KEYS4 + 16*10]         ; 10. ENC
        vaesenc XDATA5, [KEYS5 + 16*10]         ; 10. ENC
        vaesenc XDATA6, [KEYS6 + 16*10]         ; 10. ENC
        vaesenc XDATA7, [KEYS7 + 16*10]         ; 10. ENC

        vaesenc XDATA0, [KEYS0 + 16*11]         ; 11. ENC
        vaesenc XDATA1, [KEYS1 + 16*11]         ; 11. ENC
        vaesenc XDATA2, [KEYS2 + 16*11]         ; 11. ENC
        vaesenc XDATA3, [KEYS3 + 16*11]         ; 11. ENC
        vaesenc XDATA4, [KEYS4 + 16*11]         ; 11. ENC
        vaesenc XDATA5, [KEYS5 + 16*11]         ; 11. ENC
        vaesenc XDATA6, [KEYS6 + 16*11]         ; 11. ENC
        vaesenc XDATA7, [KEYS7 + 16*11]         ; 11. ENC

        vaesenclast XDATA0, [KEYS0 + 16*12]     ; 12. ENC
        vaesenclast XDATA1, [KEYS1 + 16*12]     ; 12. ENC
        vaesenclast XDATA2, [KEYS2 + 16*12]     ; 12. ENC
        vaesenclast XDATA3, [KEYS3 + 16*12]     ; 12. ENC
        vaesenclast XDATA4, [KEYS4 + 16*12]     ; 12. ENC
        vaesenclast XDATA5, [KEYS5 + 16*12]     ; 12. ENC
        vaesenclast XDATA6, [KEYS6 + 16*12]     ; 12. ENC
        vaesenclast XDATA7, [KEYS7 + 16*12]     ; 12. ENC

        VMOVDQ  [TMP], XDATA0                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*1]
        VMOVDQ  [TMP], XDATA1                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*2]
        VMOVDQ  [TMP], XDATA2                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*3]
        VMOVDQ  [TMP], XDATA3                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*4]
        VMOVDQ  [TMP], XDATA4                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*5]
        VMOVDQ  [TMP], XDATA5                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*6]
        VMOVDQ  [TMP], XDATA6                   ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*7]
        VMOVDQ  [TMP], XDATA7                   ; write back ciphertext

        cmp     [LEN_AREA], IDX                 ; len == 16: only one block per lane
        je      done

        ;; Remaining blocks.  On entry XDATAn holds the previous ciphertext
        ;; block of lane n, which is the CBC chaining value for the next one.
main_loop:
        mov     TMP, [ARG + _aesarg_in + 8*1]
        VPXOR2  XDATA0, [IN0 + IDX]             ; prev ciphertext XOR next plain text
        VPXOR2  XDATA1, [TMP + IDX]             ; prev ciphertext XOR next plain text
        mov     TMP, [ARG + _aesarg_in + 8*3]
        VPXOR2  XDATA2, [IN2 + IDX]             ; prev ciphertext XOR next plain text
        VPXOR2  XDATA3, [TMP + IDX]             ; prev ciphertext XOR next plain text
        mov     TMP, [ARG + _aesarg_in + 8*5]
        VPXOR2  XDATA4, [IN4 + IDX]             ; prev ciphertext XOR next plain text
        VPXOR2  XDATA5, [TMP + IDX]             ; prev ciphertext XOR next plain text
        mov     TMP, [ARG + _aesarg_in + 8*7]
        VPXOR2  XDATA6, [IN6 + IDX]             ; prev ciphertext XOR next plain text
        VPXOR2  XDATA7, [TMP + IDX]             ; prev ciphertext XOR next plain text

        VPXOR2  XDATA0, [KEYS0 + 16*0]          ; 0. ARK
        VPXOR2  XDATA1, [KEYS1 + 16*0]          ; 0. ARK
        VPXOR2  XDATA2, [KEYS2 + 16*0]          ; 0. ARK
        VPXOR2  XDATA3, [KEYS3 + 16*0]          ; 0. ARK
        VPXOR2  XDATA4, [KEYS4 + 16*0]          ; 0. ARK
        VPXOR2  XDATA5, [KEYS5 + 16*0]          ; 0. ARK
        VPXOR2  XDATA6, [KEYS6 + 16*0]          ; 0. ARK
        VPXOR2  XDATA7, [KEYS7 + 16*0]          ; 0. ARK

        vaesenc XDATA0, [KEYS0 + 16*1]          ; 1. ENC
        vaesenc XDATA1, [KEYS1 + 16*1]          ; 1. ENC
        vaesenc XDATA2, [KEYS2 + 16*1]          ; 1. ENC
        vaesenc XDATA3, [KEYS3 + 16*1]          ; 1. ENC
        vaesenc XDATA4, [KEYS4 + 16*1]          ; 1. ENC
        vaesenc XDATA5, [KEYS5 + 16*1]          ; 1. ENC
        vaesenc XDATA6, [KEYS6 + 16*1]          ; 1. ENC
        vaesenc XDATA7, [KEYS7 + 16*1]          ; 1. ENC

        vaesenc XDATA0, [KEYS0 + 16*2]          ; 2. ENC
        vaesenc XDATA1, [KEYS1 + 16*2]          ; 2. ENC
        vaesenc XDATA2, [KEYS2 + 16*2]          ; 2. ENC
        vaesenc XDATA3, [KEYS3 + 16*2]          ; 2. ENC
        vaesenc XDATA4, [KEYS4 + 16*2]          ; 2. ENC
        vaesenc XDATA5, [KEYS5 + 16*2]          ; 2. ENC
        vaesenc XDATA6, [KEYS6 + 16*2]          ; 2. ENC
        vaesenc XDATA7, [KEYS7 + 16*2]          ; 2. ENC

        vaesenc XDATA0, XKEY0_3                 ; 3. ENC
        vaesenc XDATA1, [KEYS1 + 16*3]          ; 3. ENC
        vaesenc XDATA2, [KEYS2 + 16*3]          ; 3. ENC
        vaesenc XDATA3, [KEYS3 + 16*3]          ; 3. ENC
        vaesenc XDATA4, [KEYS4 + 16*3]          ; 3. ENC
        vaesenc XDATA5, [KEYS5 + 16*3]          ; 3. ENC
        vaesenc XDATA6, [KEYS6 + 16*3]          ; 3. ENC
        vaesenc XDATA7, [KEYS7 + 16*3]          ; 3. ENC

        vaesenc XDATA0, [KEYS0 + 16*4]          ; 4. ENC
        vaesenc XDATA1, XKEY1_4                 ; 4. ENC
        vaesenc XDATA2, [KEYS2 + 16*4]          ; 4. ENC
        vaesenc XDATA3, [KEYS3 + 16*4]          ; 4. ENC
        vaesenc XDATA4, [KEYS4 + 16*4]          ; 4. ENC
        vaesenc XDATA5, [KEYS5 + 16*4]          ; 4. ENC
        vaesenc XDATA6, [KEYS6 + 16*4]          ; 4. ENC
        vaesenc XDATA7, [KEYS7 + 16*4]          ; 4. ENC

        vaesenc XDATA0, [KEYS0 + 16*5]          ; 5. ENC
        vaesenc XDATA1, [KEYS1 + 16*5]          ; 5. ENC
        vaesenc XDATA2, XKEY2_5                 ; 5. ENC
        vaesenc XDATA3, [KEYS3 + 16*5]          ; 5. ENC
        vaesenc XDATA4, [KEYS4 + 16*5]          ; 5. ENC
        vaesenc XDATA5, [KEYS5 + 16*5]          ; 5. ENC
        vaesenc XDATA6, [KEYS6 + 16*5]          ; 5. ENC
        vaesenc XDATA7, [KEYS7 + 16*5]          ; 5. ENC

        vaesenc XDATA0, [KEYS0 + 16*6]          ; 6. ENC
        vaesenc XDATA1, [KEYS1 + 16*6]          ; 6. ENC
        vaesenc XDATA2, [KEYS2 + 16*6]          ; 6. ENC
        vaesenc XDATA3, XKEY3_6                 ; 6. ENC
        vaesenc XDATA4, [KEYS4 + 16*6]          ; 6. ENC
        vaesenc XDATA5, [KEYS5 + 16*6]          ; 6. ENC
        vaesenc XDATA6, [KEYS6 + 16*6]          ; 6. ENC
        vaesenc XDATA7, [KEYS7 + 16*6]          ; 6. ENC

        vaesenc XDATA0, [KEYS0 + 16*7]          ; 7. ENC
        vaesenc XDATA1, [KEYS1 + 16*7]          ; 7. ENC
        vaesenc XDATA2, [KEYS2 + 16*7]          ; 7. ENC
        vaesenc XDATA3, [KEYS3 + 16*7]          ; 7. ENC
        vaesenc XDATA4, XKEY4_7                 ; 7. ENC
        vaesenc XDATA5, [KEYS5 + 16*7]          ; 7. ENC
        vaesenc XDATA6, [KEYS6 + 16*7]          ; 7. ENC
        vaesenc XDATA7, [KEYS7 + 16*7]          ; 7. ENC

        vaesenc XDATA0, [KEYS0 + 16*8]          ; 8. ENC
        vaesenc XDATA1, [KEYS1 + 16*8]          ; 8. ENC
        vaesenc XDATA2, [KEYS2 + 16*8]          ; 8. ENC
        vaesenc XDATA3, [KEYS3 + 16*8]          ; 8. ENC
        vaesenc XDATA4, [KEYS4 + 16*8]          ; 8. ENC
        vaesenc XDATA5, XKEY5_8                 ; 8. ENC
        vaesenc XDATA6, [KEYS6 + 16*8]          ; 8. ENC
        vaesenc XDATA7, [KEYS7 + 16*8]          ; 8. ENC

        vaesenc XDATA0, [KEYS0 + 16*9]          ; 9. ENC
        vaesenc XDATA1, [KEYS1 + 16*9]          ; 9. ENC
        vaesenc XDATA2, [KEYS2 + 16*9]          ; 9. ENC
        vaesenc XDATA3, [KEYS3 + 16*9]          ; 9. ENC
        vaesenc XDATA4, [KEYS4 + 16*9]          ; 9. ENC
        vaesenc XDATA5, [KEYS5 + 16*9]          ; 9. ENC
        mov     TMP, [ARG + _aesarg_out + 8*0]  ; lane 0 output pointer, used after round 12
        vaesenc XDATA6, XKEY6_9                 ; 9. ENC
        vaesenc XDATA7, [KEYS7 + 16*9]          ; 9. ENC

        vaesenc XDATA0, [KEYS0 + 16*10]         ; 10. ENC
        vaesenc XDATA1, [KEYS1 + 16*10]         ; 10. ENC
        vaesenc XDATA2, [KEYS2 + 16*10]         ; 10. ENC
        vaesenc XDATA3, [KEYS3 + 16*10]         ; 10. ENC
        vaesenc XDATA4, [KEYS4 + 16*10]         ; 10. ENC
        vaesenc XDATA5, [KEYS5 + 16*10]         ; 10. ENC
        vaesenc XDATA6, [KEYS6 + 16*10]         ; 10. ENC
        vaesenc XDATA7, [KEYS7 + 16*10]         ; 10. ENC

        vaesenc XDATA0, [KEYS0 + 16*11]         ; 11. ENC
        vaesenc XDATA1, [KEYS1 + 16*11]         ; 11. ENC
        vaesenc XDATA2, [KEYS2 + 16*11]         ; 11. ENC
        vaesenc XDATA3, [KEYS3 + 16*11]         ; 11. ENC
        vaesenc XDATA4, [KEYS4 + 16*11]         ; 11. ENC
        vaesenc XDATA5, [KEYS5 + 16*11]         ; 11. ENC
        vaesenc XDATA6, [KEYS6 + 16*11]         ; 11. ENC
        vaesenc XDATA7, [KEYS7 + 16*11]         ; 11. ENC

        vaesenclast XDATA0, [KEYS0 + 16*12]     ; 12. ENC
        vaesenclast XDATA1, [KEYS1 + 16*12]     ; 12. ENC
        vaesenclast XDATA2, [KEYS2 + 16*12]     ; 12. ENC
        vaesenclast XDATA3, [KEYS3 + 16*12]     ; 12. ENC
        vaesenclast XDATA4, [KEYS4 + 16*12]     ; 12. ENC
        vaesenclast XDATA5, [KEYS5 + 16*12]     ; 12. ENC
        vaesenclast XDATA6, [KEYS6 + 16*12]     ; 12. ENC
        vaesenclast XDATA7, [KEYS7 + 16*12]     ; 12. ENC

        VMOVDQ  [TMP + IDX], XDATA0             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*1]
        VMOVDQ  [TMP + IDX], XDATA1             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*2]
        VMOVDQ  [TMP + IDX], XDATA2             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*3]
        VMOVDQ  [TMP + IDX], XDATA3             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*4]
        VMOVDQ  [TMP + IDX], XDATA4             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*5]
        VMOVDQ  [TMP + IDX], XDATA5             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*6]
        VMOVDQ  [TMP + IDX], XDATA6             ; write back ciphertext
        mov     TMP, [ARG + _aesarg_out + 8*7]
        VMOVDQ  [TMP + IDX], XDATA7             ; write back ciphertext

        add     IDX, 16
        cmp     [LEN_AREA], IDX                 ; full 64-bit compare against len
        jne     main_loop

done:
        ;; update IV: the last ciphertext block of each lane becomes the
        ;; chaining value for the next call
        vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
        vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
        vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
        vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
        vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
        vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
        vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
        vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7

        ;; update IN and OUT: advance all sixteen pointers by len.
        ;; vmovq (not vmovd) so that all 64 bits of len are used, matching
        ;; the 64-bit compare that terminates the loop above; vmovd would
        ;; add only len mod 2^32.
        vmovq   xmm0, [LEN_AREA]
        vpshufd xmm0, xmm0, 0x44                ; xmm0 = {len, len} as two qwords
        vpaddq  xmm1, xmm0, [ARG + _aesarg_in + 16*0]   ; two pointers per add
        vpaddq  xmm2, xmm0, [ARG + _aesarg_in + 16*1]
        vpaddq  xmm3, xmm0, [ARG + _aesarg_in + 16*2]
        vpaddq  xmm4, xmm0, [ARG + _aesarg_in + 16*3]
        vmovdqa [ARG + _aesarg_in + 16*0], xmm1
        vmovdqa [ARG + _aesarg_in + 16*1], xmm2
        vmovdqa [ARG + _aesarg_in + 16*2], xmm3
        vmovdqa [ARG + _aesarg_in + 16*3], xmm4
        vpaddq  xmm5, xmm0, [ARG + _aesarg_out + 16*0]
        vpaddq  xmm6, xmm0, [ARG + _aesarg_out + 16*1]
        vpaddq  xmm7, xmm0, [ARG + _aesarg_out + 16*2]
        vpaddq  xmm8, xmm0, [ARG + _aesarg_out + 16*3]
        vmovdqa [ARG + _aesarg_out + 16*0], xmm5
        vmovdqa [ARG + _aesarg_out + 16*1], xmm6
        vmovdqa [ARG + _aesarg_out + 16*2], xmm7
        vmovdqa [ARG + _aesarg_out + 16*3], xmm8

        ;; XMMs are saved at a higher level
        mov     rbp, [GPR_SAVE_AREA + 8*0]

        add     rsp, STACK_size

        ret
498
%ifdef LINUX
;; By GNU toolchain convention, an empty .note.GNU-stack section marks this
;; object as not requiring an executable stack.
section .note.GNU-stack noalloc noexec nowrite progbits
%endif