]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx / aes_cbc_enc_192_x8.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;;; routine to do a 192 bit CBC AES encrypt
29
30;; clobbers all registers except for ARG1 and rbp
31
f67539c2 32%include "include/os.asm"
11fdf7f2
TL
33%include "mb_mgr_datastruct.asm"
34
;; Plaintext loads and ciphertext stores go through VMOVDQ; the unaligned
;; form is selected because the data buffers are not assumed to be aligned.
%define VMOVDQ	vmovdqu

;; VPXOR2 dst, src
;;   Two-operand XOR helper built on the three-operand AVX form:
;;   dst = dst XOR src.
;;     %1 : destination, also used as the first source
;;     %2 : second source (register or memory operand)
%macro VPXOR2 2
	vpxor	%1, %1, %2
%endmacro
40
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; struct AES_ARGS {
;;     void*    in[8];
;;     void*    out[8];
;;     UINT128* keys[8];
;;     UINT128  IV[8];
;; }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
;; arg 1: ARG : addr of AES_ARGS structure
;; arg 2: LEN : len (in units of bytes)
52
;; Local stack frame. STACK_size bytes are reserved below the return
;; address on entry and released again before returning.
struc STACK
_gpr_save:	resq	1	; one 8-byte slot for a saved general-purpose register
_len:		resq	1	; one 8-byte slot for the byte length argument
endstruc

;; Addresses of the two slots, relative to the adjusted stack pointer.
%define GPR_SAVE_AREA	rsp + _gpr_save
%define LEN_AREA	rsp + _len
60
;; Argument registers and two extra scratch registers, selected by target OS.
%ifndef LINUX
%define ARG	rcx
%define LEN	rdx
%define REG3	rsi
%define REG4	rdi
%else
%define ARG	rdi
%define LEN	rsi
%define REG3	rcx
%define REG4	rdx
%endif

%define IDX	rax		; byte offset of the current block
%define TMP	rbx		; scratch pointer

;; Pointers to the expanded key schedule of each lane.
%define KEYS0	REG3
%define KEYS1	REG4
%define KEYS2	rbp
%define KEYS3	r8
%define KEYS4	r9
%define KEYS5	r10
%define KEYS6	r11
%define KEYS7	r12

;; Input pointers of the even lanes; the odd lanes are reloaded through TMP.
;; IN6 shares its register with LEN.
%define IN0	r13
%define IN2	r14
%define IN4	r15
%define IN6	LEN

;; One data block per lane.
%define XDATA0	xmm0
%define XDATA1	xmm1
%define XDATA2	xmm2
%define XDATA3	xmm3
%define XDATA4	xmm4
%define XDATA5	xmm5
%define XDATA6	xmm6
%define XDATA7	xmm7

;; XKEYn_r holds round key r of lane n.
%define XKEY0_3	xmm8
%define XKEY1_4	xmm9
%define XKEY2_5	xmm10
%define XKEY3_6	xmm11
%define XKEY4_7	xmm12
%define XKEY5_8	xmm13
%define XKEY6_9	xmm14
%define XTMP	xmm15
107
section .text

;;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
;;;
;;; AES-192 CBC encryption of eight independent lanes, one 16-byte block
;;; per lane per iteration.
;;;
;;;   ARG : address of the AES_ARGS structure (in[8], out[8], keys[8], IV[8])
;;;   LEN : number of bytes to encrypt in every lane
;;;
;;; The first block is processed unconditionally and the loop stops only
;;; when the running offset equals LEN exactly, so LEN is expected to be a
;;; non-zero multiple of 16.
;;;
;;; On return the IV array holds the last ciphertext block of each lane and
;;; every in[]/out[] pointer has been advanced by LEN.
;;;
;;; The IV, in[] and out[] arrays and the round keys cached in registers are
;;; accessed with vmovdqa and must therefore be 16-byte aligned.
;;;
;;; Clobbers all registers except for ARG1 and rbp.
;;; XMM registers are saved at a higher level.
MKGLOBAL(aes_cbc_enc_192_x8,function,internal)
aes_cbc_enc_192_x8:

	sub	rsp, STACK_size
	mov	[GPR_SAVE_AREA + 8*0], rbp	; rbp is used as KEYS2 below

	mov	IDX, 16				; offset of the second block
	mov	[LEN_AREA], LEN			; spill LEN: its register becomes IN6

	mov	IN0, [ARG + _aesarg_in + 8*0]
	mov	IN2, [ARG + _aesarg_in + 8*2]
	mov	IN4, [ARG + _aesarg_in + 8*4]
	mov	IN6, [ARG + _aesarg_in + 8*6]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	;; load the first plaintext block of every lane
	mov	TMP, [ARG + _aesarg_in + 8*1]
	VMOVDQ	XDATA0, [IN0]
	VMOVDQ	XDATA1, [TMP]
	mov	TMP, [ARG + _aesarg_in + 8*3]
	VMOVDQ	XDATA2, [IN2]
	VMOVDQ	XDATA3, [TMP]
	mov	TMP, [ARG + _aesarg_in + 8*5]
	VMOVDQ	XDATA4, [IN4]
	VMOVDQ	XDATA5, [TMP]
	mov	TMP, [ARG + _aesarg_in + 8*7]
	VMOVDQ	XDATA6, [IN6]
	VMOVDQ	XDATA7, [TMP]

	;; plaintext XOR IV
	VPXOR2	XDATA0, [ARG + _aesarg_IV + 16*0]
	VPXOR2	XDATA1, [ARG + _aesarg_IV + 16*1]
	VPXOR2	XDATA2, [ARG + _aesarg_IV + 16*2]
	VPXOR2	XDATA3, [ARG + _aesarg_IV + 16*3]
	VPXOR2	XDATA4, [ARG + _aesarg_IV + 16*4]
	VPXOR2	XDATA5, [ARG + _aesarg_IV + 16*5]
	VPXOR2	XDATA6, [ARG + _aesarg_IV + 16*6]
	VPXOR2	XDATA7, [ARG + _aesarg_IV + 16*7]

	;; per-lane key schedule pointers
	mov	KEYS0, [ARG + _aesarg_keys + 8*0]
	mov	KEYS1, [ARG + _aesarg_keys + 8*1]
	mov	KEYS2, [ARG + _aesarg_keys + 8*2]
	mov	KEYS3, [ARG + _aesarg_keys + 8*3]
	mov	KEYS4, [ARG + _aesarg_keys + 8*4]
	mov	KEYS5, [ARG + _aesarg_keys + 8*5]
	mov	KEYS6, [ARG + _aesarg_keys + 8*6]
	mov	KEYS7, [ARG + _aesarg_keys + 8*7]

	;; round 0: add round key
	VPXOR2	XDATA0, [KEYS0 + 16*0]
	VPXOR2	XDATA1, [KEYS1 + 16*0]
	VPXOR2	XDATA2, [KEYS2 + 16*0]
	VPXOR2	XDATA3, [KEYS3 + 16*0]
	VPXOR2	XDATA4, [KEYS4 + 16*0]
	VPXOR2	XDATA5, [KEYS5 + 16*0]
	VPXOR2	XDATA6, [KEYS6 + 16*0]
	VPXOR2	XDATA7, [KEYS7 + 16*0]

	;; round 1
	vaesenc	XDATA0, [KEYS0 + 16*1]
	vaesenc	XDATA1, [KEYS1 + 16*1]
	vaesenc	XDATA2, [KEYS2 + 16*1]
	vaesenc	XDATA3, [KEYS3 + 16*1]
	vaesenc	XDATA4, [KEYS4 + 16*1]
	vaesenc	XDATA5, [KEYS5 + 16*1]
	vaesenc	XDATA6, [KEYS6 + 16*1]
	vaesenc	XDATA7, [KEYS7 + 16*1]

	vmovdqa	XKEY0_3, [KEYS0 + 16*3]		; cache lane 0 round 3 key

	;; round 2
	vaesenc	XDATA0, [KEYS0 + 16*2]
	vaesenc	XDATA1, [KEYS1 + 16*2]
	vaesenc	XDATA2, [KEYS2 + 16*2]
	vaesenc	XDATA3, [KEYS3 + 16*2]
	vaesenc	XDATA4, [KEYS4 + 16*2]
	vaesenc	XDATA5, [KEYS5 + 16*2]
	vaesenc	XDATA6, [KEYS6 + 16*2]
	vaesenc	XDATA7, [KEYS7 + 16*2]

	vmovdqa	XKEY1_4, [KEYS1 + 16*4]		; cache lane 1 round 4 key

	;; round 3
	vaesenc	XDATA0, XKEY0_3
	vaesenc	XDATA1, [KEYS1 + 16*3]
	vaesenc	XDATA2, [KEYS2 + 16*3]
	vaesenc	XDATA3, [KEYS3 + 16*3]
	vaesenc	XDATA4, [KEYS4 + 16*3]
	vaesenc	XDATA5, [KEYS5 + 16*3]
	vaesenc	XDATA6, [KEYS6 + 16*3]
	vaesenc	XDATA7, [KEYS7 + 16*3]

	;; round 4
	vaesenc	XDATA0, [KEYS0 + 16*4]
	vmovdqa	XKEY2_5, [KEYS2 + 16*5]		; cache lane 2 round 5 key
	vaesenc	XDATA1, XKEY1_4
	vaesenc	XDATA2, [KEYS2 + 16*4]
	vaesenc	XDATA3, [KEYS3 + 16*4]
	vaesenc	XDATA4, [KEYS4 + 16*4]
	vaesenc	XDATA5, [KEYS5 + 16*4]
	vaesenc	XDATA6, [KEYS6 + 16*4]
	vaesenc	XDATA7, [KEYS7 + 16*4]

	;; round 5
	vaesenc	XDATA0, [KEYS0 + 16*5]
	vaesenc	XDATA1, [KEYS1 + 16*5]
	vmovdqa	XKEY3_6, [KEYS3 + 16*6]		; cache lane 3 round 6 key
	vaesenc	XDATA2, XKEY2_5
	vaesenc	XDATA3, [KEYS3 + 16*5]
	vaesenc	XDATA4, [KEYS4 + 16*5]
	vaesenc	XDATA5, [KEYS5 + 16*5]
	vaesenc	XDATA6, [KEYS6 + 16*5]
	vaesenc	XDATA7, [KEYS7 + 16*5]

	;; round 6
	vaesenc	XDATA0, [KEYS0 + 16*6]
	vaesenc	XDATA1, [KEYS1 + 16*6]
	vaesenc	XDATA2, [KEYS2 + 16*6]
	vmovdqa	XKEY4_7, [KEYS4 + 16*7]		; cache lane 4 round 7 key
	vaesenc	XDATA3, XKEY3_6
	vaesenc	XDATA4, [KEYS4 + 16*6]
	vaesenc	XDATA5, [KEYS5 + 16*6]
	vaesenc	XDATA6, [KEYS6 + 16*6]
	vaesenc	XDATA7, [KEYS7 + 16*6]

	;; round 7
	vaesenc	XDATA0, [KEYS0 + 16*7]
	vaesenc	XDATA1, [KEYS1 + 16*7]
	vaesenc	XDATA2, [KEYS2 + 16*7]
	vaesenc	XDATA3, [KEYS3 + 16*7]
	vmovdqa	XKEY5_8, [KEYS5 + 16*8]		; cache lane 5 round 8 key
	vaesenc	XDATA4, XKEY4_7
	vaesenc	XDATA5, [KEYS5 + 16*7]
	vaesenc	XDATA6, [KEYS6 + 16*7]
	vaesenc	XDATA7, [KEYS7 + 16*7]

	;; round 8
	vaesenc	XDATA0, [KEYS0 + 16*8]
	vaesenc	XDATA1, [KEYS1 + 16*8]
	vaesenc	XDATA2, [KEYS2 + 16*8]
	vaesenc	XDATA3, [KEYS3 + 16*8]
	vaesenc	XDATA4, [KEYS4 + 16*8]
	vmovdqa	XKEY6_9, [KEYS6 + 16*9]		; cache lane 6 round 9 key
	vaesenc	XDATA5, XKEY5_8
	vaesenc	XDATA6, [KEYS6 + 16*8]
	vaesenc	XDATA7, [KEYS7 + 16*8]

	;; round 9
	vaesenc	XDATA0, [KEYS0 + 16*9]
	vaesenc	XDATA1, [KEYS1 + 16*9]
	vaesenc	XDATA2, [KEYS2 + 16*9]
	vaesenc	XDATA3, [KEYS3 + 16*9]
	vaesenc	XDATA4, [KEYS4 + 16*9]
	vaesenc	XDATA5, [KEYS5 + 16*9]
	mov	TMP, [ARG + _aesarg_out + 8*0]	; lane 0 output pointer, used after round 12
	vaesenc	XDATA6, XKEY6_9
	vaesenc	XDATA7, [KEYS7 + 16*9]

	;; round 10
	vaesenc	XDATA0, [KEYS0 + 16*10]
	vaesenc	XDATA1, [KEYS1 + 16*10]
	vaesenc	XDATA2, [KEYS2 + 16*10]
	vaesenc	XDATA3, [KEYS3 + 16*10]
	vaesenc	XDATA4, [KEYS4 + 16*10]
	vaesenc	XDATA5, [KEYS5 + 16*10]
	vaesenc	XDATA6, [KEYS6 + 16*10]
	vaesenc	XDATA7, [KEYS7 + 16*10]

	;; round 11
	vaesenc	XDATA0, [KEYS0 + 16*11]
	vaesenc	XDATA1, [KEYS1 + 16*11]
	vaesenc	XDATA2, [KEYS2 + 16*11]
	vaesenc	XDATA3, [KEYS3 + 16*11]
	vaesenc	XDATA4, [KEYS4 + 16*11]
	vaesenc	XDATA5, [KEYS5 + 16*11]
	vaesenc	XDATA6, [KEYS6 + 16*11]
	vaesenc	XDATA7, [KEYS7 + 16*11]

	;; round 12 (last)
	vaesenclast	XDATA0, [KEYS0 + 16*12]
	vaesenclast	XDATA1, [KEYS1 + 16*12]
	vaesenclast	XDATA2, [KEYS2 + 16*12]
	vaesenclast	XDATA3, [KEYS3 + 16*12]
	vaesenclast	XDATA4, [KEYS4 + 16*12]
	vaesenclast	XDATA5, [KEYS5 + 16*12]
	vaesenclast	XDATA6, [KEYS6 + 16*12]
	vaesenclast	XDATA7, [KEYS7 + 16*12]

	;; write back the first ciphertext block of every lane
	VMOVDQ	[TMP], XDATA0
	mov	TMP, [ARG + _aesarg_out + 8*1]
	VMOVDQ	[TMP], XDATA1
	mov	TMP, [ARG + _aesarg_out + 8*2]
	VMOVDQ	[TMP], XDATA2
	mov	TMP, [ARG + _aesarg_out + 8*3]
	VMOVDQ	[TMP], XDATA3
	mov	TMP, [ARG + _aesarg_out + 8*4]
	VMOVDQ	[TMP], XDATA4
	mov	TMP, [ARG + _aesarg_out + 8*5]
	VMOVDQ	[TMP], XDATA5
	mov	TMP, [ARG + _aesarg_out + 8*6]
	VMOVDQ	[TMP], XDATA6
	mov	TMP, [ARG + _aesarg_out + 8*7]
	VMOVDQ	[TMP], XDATA7

	cmp	[LEN_AREA], IDX			; only one block requested?
	je	done

main_loop:
	;; CBC chaining: XDATAn still holds the previous ciphertext block,
	;; XOR the next plaintext block into it
	mov	TMP, [ARG + _aesarg_in + 8*1]
	VPXOR2	XDATA0, [IN0 + IDX]
	VPXOR2	XDATA1, [TMP + IDX]
	mov	TMP, [ARG + _aesarg_in + 8*3]
	VPXOR2	XDATA2, [IN2 + IDX]
	VPXOR2	XDATA3, [TMP + IDX]
	mov	TMP, [ARG + _aesarg_in + 8*5]
	VPXOR2	XDATA4, [IN4 + IDX]
	VPXOR2	XDATA5, [TMP + IDX]
	mov	TMP, [ARG + _aesarg_in + 8*7]
	VPXOR2	XDATA6, [IN6 + IDX]
	VPXOR2	XDATA7, [TMP + IDX]

	;; round 0: add round key
	VPXOR2	XDATA0, [KEYS0 + 16*0]
	VPXOR2	XDATA1, [KEYS1 + 16*0]
	VPXOR2	XDATA2, [KEYS2 + 16*0]
	VPXOR2	XDATA3, [KEYS3 + 16*0]
	VPXOR2	XDATA4, [KEYS4 + 16*0]
	VPXOR2	XDATA5, [KEYS5 + 16*0]
	VPXOR2	XDATA6, [KEYS6 + 16*0]
	VPXOR2	XDATA7, [KEYS7 + 16*0]

	;; round 1
	vaesenc	XDATA0, [KEYS0 + 16*1]
	vaesenc	XDATA1, [KEYS1 + 16*1]
	vaesenc	XDATA2, [KEYS2 + 16*1]
	vaesenc	XDATA3, [KEYS3 + 16*1]
	vaesenc	XDATA4, [KEYS4 + 16*1]
	vaesenc	XDATA5, [KEYS5 + 16*1]
	vaesenc	XDATA6, [KEYS6 + 16*1]
	vaesenc	XDATA7, [KEYS7 + 16*1]

	;; round 2
	vaesenc	XDATA0, [KEYS0 + 16*2]
	vaesenc	XDATA1, [KEYS1 + 16*2]
	vaesenc	XDATA2, [KEYS2 + 16*2]
	vaesenc	XDATA3, [KEYS3 + 16*2]
	vaesenc	XDATA4, [KEYS4 + 16*2]
	vaesenc	XDATA5, [KEYS5 + 16*2]
	vaesenc	XDATA6, [KEYS6 + 16*2]
	vaesenc	XDATA7, [KEYS7 + 16*2]

	;; round 3
	vaesenc	XDATA0, XKEY0_3
	vaesenc	XDATA1, [KEYS1 + 16*3]
	vaesenc	XDATA2, [KEYS2 + 16*3]
	vaesenc	XDATA3, [KEYS3 + 16*3]
	vaesenc	XDATA4, [KEYS4 + 16*3]
	vaesenc	XDATA5, [KEYS5 + 16*3]
	vaesenc	XDATA6, [KEYS6 + 16*3]
	vaesenc	XDATA7, [KEYS7 + 16*3]

	;; round 4
	vaesenc	XDATA0, [KEYS0 + 16*4]
	vaesenc	XDATA1, XKEY1_4
	vaesenc	XDATA2, [KEYS2 + 16*4]
	vaesenc	XDATA3, [KEYS3 + 16*4]
	vaesenc	XDATA4, [KEYS4 + 16*4]
	vaesenc	XDATA5, [KEYS5 + 16*4]
	vaesenc	XDATA6, [KEYS6 + 16*4]
	vaesenc	XDATA7, [KEYS7 + 16*4]

	;; round 5
	vaesenc	XDATA0, [KEYS0 + 16*5]
	vaesenc	XDATA1, [KEYS1 + 16*5]
	vaesenc	XDATA2, XKEY2_5
	vaesenc	XDATA3, [KEYS3 + 16*5]
	vaesenc	XDATA4, [KEYS4 + 16*5]
	vaesenc	XDATA5, [KEYS5 + 16*5]
	vaesenc	XDATA6, [KEYS6 + 16*5]
	vaesenc	XDATA7, [KEYS7 + 16*5]

	;; round 6
	vaesenc	XDATA0, [KEYS0 + 16*6]
	vaesenc	XDATA1, [KEYS1 + 16*6]
	vaesenc	XDATA2, [KEYS2 + 16*6]
	vaesenc	XDATA3, XKEY3_6
	vaesenc	XDATA4, [KEYS4 + 16*6]
	vaesenc	XDATA5, [KEYS5 + 16*6]
	vaesenc	XDATA6, [KEYS6 + 16*6]
	vaesenc	XDATA7, [KEYS7 + 16*6]

	;; round 7
	vaesenc	XDATA0, [KEYS0 + 16*7]
	vaesenc	XDATA1, [KEYS1 + 16*7]
	vaesenc	XDATA2, [KEYS2 + 16*7]
	vaesenc	XDATA3, [KEYS3 + 16*7]
	vaesenc	XDATA4, XKEY4_7
	vaesenc	XDATA5, [KEYS5 + 16*7]
	vaesenc	XDATA6, [KEYS6 + 16*7]
	vaesenc	XDATA7, [KEYS7 + 16*7]

	;; round 8
	vaesenc	XDATA0, [KEYS0 + 16*8]
	vaesenc	XDATA1, [KEYS1 + 16*8]
	vaesenc	XDATA2, [KEYS2 + 16*8]
	vaesenc	XDATA3, [KEYS3 + 16*8]
	vaesenc	XDATA4, [KEYS4 + 16*8]
	vaesenc	XDATA5, XKEY5_8
	vaesenc	XDATA6, [KEYS6 + 16*8]
	vaesenc	XDATA7, [KEYS7 + 16*8]

	;; round 9
	vaesenc	XDATA0, [KEYS0 + 16*9]
	vaesenc	XDATA1, [KEYS1 + 16*9]
	vaesenc	XDATA2, [KEYS2 + 16*9]
	vaesenc	XDATA3, [KEYS3 + 16*9]
	vaesenc	XDATA4, [KEYS4 + 16*9]
	vaesenc	XDATA5, [KEYS5 + 16*9]
	mov	TMP, [ARG + _aesarg_out + 8*0]	; lane 0 output pointer, used after round 12
	vaesenc	XDATA6, XKEY6_9
	vaesenc	XDATA7, [KEYS7 + 16*9]

	;; round 10
	vaesenc	XDATA0, [KEYS0 + 16*10]
	vaesenc	XDATA1, [KEYS1 + 16*10]
	vaesenc	XDATA2, [KEYS2 + 16*10]
	vaesenc	XDATA3, [KEYS3 + 16*10]
	vaesenc	XDATA4, [KEYS4 + 16*10]
	vaesenc	XDATA5, [KEYS5 + 16*10]
	vaesenc	XDATA6, [KEYS6 + 16*10]
	vaesenc	XDATA7, [KEYS7 + 16*10]

	;; round 11
	vaesenc	XDATA0, [KEYS0 + 16*11]
	vaesenc	XDATA1, [KEYS1 + 16*11]
	vaesenc	XDATA2, [KEYS2 + 16*11]
	vaesenc	XDATA3, [KEYS3 + 16*11]
	vaesenc	XDATA4, [KEYS4 + 16*11]
	vaesenc	XDATA5, [KEYS5 + 16*11]
	vaesenc	XDATA6, [KEYS6 + 16*11]
	vaesenc	XDATA7, [KEYS7 + 16*11]

	;; round 12 (last)
	vaesenclast	XDATA0, [KEYS0 + 16*12]
	vaesenclast	XDATA1, [KEYS1 + 16*12]
	vaesenclast	XDATA2, [KEYS2 + 16*12]
	vaesenclast	XDATA3, [KEYS3 + 16*12]
	vaesenclast	XDATA4, [KEYS4 + 16*12]
	vaesenclast	XDATA5, [KEYS5 + 16*12]
	vaesenclast	XDATA6, [KEYS6 + 16*12]
	vaesenclast	XDATA7, [KEYS7 + 16*12]

	;; write back ciphertext at the current offset
	VMOVDQ	[TMP + IDX], XDATA0
	mov	TMP, [ARG + _aesarg_out + 8*1]
	VMOVDQ	[TMP + IDX], XDATA1
	mov	TMP, [ARG + _aesarg_out + 8*2]
	VMOVDQ	[TMP + IDX], XDATA2
	mov	TMP, [ARG + _aesarg_out + 8*3]
	VMOVDQ	[TMP + IDX], XDATA3
	mov	TMP, [ARG + _aesarg_out + 8*4]
	VMOVDQ	[TMP + IDX], XDATA4
	mov	TMP, [ARG + _aesarg_out + 8*5]
	VMOVDQ	[TMP + IDX], XDATA5
	mov	TMP, [ARG + _aesarg_out + 8*6]
	VMOVDQ	[TMP + IDX], XDATA6
	mov	TMP, [ARG + _aesarg_out + 8*7]
	VMOVDQ	[TMP + IDX], XDATA7

	add	IDX, 16
	cmp	[LEN_AREA], IDX			; stop once IDX == len
	jne	main_loop

done:
	;; update IV: the last ciphertext block chains into the next call
	vmovdqa	[ARG + _aesarg_IV + 16*0], XDATA0
	vmovdqa	[ARG + _aesarg_IV + 16*1], XDATA1
	vmovdqa	[ARG + _aesarg_IV + 16*2], XDATA2
	vmovdqa	[ARG + _aesarg_IV + 16*3], XDATA3
	vmovdqa	[ARG + _aesarg_IV + 16*4], XDATA4
	vmovdqa	[ARG + _aesarg_IV + 16*5], XDATA5
	vmovdqa	[ARG + _aesarg_IV + 16*6], XDATA6
	vmovdqa	[ARG + _aesarg_IV + 16*7], XDATA7

	;; update IN and OUT: advance all sixteen pointers by len, two at a time.
	;; vmovq loads the whole 64-bit length, matching the 64-bit compares
	;; above (vmovd would keep only the low 32 bits).
	vmovq	xmm0, [LEN_AREA]
	vpshufd	xmm0, xmm0, 0x44		; copy the low qword into both qwords
	vpaddq	xmm1, xmm0, [ARG + _aesarg_in + 16*0]
	vpaddq	xmm2, xmm0, [ARG + _aesarg_in + 16*1]
	vpaddq	xmm3, xmm0, [ARG + _aesarg_in + 16*2]
	vpaddq	xmm4, xmm0, [ARG + _aesarg_in + 16*3]
	vmovdqa	[ARG + _aesarg_in + 16*0], xmm1
	vmovdqa	[ARG + _aesarg_in + 16*1], xmm2
	vmovdqa	[ARG + _aesarg_in + 16*2], xmm3
	vmovdqa	[ARG + _aesarg_in + 16*3], xmm4
	vpaddq	xmm5, xmm0, [ARG + _aesarg_out + 16*0]
	vpaddq	xmm6, xmm0, [ARG + _aesarg_out + 16*1]
	vpaddq	xmm7, xmm0, [ARG + _aesarg_out + 16*2]
	vpaddq	xmm8, xmm0, [ARG + _aesarg_out + 16*3]
	vmovdqa	[ARG + _aesarg_out + 16*0], xmm5
	vmovdqa	[ARG + _aesarg_out + 16*1], xmm6
	vmovdqa	[ARG + _aesarg_out + 16*2], xmm7
	vmovdqa	[ARG + _aesarg_out + 16*3], xmm8

;; XMMs are saved at a higher level
	mov	rbp, [GPR_SAVE_AREA + 8*0]

	add	rsp, STACK_size

	ret
498
;; Linux builds only: emit an empty .note.GNU-stack section
;; (conventionally marks the object as not needing an executable stack).
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif