2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ; routine to do AES128 CNTR enc/decrypt "by4"
32 ; XMM registers are clobbered. Saving/restoring must be done at a higher level
; Name of the routine emitted by this file; this is the symbol handed to MKGLOBAL.
35 %define AES_CNTR_128 aes_cntr_128_sse
; Constants defined outside this file. byteswap_const is used as a pshufb mask;
; ddq_add_N are added to the counter with paddd (presumably the values 1..4 --
; confirm against their definition).
38 extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
; Token-pasting helper: CONCAT(xdata,i) expands to xdata<i>, and
; CONCAT(ddq_add_,i) expands to ddq_add_<i>.
40 %define CONCAT(a,b) a %+ b
; Register that holds byteswap_const (loaded at function entry).
52 %define xbyteswap xmm9
; Stack operand for iv_len.
; NOTE(review): presumably the stack slot of the 6th argument for a calling
; convention that passes it on the stack; the conditional that selects this
; definition is not visible here -- confirm.
73 %define p_ivlen qword [rsp + 8*6]
; Address expression of the scratch buffer used for the trailing partial block;
; _buffer is an offset defined outside the visible lines.
76 %define p_tmp rsp + _buffer
; ---------------------------------------------------------------------------
; AES-128 counter-mode core macros.
; NOTE(review): only part of the macro text is visible in this view -- the
; %macro line of do_aes and the %if / %rep / %endmacro scaffolding are missing --
; so the comments below describe the visible instructions only. Lines written
; with CONCAT(xdata,i) are templates, presumably expanded once per block.
; MOVDQ is a move macro defined outside the visible lines.
; ---------------------------------------------------------------------------
83 %macro do_aes_noload 1
87 ; do_aes num_in_par load_keys
88 ; This increments p_in, but not p_out
91 %define %%load_keys %2
; Round key 0.
94 movdqa xkey0, [p_keys + 0*16]
; Block 0: copy of the running counter, shuffled with the byte-swap mask.
97 movdqa xdata0, xcounter
98 pshufb xdata0, xbyteswap
; Block i: running counter + ddq_add_<i>, then the same byte shuffle.
101 movdqa CONCAT(xdata,i), xcounter
102 paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
103 pshufb CONCAT(xdata,i), xbyteswap
; Fetch round key 1 ahead of its use.
107 movdqa xkeyA, [p_keys + 1*16]
; Advance the running counter by ddq_add_<%%by> for the next invocation.
; paddd adds per 32-bit lane, so no carry crosses a dword boundary.
110 paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
; Round 0: xor each counter block with round key 0.
113 pxor CONCAT(xdata,i), xkey0
; Rounds 1-9: each round key is loaded before the aesenc that consumes the
; previous one. xkeyA/xkeyB alternate as temporaries for keys 1, 2, 4, 5, 7, 8
; and 10; keys 3, 6 and 9 go to the dedicated registers xkey3/xkey6/xkey9.
117 movdqa xkeyB, [p_keys + 2*16]
120 aesenc CONCAT(xdata,i), xkeyA ; key 1
125 movdqa xkey3, [p_keys + 3*16]
129 aesenc CONCAT(xdata,i), xkeyB ; key 2
135 movdqa xkeyB, [p_keys + 4*16]
138 aesenc CONCAT(xdata,i), xkey3 ; key 3
142 movdqa xkeyA, [p_keys + 5*16]
145 aesenc CONCAT(xdata,i), xkeyB ; key 4
150 movdqa xkey6, [p_keys + 6*16]
154 aesenc CONCAT(xdata,i), xkeyA ; key 5
158 movdqa xkeyA, [p_keys + 7*16]
161 aesenc CONCAT(xdata,i), xkey6 ; key 6
165 movdqa xkeyB, [p_keys + 8*16]
168 aesenc CONCAT(xdata,i), xkeyA ; key 7
173 movdqa xkey9, [p_keys + 9*16]
177 aesenc CONCAT(xdata,i), xkeyB ; key 8
181 movdqa xkeyB, [p_keys + 10*16]
184 aesenc CONCAT(xdata,i), xkey9 ; key 9
; Final round with round key 10; xdata<i> now holds the keystream block.
190 aesenclast CONCAT(xdata,i), xkeyB ; key 10
; XOR the keystream with the input, two blocks (i and j) at a time. xkeyA/xkeyB
; are reused as input temporaries now that the rounds are done. The negative
; 16*%%by displacement implies p_in has already been advanced by %%by blocks
; (the increment itself is not visible here).
197 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
198 MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
199 pxor CONCAT(xdata,i), xkeyA
200 pxor CONCAT(xdata,j), xkeyB
; Single-block form of the same XOR (presumably for an odd block count --
; the selecting conditional is not visible).
204 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
205 pxor CONCAT(xdata,i), xkeyA
; Store result block i at p_out + i*16.
210 MOVDQ [p_out + i*16], CONCAT(xdata,i)
220 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
221 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ---------------------------------------------------------------------------
; AES_CNTR_128: AES-128 counter-mode routine (prototype on the next line).
; Visible flow: build the counter block from the IV, process 1-3 blocks with
; do_aes_load, then multiples of 4 blocks, then a trailing partial block via
; a scratch buffer on the stack.
; NOTE(review): labels, loop branches, stack-frame setup and ret are not
; visible in this view; the comments describe the visible instructions only.
; ---------------------------------------------------------------------------
225 ;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
227 MKGLOBAL(AES_CNTR_128,function,internal)
; The fifth argument (num_bytes) is fetched from the stack.
231 mov num_bytes, [rsp + 8*5] ; arg5
; Load the pshufb byte-swap mask.
234 movdqa xbyteswap, [rel byteswap_const]
237 ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
; 0x01000000 placed in a dword lane is the byte sequence 00 00 00 01.
238 mov DWORD(tmp), 0x01000000
; xcounter qword 0 = IV bytes 0-7, dword 2 = IV bytes 8-11, dword 3 = tmp.
239 pinsrq xcounter, [p_IV], 0
240 pinsrd xcounter, [p_IV + 8], 2
241 pinsrd xcounter, DWORD(tmp), 3
; Shuffle the counter with the byte-swap mask; the macros add to xcounter in
; this form and shuffle each copy again before encrypting it.
243 pshufb xcounter, xbyteswap
247 jz chk ; skip the 1-3 block cases (presumably: a multiple of 4 blocks, or fewer than 16 bytes; the flag-setting instruction is not visible here)
; Leading 1, 2 or 3 blocks; the _load variant is used for these cases.
254 do_aes_load 1 ; 1 block
259 do_aes_load 2 ; 2 blocks
264 do_aes_load 3 ; 3 blocks
266 ; fall through to chk
; Clear bits 4-5 of num_bytes, i.e. drop the 1-3 block part of the count.
268 and num_bytes, ~(3*16)
273 ; process multiples of 4 blocks
; Preload round keys 0, 3, 6 and 9 into their dedicated registers
; (presumably so the no-load variant of do_aes can skip them).
274 movdqa xkey0, [p_keys + 0*16]
275 movdqa xkey3, [p_keys + 3*16]
276 movdqa xkey6, [p_keys + 6*16]
277 movdqa xkey9, [p_keys + 9*16]
282 ; num_bytes is a multiple of 4 blocks + partial bytes
; A non-zero low nibble means a trailing partial block remains.
289 test num_bytes, 15 ; partial bytes to be processed?
293 ; don't return updated IV
294 ; pshufb xcounter, xbyteswap
295 ; movdqu [p_IV], xcounter
299 ;; Code dealing with the partial block cases
300 ; reserve 16 byte aligned buffer on the stack
; rax is stored in the frame's _rsp_save slot and reloaded into rsp at the end
; (presumably rax holds the original rsp; its setup is not visible here).
304 mov [rsp + _rsp_save], rax ; save SP
306 ; copy input bytes into scratch buffer
; memcpy_sse_16_1 is a macro defined outside this file; tmp and rax appear to
; be passed as scratch registers -- confirm against its definition.
307 memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
308 ; Encryption of a single partial block (p_tmp)
; Shuffle the counter with the byte-swap mask again and encrypt it as one block.
309 pshufb xcounter, xbyteswap
310 movdqa xdata0, xcounter
; Round 0: xor with round key 0.
311 pxor xdata0, [p_keys + 16*0]
; Middle rounds with round key i (presumably repeated by a %rep that is not visible).
314 aesenc xdata0, [p_keys + 16*i]
; Final round with round key i.
318 aesenclast xdata0, [p_keys + 16*i]
319 ; xor keystream with the message (scratch)
; NOTE(review): the xor instruction itself is not visible in this view; only
; the store of xdata0 back to the scratch buffer is.
321 movdqa [p_tmp], xdata0
322 ; copy result into the output buffer
323 memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
324 ; remove the stack frame
325 mov rsp, [rsp + _rsp_save] ; original SP
; Alternative counter setup: the whole 16-byte block is read (unaligned) from p_IV.
; The branch that selects this path is not visible here.
329 ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
330 movdqu xcounter, [p_IV]
; Empty, non-executable .note.GNU-stack section: tells the GNU linker that this
; object does not need an executable stack.
334 section .note.GNU-stack noalloc noexec nowrite progbits