[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / aes256_cntr_by4_sse.asm

;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "os.asm"
%include "memcpy.asm"

; routine to do AES256 CNTR enc/decrypt "by4"
; XMM registers are clobbered. Saving/restoring must be done at a higher level

; Name of the exported entry point. It defaults to aes_cntr_256_sse and can be
; overridden by defining AES_CNTR_256 before this point is reached.
%ifndef AES_CNTR_256
%define AES_CNTR_256 aes_cntr_256_sse
%endif

; Constants defined outside this file:
;   byteswap_const - pshufb shuffle mask applied to the counter block
;   ddq_add_N      - paddd addends used to step the counter by N blocks
;                    (exact contents are not visible here)
extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4

; CONCAT pastes two tokens together, e.g. CONCAT(xdata,2) -> xdata2 -> xmm2
%define CONCAT(a,b) a %+ b
; MOVDQ is the move used for caller buffers (input/output); the unaligned form
; is selected so those buffers need no particular alignment.
%define MOVDQ movdqu

; XMM register roles
;   xdata0..xdata7 - counter blocks / keystream / output blocks
;                    (this file uses at most xdata0..xdata3)
;   xcounter       - running counter block, kept in shuffled (pshufb'd) form
;   xbyteswap      - copy of byteswap_const
;   xkey0/4/8/12   - round keys 0, 4, 8 and 12, kept resident across blocks
;   xkeyA, xkeyB   - rotating temporaries for the remaining round keys; also
;                    reused to hold input blocks for the final XOR
%define xdata0	xmm0
%define xdata1	xmm1
%define xdata2	xmm2
%define xdata3	xmm3
%define xdata4	xmm4
%define xdata5	xmm5
%define xdata6	xmm6
%define xdata7	xmm7
%define xcounter xmm8
%define xbyteswap xmm9
%define xkey0 	xmm10
%define xkey4 	xmm11
%define xkey8 	xmm12
%define xkey12	xmm13
%define xkeyA	xmm14
%define xkeyB	xmm15

; Function arguments, in declaration order: in, IV, keys, out, num_bytes, iv_len
%ifdef LINUX
; Six integer arguments in the System V AMD64 argument registers
%define p_in	  rdi
%define p_IV	  rsi
%define p_keys	  rdx
%define p_out	  rcx
%define num_bytes r8
%define p_ivlen   r9
%else
; Non-Linux build (register/stack layout matches the Microsoft x64 convention):
; first four arguments in registers, the rest on the stack.
; num_bytes is copied from [rsp + 8*5] into r10 at function entry.
; p_ivlen is a memory operand relative to the entry rsp, so it may only be
; used before rsp is modified.
%define p_in	  rcx
%define p_IV	  rdx
%define p_keys	  r8
%define p_out	  r9
%define num_bytes r10
%define p_ivlen   qword [rsp + 8*6]
%endif

; tmp   - general purpose scratch register
; p_tmp - address expression of the 16-byte scratch buffer inside the STACK
;         frame; only meaningful once that frame has been set up on rsp
%define tmp	r11
%define p_tmp	rsp + _buffer

; do_aes_load num_in_par
; Runs do_aes on %1 blocks and (re)loads the resident round keys
; xkey0/xkey4/xkey8/xkey12 from p_keys while doing so.
%macro do_aes_load 1
	do_aes %1, 1
%endmacro

; do_aes_noload num_in_par
; Runs do_aes on %1 blocks assuming xkey0/xkey4/xkey8/xkey12 already hold
; round keys 0, 4, 8 and 12.
%macro do_aes_noload 1
	do_aes %1, 0
%endmacro


; do_aes num_in_par load_keys
;
; Encrypts/decrypts %1 consecutive 16-byte blocks in counter mode: builds %1
; counter blocks from xcounter, runs them through the 14 AES-256 rounds and
; XORs the result with the input.
;   %1 (num_in_par): number of blocks handled in parallel; this file invokes
;                    the macro with 1, 2, 3 and 4
;   %2 (load_keys):  non-zero => load xkey0/xkey4/xkey8/xkey12 from p_keys;
;                    zero     => they are expected to be loaded already
; In:    xcounter (shuffled counter), xbyteswap, p_in, p_out, p_keys
; Out:   %1*16 bytes stored at [p_out]; xcounter stepped by ddq_add_<%1>
; Clobb: xdata0..xdata<%1-1>, xkeyA, xkeyB, flags, assembler variables i and j
;        (plus xkey0/xkey4/xkey8/xkey12 when load_keys is non-zero)
; Note:  the key schedule is read with movdqa, so p_keys must be 16-byte
;        aligned; p_in/p_out are accessed with MOVDQ and may be unaligned.
; This increments p_in, but not p_out
%macro do_aes 2
%define %%by %1
%define %%load_keys %2

%if (%%load_keys)
	movdqa	xkey0, [p_keys + 0*16]
%endif

	; Build the counter blocks: block i = shuffle(xcounter + i)
	movdqa	xdata0, xcounter
	pshufb	xdata0, xbyteswap
%assign i 1
%rep (%%by - 1)
	movdqa	CONCAT(xdata,i), xcounter
	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
	pshufb	CONCAT(xdata,i), xbyteswap
%assign i (i + 1)
%endrep

	; From here on each round key is loaded one step ahead of its use,
	; alternating between xkeyA and xkeyB.
	movdqa	xkeyA, [p_keys + 1*16]

	; Round 0 (initial whitening); xcounter is stepped past the blocks
	; consumed by this invocation in between.
	pxor	xdata0, xkey0
	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]
%assign i 1
%rep (%%by - 1)
	pxor	CONCAT(xdata,i), xkey0
%assign i (i + 1)
%endrep

	movdqa	xkeyB, [p_keys + 2*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 1
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 3*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyB		; key 2
%assign i (i+1)
%endrep

	; Advance the input pointer early; the input loads at the end of the
	; macro compensate with a "- 16*%%by" displacement.
	add	p_in, 16*%%by

%if (%%load_keys)
	movdqa	xkey4, [p_keys + 4*16]
%endif
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 3
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 5*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkey4		; key 4
%assign i (i+1)
%endrep

	movdqa	xkeyB, [p_keys + 6*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 5
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 7*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyB		; key 6
%assign i (i+1)
%endrep

%if (%%load_keys)
	movdqa	xkey8, [p_keys + 8*16]
%endif
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 7
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 9*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkey8		; key 8
%assign i (i+1)
%endrep

	movdqa	xkeyB, [p_keys + 10*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 9
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 11*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyB		; key 10
%assign i (i+1)
%endrep

%if (%%load_keys)
	movdqa	xkey12, [p_keys + 12*16]
%endif
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 11
%assign i (i+1)
%endrep

	movdqa	xkeyA, [p_keys + 13*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkey12	; key 12
%assign i (i+1)
%endrep

	movdqa	xkeyB, [p_keys + 14*16]
%assign i 0
%rep %%by
	aesenc	CONCAT(xdata,i), xkeyA		; key 13
%assign i (i+1)
%endrep

%assign i 0
%rep %%by
	aesenclast	CONCAT(xdata,i), xkeyB	; key 14
%assign i (i+1)
%endrep

	; xdata0..xdata<by-1> now hold the keystream. XOR it with the input,
	; two blocks per step; xkeyA/xkeyB are free again and serve as
	; temporaries for the input blocks.
%assign i 0
%rep (%%by / 2)
%assign j (i+1)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
	pxor	CONCAT(xdata,i), xkeyA
	pxor	CONCAT(xdata,j), xkeyB
%assign i (i+2)
%endrep
	; Odd block count: one block is left over after the pairs above
%if (i < %%by)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	pxor	CONCAT(xdata,i), xkeyA
%endif

	; Store the result; p_out is left for the caller to advance
%assign i 0
%rep %%by
	MOVDQ	[p_out  + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro

; Stack frame used only by the partial-block path of AES_CNTR_256.
;   _buffer   - 16-byte scratch block (2 qwords); it sits at offset 0, so it is
;               16-byte aligned whenever the frame base is
;   _rsp_save - caller's rsp, saved so it can be restored after the frame
;               base has been rounded down for alignment
; STACK_size, generated by the struc, is the size subtracted from rsp.
struc STACK
_buffer:	resq	2
_rsp_save:	resq	1
endstruc

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .text

;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
;;
;; AES-256 counter mode encrypt/decrypt of num_bytes bytes from in to out.
;;   in        - input buffer (any alignment)
;;   IV        - 16-byte counter block when bit 4 of iv_len is set (iv_len == 16),
;;               otherwise 12 bytes (nonce + ESP IV); the block counter then
;;               starts at big-endian 1
;;   keys      - expanded key schedule, round keys 0..14, 16 bytes each;
;;               read with aligned loads, so it must be 16-byte aligned
;;   out       - output buffer (any alignment)
;;   num_bytes - length in bytes; need not be a multiple of 16
;;   iv_len    - IV length in bytes; only bit 4 is examined
;; Returns nothing; the IV in memory is not updated.
;; Clobbers: rax, tmp (r11), the p_in/p_out/num_bytes registers, flags,
;;           xmm0-xmm3 and xmm8-xmm15 (not saved here, see the note at the
;;           top of the file).
;; Processing order: first the 1..3 "odd" whole blocks (block count mod 4),
;; then groups of 4 blocks, then a trailing partial block of 1..15 bytes.
align 32
MKGLOBAL(AES_CNTR_256,function,internal)
AES_CNTR_256:

%ifndef LINUX
	; 5th argument is passed on the stack in this configuration
	mov	num_bytes, [rsp + 8*5]
%endif

	movdqa	xbyteswap, [rel byteswap_const]
	; Only bit 4 of iv_len is tested: set => 16-byte IV, clear => 12-byte IV
        test    p_ivlen, 16
        jnz     iv_is_16_bytes
        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
	; (0x01000000 as a little-endian dword is the byte sequence 00 00 00 01)
        mov     DWORD(tmp), 0x01000000
        pinsrq  xcounter, [p_IV], 0
        pinsrd  xcounter, [p_IV + 8], 2
        pinsrd  xcounter, DWORD(tmp), 3
bswap_iv:
	; Keep the counter in shuffled form so that paddd can step it; do_aes
	; applies the same shuffle again before encrypting.
	pshufb	xcounter, xbyteswap

	; tmp = byte count of the 0..3 whole blocks left over after grouping
	; the whole blocks by four (bits 4-5 of the length)
	mov	tmp, num_bytes
	and	tmp, 3*16
	jz	chk             ; no leftover whole blocks to handle first

	; tmp is 16, 32 or 48 here (1, 2 or 3 blocks)
	cmp	tmp, 2*16
	jg	eq3
	je	eq2
eq1:
	do_aes_load	1
	add	p_out, 1*16
	jmp	chk

eq2:
	do_aes_load	2
	add	p_out, 2*16
	jmp	chk

eq3:
	do_aes_load	3
	add	p_out, 3*16
	; fall through to chk
chk:
	; Drop the bytes handled above (clear bits 4-5). What remains is
	; 64*k + r with r = 0..15.
	and	num_bytes, ~(3*16)
	jz	do_return2
	; Below 16 means k == 0: only a partial block is left
        cmp	num_bytes, 16
        jb	last

	; process multiples of 4 blocks
	; (do_aes_noload expects the four resident round keys to be loaded)
	movdqa	xkey0, [p_keys + 0*16]
	movdqa	xkey4, [p_keys + 4*16]
	movdqa	xkey8, [p_keys + 8*16]
	movdqa	xkey12, [p_keys + 12*16]
	jmp	main_loop2

align 32
main_loop2:
	; at least 4 whole blocks (64 bytes) remain here
	do_aes_noload	4
	add	p_out,	4*16
	sub	num_bytes, 4*16
        cmp	num_bytes, 4*16
	jae	main_loop2

	; num_bytes is now 0..15 (bits 4-5 were cleared at chk)
	test	num_bytes, 15	; partial bytes to be processed?
	jnz	last

do_return2:
	; don't return updated IV
;	pshufb	xcounter, xbyteswap
;	movdqu	[p_IV], xcounter
	ret

last:
	;; Code dealing with the partial block cases
	;; (num_bytes is 1..15 here; p_in/p_out point at the partial block)
	; reserve 16 byte aligned buffer on stack
        mov	rax, rsp
        sub	rsp, STACK_size
        and	rsp, -16
	mov	[rsp + _rsp_save], rax ; save SP

	; copy input bytes into scratch buffer
	; (tmp and rax are handed to the copy macro as temporaries; rax is
	; free again because the old rsp has just been stored in the frame)
	memcpy_sse_16_1	p_tmp, p_in, num_bytes, tmp, rax
	; Encryption of a single partial block (p_tmp)
	; Shuffle the counter back into the byte order that gets encrypted
        pshufb	xcounter, xbyteswap
        movdqa	xdata0, xcounter
        pxor    xdata0, [p_keys + 16*0]
	; rounds 1..13; i ends up at 14 for the final round below
%assign i 1
%rep 13
        aesenc  xdata0, [p_keys + 16*i]
%assign i (i+1)
%endrep
	; created keystream
        aesenclast xdata0, [p_keys + 16*i]
	; xor keystream with the message (scratch)
	; (scratch bytes past num_bytes were never written by the copy above;
	; they are XORed too but are not copied out below)
        pxor    xdata0, [p_tmp]
	movdqa	[p_tmp], xdata0
	; copy result into the output buffer
	memcpy_sse_16_1	p_out, p_tmp, num_bytes, tmp, rax
	; remove the stack frame
	mov	rsp, [rsp + _rsp_save]	; original SP
	jmp	do_return2

iv_is_16_bytes:
        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
        movdqu  xcounter, [p_IV]
        jmp     bswap_iv

; Linux builds only: emit an empty .note.GNU-stack section flagged noexec.
; NOTE(review): presumably this is the usual marker telling the linker that
; this object does not need an executable stack - confirm against the build.
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
Commit	Line	Data
11fdf7f2 TL	1	;;
	2	;; Copyright (c) 2012-2018, Intel Corporation
	3	;;
	4	;; Redistribution and use in source and binary forms, with or without
	5	;; modification, are permitted provided that the following conditions are met:
	6	;;
	7	;; * Redistributions of source code must retain the above copyright notice,
	8	;; this list of conditions and the following disclaimer.
	9	;; * Redistributions in binary form must reproduce the above copyright
	10	;; notice, this list of conditions and the following disclaimer in the
	11	;; documentation and/or other materials provided with the distribution.
	12	;; * Neither the name of Intel Corporation nor the names of its contributors
	13	;; may be used to endorse or promote products derived from this software
	14	;; without specific prior written permission.
	15	;;
	16	;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	17	;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	18	;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	19	;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
	20	;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	21	;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	22	;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	23	;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	24	;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	25	;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	26	;;
	27
	28	%include "os.asm"
	29	%include "memcpy.asm"
	30
	31	; routine to do AES256 CNTR enc/decrypt "by4"
	32	; XMM registers are clobbered. Saving/restoring must be done at a higher level
	33
9f95a23c TL	34	%ifndef AES_CNTR_256
	35	%define AES_CNTR_256 aes_cntr_256_sse
	36	%endif
	37
11fdf7f2 TL	38	extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
	39
	40	%define CONCAT(a,b) a %+ b
	41	%define MOVDQ movdqu
	42
	43	%define xdata0 xmm0
	44	%define xdata1 xmm1
	45	%define xdata2 xmm2
	46	%define xdata3 xmm3
	47	%define xdata4 xmm4
	48	%define xdata5 xmm5
	49	%define xdata6 xmm6
	50	%define xdata7 xmm7
	51	%define xcounter xmm8
	52	%define xbyteswap xmm9
	53	%define xkey0 xmm10
	54	%define xkey4 xmm11
	55	%define xkey8 xmm12
	56	%define xkey12 xmm13
	57	%define xkeyA xmm14
	58	%define xkeyB xmm15
	59
	60	%ifdef LINUX
	61	%define p_in rdi
	62	%define p_IV rsi
	63	%define p_keys rdx
	64	%define p_out rcx
	65	%define num_bytes r8
	66	%define p_ivlen r9
	67	%else
	68	%define p_in rcx
	69	%define p_IV rdx
	70	%define p_keys r8
	71	%define p_out r9
	72	%define num_bytes r10
	73	%define p_ivlen qword [rsp + 8*6]
	74	%endif
	75
	76	%define tmp r11
	77	%define p_tmp rsp + _buffer
	78
	79	%macro do_aes_load 1
	80	do_aes %1, 1
	81	%endmacro
	82
	83	%macro do_aes_noload 1
	84	do_aes %1, 0
	85	%endmacro
	86
	87
	88	; do_aes num_in_par load_keys
	89	; This increments p_in, but not p_out
	90	%macro do_aes 2
	91	%define %%by %1
	92	%define %%load_keys %2
	93
	94	%if (%%load_keys)
	95	movdqa xkey0, [p_keys + 0*16]
	96	%endif
	97
	98	movdqa xdata0, xcounter
	99	pshufb xdata0, xbyteswap
	100	%assign i 1
	101	%rep (%%by - 1)
102	movdqa CONCAT(xdata,i), xcounter
103	paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
104	pshufb CONCAT(xdata,i), xbyteswap
105	%assign i (i + 1)
106	%endrep
107
108	movdqa xkeyA, [p_keys + 1*16]
109
110	pxor xdata0, xkey0
111	paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
112	%assign i 1
113	%rep (%%by - 1)
114	pxor CONCAT(xdata,i), xkey0
115	%assign i (i + 1)
116	%endrep
117
118	movdqa xkeyB, [p_keys + 2*16]
119	%assign i 0
120	%rep %%by
121	aesenc CONCAT(xdata,i), xkeyA ; key 1
122	%assign i (i+1)
123	%endrep
124
125	movdqa xkeyA, [p_keys + 3*16]
126	%assign i 0
127	%rep %%by
128	aesenc CONCAT(xdata,i), xkeyB ; key 2
129	%assign i (i+1)
130	%endrep
131
132	add p_in, 16*%%by
133
134	%if (%%load_keys)
135	movdqa xkey4, [p_keys + 4*16]
136	%endif
137	%assign i 0
138	%rep %%by
139	aesenc CONCAT(xdata,i), xkeyA ; key 3
140	%assign i (i+1)
141	%endrep
142
143	movdqa xkeyA, [p_keys + 5*16]
144	%assign i 0
145	%rep %%by
146	aesenc CONCAT(xdata,i), xkey4 ; key 4
147	%assign i (i+1)
148	%endrep
149
150	movdqa xkeyB, [p_keys + 6*16]
151	%assign i 0
152	%rep %%by
153	aesenc CONCAT(xdata,i), xkeyA ; key 5
154	%assign i (i+1)
155	%endrep
156
157	movdqa xkeyA, [p_keys + 7*16]
158	%assign i 0
159	%rep %%by
160	aesenc CONCAT(xdata,i), xkeyB ; key 6
161	%assign i (i+1)
162	%endrep
163
164	%if (%%load_keys)
165	movdqa xkey8, [p_keys + 8*16]
166	%endif
167	%assign i 0
168	%rep %%by
169	aesenc CONCAT(xdata,i), xkeyA ; key 7
170	%assign i (i+1)
171	%endrep
172
173	movdqa xkeyA, [p_keys + 9*16]
174	%assign i 0
175	%rep %%by
176	aesenc CONCAT(xdata,i), xkey8 ; key 8
177	%assign i (i+1)
178	%endrep
179
180	movdqa xkeyB, [p_keys + 10*16]
181	%assign i 0
182	%rep %%by
183	aesenc CONCAT(xdata,i), xkeyA ; key 9
184	%assign i (i+1)
185	%endrep
186
187	movdqa xkeyA, [p_keys + 11*16]
188	%assign i 0
189	%rep %%by
190	aesenc CONCAT(xdata,i), xkeyB ; key 10
191	%assign i (i+1)
192	%endrep
193
194	%if (%%load_keys)
195	movdqa xkey12, [p_keys + 12*16]
196	%endif
197	%assign i 0
198	%rep %%by
199	aesenc CONCAT(xdata,i), xkeyA ; key 11
200	%assign i (i+1)
201	%endrep
202
203	movdqa xkeyA, [p_keys + 13*16]
204	%assign i 0
205	%rep %%by
206	aesenc CONCAT(xdata,i), xkey12 ; key 12
207	%assign i (i+1)
208	%endrep
209
210	movdqa xkeyB, [p_keys + 14*16]
211	%assign i 0
212	%rep %%by
213	aesenc CONCAT(xdata,i), xkeyA ; key 13
214	%assign i (i+1)
215	%endrep
216
217	%assign i 0
218	%rep %%by
219	aesenclast CONCAT(xdata,i), xkeyB ; key 14
220	%assign i (i+1)
221	%endrep
222
223	%assign i 0
224	%rep (%%by / 2)
225	%assign j (i+1)
	226	MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
	227	MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
228	pxor CONCAT(xdata,i), xkeyA
229	pxor CONCAT(xdata,j), xkeyB
230	%assign i (i+2)
231	%endrep
232	%if (i < %%by)
	233	MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
234	pxor CONCAT(xdata,i), xkeyA
235	%endif
236
237	%assign i 0
238	%rep %%by
239	MOVDQ [p_out + i*16], CONCAT(xdata,i)
240	%assign i (i+1)
241	%endrep
242	%endmacro
243
244	struc STACK
245	_buffer: resq 2
246	_rsp_save: resq 1
247	endstruc
248
249	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
250	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
251	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252	section .text
253
	254	;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
255	align 32
9f95a23c TL	256	MKGLOBAL(AES_CNTR_256,function,internal)
9f95a23c TL	257	AES_CNTR_256:
11fdf7f2 TL	258
	259	%ifndef LINUX
	260	mov num_bytes, [rsp + 8*5]
	261	%endif
	262
	263	movdqa xbyteswap, [rel byteswap_const]
	264	test p_ivlen, 16
	265	jnz iv_is_16_bytes
	266	; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
	267	mov DWORD(tmp), 0x01000000
	268	pinsrq xcounter, [p_IV], 0
	269	pinsrd xcounter, [p_IV + 8], 2
	270	pinsrd xcounter, DWORD(tmp), 3
	271	bswap_iv:
	272	pshufb xcounter, xbyteswap
	273
	274	mov tmp, num_bytes
	275	and tmp, 3*16
	276	jz chk ; x4 > or < 15 (not 3 lines)
	277
	278	; 1 <= tmp <= 3
	279	cmp tmp, 2*16
	280	jg eq3
	281	je eq2
	282	eq1:
	283	do_aes_load 1
	284	add p_out, 1*16
	285	jmp chk
	286
	287	eq2:
	288	do_aes_load 2
	289	add p_out, 2*16
	290	jmp chk
	291
	292	eq3:
	293	do_aes_load 3
	294	add p_out, 3*16
	295	; fall through to chk
	296	chk:
	297	and num_bytes, ~(3*16)
	298	jz do_return2
	299	cmp num_bytes, 16
	300	jb last
	301
	302	; process multiples of 4 blocks
	303	movdqa xkey0, [p_keys + 0*16]
	304	movdqa xkey4, [p_keys + 4*16]
	305	movdqa xkey8, [p_keys + 8*16]
	306	movdqa xkey12, [p_keys + 12*16]
	307	jmp main_loop2
	308
	309	align 32
	310	main_loop2:
	311	; num_bytes is a multiple of 4 and >0
	312	do_aes_noload 4
	313	add p_out, 4*16
	314	sub num_bytes, 4*16
	315	cmp num_bytes, 4*16
	316	jae main_loop2
	317
	318	test num_bytes, 15 ; partial bytes to be processed?
	319	jnz last
	320
	321	do_return2:
322	; don't return updated IV
323	; pshufb xcounter, xbyteswap
324	; movdqu [p_IV], xcounter
325	ret
326
327	last:
328	;; Code dealing with the partial block cases
329	; reserve 16 byte aligned buffer on stack
330	mov rax, rsp
331	sub rsp, STACK_size
332	and rsp, -16
333	mov [rsp + _rsp_save], rax ; save SP
334
335	; copy input bytes into scratch buffer
336	memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
337	; Encryption of a single partial block (p_tmp)
338	pshufb xcounter, xbyteswap
339	movdqa xdata0, xcounter
340	pxor xdata0, [p_keys + 16*0]
341	%assign i 1
342	%rep 13
343	aesenc xdata0, [p_keys + 16*i]
344	%assign i (i+1)
345	%endrep
346	; created keystream
347	aesenclast xdata0, [p_keys + 16*i]
348	; xor keystream with the message (scratch)
349	pxor xdata0, [p_tmp]
350	movdqa [p_tmp], xdata0
351	; copy result into the output buffer
352	memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
353	; remove the stack frame
354	mov rsp, [rsp + _rsp_save] ; original SP
355	jmp do_return2
356
357	iv_is_16_bytes:
358	; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
359	movdqu xcounter, [p_IV]
360	jmp bswap_iv
361
362	%ifdef LINUX
363	section .note.GNU-stack noalloc noexec nowrite progbits
364	%endif