]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - zfs/module/icp/asm-x86_64/aes/aes_amd64.S
UBUNTU: SAUCE: (noup) Update spl to 0.7.3-1ubuntu1, zfs to 0.7.3-1ubuntu1
[mirror_ubuntu-bionic-kernel.git] / zfs / module / icp / asm-x86_64 / aes / aes_amd64.S
CommitLineData
86e3c28a
CIK
1/*
2 * ---------------------------------------------------------------------------
3 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
4 *
5 * LICENSE TERMS
6 *
7 * The free distribution and use of this software is allowed (with or without
8 * changes) provided that:
9 *
10 * 1. source code distributions include the above copyright notice, this
11 * list of conditions and the following disclaimer;
12 *
13 * 2. binary distributions include the above copyright notice, this list
14 * of conditions and the following disclaimer in their documentation;
15 *
16 * 3. the name of the copyright holder is not used to endorse products
17 * built using this software without specific written permission.
18 *
19 * DISCLAIMER
20 *
21 * This software is provided 'as is' with no explicit or implied warranties
22 * in respect of its properties, including, but not limited to, correctness
23 * and/or fitness for purpose.
24 * ---------------------------------------------------------------------------
25 * Issue 20/12/2007
26 *
27 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
28 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
29 * Some of the techniques used in this implementation are the result of
30 * suggestions made by him for which I am most grateful.
31 *
32 * An AES implementation for AMD64 processors using the YASM assembler. This
33 * implementation provides only encryption, decryption and hence requires key
34 * scheduling support in C. It uses 8k bytes of tables but its encryption and
35 * decryption performance is very close to that obtained using large tables.
36 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
37 * which are as follows:
38 * ms windows gnu/linux/opensolaris os
39 *
40 * in_blk rcx rdi
41 * out_blk rdx rsi
42 * context (cx) r8 rdx
43 *
44 * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
45 * registers rdi - on both
46 *
47 * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
48 * registers - rdi on both
49 *
50 * The convention used here is that for gnu/linux/opensolaris os.
51 *
52 * This code provides the standard AES block size (128 bits, 16 bytes) and the
53 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
54 * interface as my C implementation. It uses the Microsoft C AMD64 calling
55 * conventions in which the three parameters are placed in rcx, rdx and r8
56 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
57 *
58 * OpenSolaris Note:
59 * Modified to use GNU/Linux/Solaris calling conventions.
60 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
61 *
62 * AES_RETURN aes_encrypt(const unsigned char in_blk[],
63 * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
64 *
65 * AES_RETURN aes_decrypt(const unsigned char in_blk[],
66 * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
67 *
68 * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
69 * const aes_encrypt_ctx cx[1])/
70 *
71 * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
72 * const aes_decrypt_ctx cx[1])/
73 *
74 * AES_RETURN aes_encrypt_key(const unsigned char key[],
75 * unsigned int len, const aes_encrypt_ctx cx[1])/
76 *
77 * AES_RETURN aes_decrypt_key(const unsigned char key[],
78 * unsigned int len, const aes_decrypt_ctx cx[1])/
79 *
80 * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
81 * either bits or bytes.
82 *
83 * Comment in/out the following lines to obtain the desired subroutines. These
84 * selections MUST match those in the C header file aesopt.h
85 */
/*
 * Build-time selections.
 *
 * AES_REV_DKS: when defined, ik_ref() indexes the decryption key schedule
 * downwards from the end (the same way fk_ref() does for encryption) and
 * aes_decrypt_amd64 takes its initial whitening key from the start of the
 * schedule; when undefined, ik_ref() indexes upwards and the whitening key
 * is taken from the end of the schedule.
 *
 * LAST_ROUND_TABLES: when defined, fl_rnd()/il_rnd() use a second 2048-byte
 * table (emitted with the w8 entry macro right after the main table);
 * when undefined they use byte loads from the main table plus rotates.
 */
86#define AES_REV_DKS /* define if key decryption schedule is reversed */
87
88#define LAST_ROUND_TABLES /* define for the faster version using extra tables */
89
90/*
91 * The encryption key schedule has the following in memory layout where N is the
92 * number of rounds (10, 12 or 14):
93 *
94 * lo: | input key (round 0) | / each round is four 32-bit words
95 * | encryption round 1 |
96 * | encryption round 2 |
97 * ....
98 * | encryption round N-1 |
99 * hi: | encryption round N |
100 *
101 * The decryption key schedule is normally set up so that it has the same
102 * layout as above by actually reversing the order of the encryption key
103 * schedule in memory (this happens when AES_REV_DKS is set):
104 *
105 * lo: | decryption round 0 | = | encryption round N |
106 * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
107 * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
108 * .... ....
109 * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
110 * hi: | decryption round N | = | input key (round 0) |
111 *
112 * with rounds except the first and last modified using inv_mix_column()
113 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
114 * encryption so that it has to be accessed in reverse when used for
115 * decryption (although the inverse mix column modifications are done)
116 *
117 * lo: | decryption round 0 | = | input key (round 0) |
118 * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
119 * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
120 * .... ....
121 * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
122 * hi: | decryption round N | = | encryption round N |
123 *
124 * This layout is faster when the assembler key scheduling provided here
125 * is used.
126 *
127 * End of user defines
128 */
129
130/*
131 * ---------------------------------------------------------------------------
132 * OpenSolaris OS modifications
133 *
134 * This source originates from Brian Gladman file aes_amd64.asm
135 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
136 * with these changes:
137 *
138 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
139 * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
140 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
141 *
142 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
143 *
144 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
145 *
146 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
147 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
148 * and "[register+offset]", addressing changed to "offset(register)",
149 * parenthesis in constant expressions "()" changed to square brackets "[]",
150 * "." removed from local (numeric) labels, and other changes.
151 * Examples:
152 * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
153 * mov rax,(4*20h) mov $[4*0x20],%rax
154 * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
155 * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
156 * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
157 *
158 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
159 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
160 * definitions for lint.
161 *
162 * 6. Renamed functions and reordered parameters to match OpenSolaris:
163 * Original Gladman interface:
164 * int aes_encrypt(const unsigned char *in,
165 * unsigned char *out, const aes_encrypt_ctx cx[1])/
166 * int aes_decrypt(const unsigned char *in,
167 * unsigned char *out, const aes_decrypt_ctx cx[1])/
168 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
169 * and a union type, inf., containing inf.l, a uint32_t and
170 * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is
171 * used and contains the number of rounds * 16, where the number of rounds is
172 * 10, 12, or 14.
173 *
174 * OpenSolaris OS interface:
175 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
176 * const uint32_t pt[4], uint32_t ct[4])/
177 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
178 * const uint32_t ct[4], uint32_t pt[4])/
179 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
180 * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
181 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
182 * ct is crypto text, and MAX_AES_NR is 14.
183 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
184 */
185
186#if defined(lint) || defined(__lint)
187
188#include <sys/types.h>
/*
 * Empty C definitions seen only when lint/__lint is defined; they give lint
 * the prototypes of the two assembly entry points and generate no real code.
 * Argument order is: key schedule words, number of rounds, input block,
 * output block (pt -> ct for encrypt, ct -> pt for decrypt).
 */
189/* ARGSUSED */
190void
191aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
192 uint32_t ct[4]) {
193}
194/* ARGSUSED */
195void
196aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
197 uint32_t pt[4]) {
198}
199
200
201#else
202
203#define _ASM
204#include <sys/asm_linkage.h>
205
/*
 * KS_LENGTH: number of 32-bit words in the key schedule array. It is only
 * used in the GLADMAN_INTERFACE paths, where 4*KS_LENGTH(kptr) addresses the
 * field that follows the schedule in the context structure.
 */
206#define KS_LENGTH 60
207
/*
 * 32-bit ("d") aliases for the legacy general-purpose registers.
 * NOTE(review): none of these aliases is referenced in the code visible in
 * this file; they look like leftovers from the yasm original.
 */
208#define raxd eax
209#define rdxd edx
210#define rcxd ecx
211#define rbxd ebx
212#define rsid esi
213#define rdid edi
214
/* 8-bit ("b") aliases for the low byte of the same registers. */
215#define raxb al
216#define rdxb dl
217#define rcxb cl
218#define rbxb bl
219#define rsib sil
220#define rdib dil
221
/*
 * Assembly-time constant expressions used to build the lookup tables.
 * Square brackets act as grouping parentheses here (see change note 4 in
 * the header comment). x is expected to be a byte constant (0x00..0xff).
 *
 * f2/f4/f8 shift x left by 1/2/3 bits and, for every bit that was shifted
 * out of the low byte, XOR in the correspondingly shifted multiple of 0x11b
 * so that the result fits in a byte again.
 */
222// finite field multiplies by {02}, {04} and {08}
223
224#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
225#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
226#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
227
/*
 * Multiplies by {03}, {09}, {0b}, {0d} and {0e}, each formed by XOR-ing the
 * power-of-two multiples above (e.g. f3 = f2 ^ x, fe = f8 ^ f4 ^ f2).
 */
228// finite field multiplies required in table generation
229
230#define f3(x) [[f2(x)] ^ [x]]
231#define f9(x) [[f8(x)] ^ [x]]
232#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
233#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
234#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
235
/*
 * Entry generators: each expands one byte value x into the 8 bytes of one
 * table entry (256 entries * 8 bytes = 2048 bytes per table).
 *   u8: main encryption table entry (f2, x, x, f3, repeated twice).
 *   v8: main decryption table entry (fe, f9, fd, fb, fe, f9, fd, x); note
 *       the final byte is the plain value x, which is what tab_i() reads.
 *   w8: last-round table entry (x, 0, 0, 0, x, 0, 0, 0).
 * Storing the 4-byte pattern twice lets 4-byte loads at byte offsets 0..3
 * of an entry return the same word at different byte rotations.
 */
236// macros for expanding S-box data
237
238#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
239#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
240#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0
241
/*
 * enc_vals(x): emits the 256-entry encryption table. x names an entry
 * generator macro (u8 or w8 in this file); it is applied to each of the 256
 * S-box byte values below, in order, 8 values per .byte directive.
 * With an 8-byte generator the expansion is 2048 bytes long.
 */
242#define enc_vals(x) \
243 .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
244 .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
245 .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
246 .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
247 .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
248 .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
249 .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
250 .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
251 .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
252 .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
253 .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
254 .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
255 .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
256 .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
257 .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
258 .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
259 .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
260 .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
261 .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
262 .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
263 .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
264 .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
265 .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
266 .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
267 .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
268 .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
269 .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
270 .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
271 .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
272 .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
273 .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
274 .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
275
/*
 * dec_vals(x): emits the 256-entry decryption table. x names an entry
 * generator macro (v8 or w8 in this file); it is applied to each of the 256
 * inverse S-box byte values below, in order, 8 values per .byte directive.
 * With an 8-byte generator the expansion is 2048 bytes long.
 */
276#define dec_vals(x) \
277 .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
278 .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
279 .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
280 .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
281 .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
282 .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
283 .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
284 .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
285 .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
286 .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
287 .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
288 .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
289 .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
290 .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
291 .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
292 .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
293 .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
294 .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
295 .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
296 .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
297 .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
298 .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
299 .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
300 .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
301 .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
302 .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
303 .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
304 .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
305 .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
306 .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
307 .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
308 .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
309
/*
 * Register roles and key-schedule addressing shared by the round macros.
 *
 * fk_ref(x, y) is the memory operand for 32-bit word y of the round key that
 * lies 16*x bytes below (kptr + fofs). The arguments are substituted without
 * parentheses, so they must be plain integer literals, as they are at every
 * use in this file.
 */
310#define tptr %rbp /* table pointer */
311#define kptr %r8 /* key schedule pointer */
312#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
313#define fk_ref(x, y) -16*x+fofs+4*y(kptr)
314
/*
 * ik_ref(x, y) is the decryption counterpart of fk_ref(). With AES_REV_DKS
 * it counts downwards from (kptr + rofs) exactly like fk_ref(); without it,
 * it counts upwards (16*x) and rofs is negative.
 */
315#ifdef AES_REV_DKS
316#define rofs 128
317#define ik_ref(x, y) -16*x+rofs+4*y(kptr)
318
319#else
320#define rofs -128
321#define ik_ref(x, y) 16*x+rofs+4*y(kptr)
322#endif /* AES_REV_DKS */
323
/*
 * Table operands. x is a 64-bit register holding a byte index; the scale of
 * 8 matches the 8-byte table entries. tab_0..tab_3 address byte offsets
 * 0, 3, 2 and 1 of the entry and are used with 32-bit XORs by the round
 * macros. tab_f (offset 1) and tab_i (offset 7) are used with byte loads by
 * the non-LAST_ROUND_TABLES variants of fl_rnd()/il_rnd(); with the u8 and
 * v8 entry layouts those offsets hold the plain substituted byte.
 */
324#define tab_0(x) (tptr,x,8)
325#define tab_1(x) 3(tptr,x,8)
326#define tab_2(x) 2(tptr,x,8)
327#define tab_3(x) 1(tptr,x,8)
328#define tab_f(x) 1(tptr,x,8)
329#define tab_i(x) 7(tptr,x,8)
330
/*
 * ff_col: push one 32-bit state word through the forward table.
 *   blo/bhi	low and second byte registers of the word (e.g. %al/%ah)
 *   wreg	the 32-bit register itself; shifted right by 16 in place
 *   d0..d3	32-bit accumulators that receive the tab_0..tab_3 lookups
 *		for bytes 0..3 of the word, respectively
 * Clobbers %rsi, %rdi and the flags.
 */
#define	ff_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/*
 * ff_rnd: one normal forward round.
 * In/out: state words in %eax, %ebx, %ecx, %edx.
 * p1..p4 are 32-bit scratch registers: they are loaded with the four words
 * of round key fk_ref(round, 0..3), accumulate the table lookups, and are
 * finally copied back to %eax..%edx.
 * Clobbers p1..p4, %rsi, %rdi and the flags.
 */
#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	ff_col(%al, %ah, %eax, p1, p4, p3, p2); \
	\
	ff_col(%bl, %bh, %ebx, p2, p1, p4, p3); \
	\
	ff_col(%cl, %ch, %ecx, p3, p2, p1, p4); \
	\
	ff_col(%dl, %dh, %edx, p4, p3, p2, p1); \
	\
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx
381
/*
 * fl_rnd: last forward round.
 * In:  state words in %eax, %ebx, %ecx, %edx (destroyed: each is shifted
 *      right by 16 while it is consumed).
 * Out: result words in p1..p4 (they are NOT copied back to %eax..%edx).
 * p1..p4 are first loaded with round key fk_ref(round, 0..3).
 * Clobbers %rsi, %rdi and the flags.
 */
#ifdef	LAST_ROUND_TABLES

/*
 * fl_col: one state word through the last-round table. Same access pattern
 * as a normal round: tab_0..tab_3 lookups for bytes 0..3 go to d0..d3.
 */
#define	fl_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/*
 * Table-driven variant. tptr is advanced by 2048 bytes (one full table) so
 * that the lookups hit the second table emitted after the main one; tptr is
 * left pointing there.
 */
#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	fl_col(%al, %ah, %eax, p1, p4, p3, p2); \
	\
	fl_col(%bl, %bh, %ebx, p2, p1, p4, p3); \
	\
	fl_col(%cl, %ch, %ecx, p3, p2, p1, p4); \
	\
	fl_col(%dl, %dh, %edx, p4, p3, p2, p1)

#else

/*
 * fl_col: one state word, byte by byte, using only the main table.
 * Each byte is looked up with a zero-extending BYTE load through tab_f and
 * rotated into byte position 0, 1, 2 or 3 before being XORed into d0..d3.
 * The loads carry an explicit size suffix (movzbl): with a memory source the
 * assembler has no register to infer the operand size from.
 */
#define	fl_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, d0; \
	rol	$8, %edi; \
	xor	%edi, d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, d2; \
	xor	%edi, d3

/* Variant without the extra table; tptr is left unchanged. */
#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	fl_col(%al, %ah, %eax, p1, p4, p3, p2); \
	\
	fl_col(%bl, %bh, %ebx, p2, p1, p4, p3); \
	\
	fl_col(%cl, %ch, %ecx, p3, p2, p1, p4); \
	\
	fl_col(%dl, %dh, %edx, p4, p3, p2, p1)

#endif	/* LAST_ROUND_TABLES */
508
/*
 * ii_col: push one 32-bit state word through the inverse table.
 *   blo/bhi	low and second byte registers of the word (e.g. %al/%ah)
 *   wreg	the 32-bit register itself; shifted right by 16 in place
 *   d0..d3	32-bit accumulators that receive the tab_0..tab_3 lookups
 *		for bytes 0..3 of the word, respectively
 * Clobbers %rsi, %rdi and the flags.
 */
#define	ii_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/*
 * ii_rnd: one normal inverse round.
 * In/out: state words in %eax, %ebx, %ecx, %edx.
 * p1..p4 are 32-bit scratch registers: they are loaded with the four words
 * of round key ik_ref(round, 0..3), accumulate the table lookups, and are
 * finally copied back to %eax..%edx. Compared with the forward round, the
 * destinations of the per-byte lookups rotate in the opposite direction.
 * Clobbers p1..p4, %rsi, %rdi and the flags.
 */
#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	ii_col(%al, %ah, %eax, p1, p2, p3, p4); \
	\
	ii_col(%bl, %bh, %ebx, p2, p3, p4, p1); \
	\
	ii_col(%cl, %ch, %ecx, p3, p4, p1, p2); \
	\
	ii_col(%dl, %dh, %edx, p4, p1, p2, p3); \
	\
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx
559
/*
 * il_rnd: last inverse round.
 * In:  state words in %eax, %ebx, %ecx, %edx (destroyed: each is shifted
 *      right by 16 while it is consumed).
 * Out: result words in p1..p4 (they are NOT copied back to %eax..%edx).
 * p1..p4 are first loaded with round key ik_ref(round, 0..3).
 * Clobbers %rsi, %rdi and the flags.
 */
#ifdef	LAST_ROUND_TABLES

/*
 * il_col: one state word through the last-round table. Same access pattern
 * as a normal round: tab_0..tab_3 lookups for bytes 0..3 go to d0..d3.
 */
#define	il_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/*
 * Table-driven variant. tptr is advanced by 2048 bytes (one full table) so
 * that the lookups hit the second table emitted after the main one; tptr is
 * left pointing there.
 */
#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	il_col(%al, %ah, %eax, p1, p2, p3, p4); \
	\
	il_col(%bl, %bh, %ebx, p2, p3, p4, p1); \
	\
	il_col(%cl, %ch, %ecx, p3, p4, p1, p2); \
	\
	il_col(%dl, %dh, %edx, p4, p1, p2, p3)

#else

/*
 * il_col: one state word, byte by byte, using only the main table.
 * Each byte is looked up with a zero-extending BYTE load through tab_i and
 * rotated into byte position 0, 1, 2 or 3 before being XORed into d0..d3.
 * The loads carry an explicit size suffix (movzbl): with a memory source the
 * assembler has no register to infer the operand size from.
 */
#define	il_col(blo, bhi, wreg, d0, d1, d2, d3) \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	shr	$16, wreg; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	xor	%esi, d0; \
	rol	$8, %edi; \
	xor	%edi, d1; \
	movzx	blo, %esi; \
	movzx	bhi, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, d2; \
	xor	%edi, d3

/* Variant without the extra table; tptr is left unchanged. */
#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	il_col(%al, %ah, %eax, p1, p2, p3, p4); \
	\
	il_col(%bl, %bh, %ebx, p2, p3, p4, p1); \
	\
	il_col(%cl, %ch, %ecx, p3, p4, p1, p2); \
	\
	il_col(%dl, %dh, %edx, p4, p1, p2, p3)

#endif	/* LAST_ROUND_TABLES */
686
/*
 * OpenSolaris OS:
 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4])/
 *
 * Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1])/
 *
 * Encrypts one 16-byte block.
 *
 * In (OpenSolaris interface):
 *	%rdi = ks, %esi = Nr (10, 12 or 14), %rdx = pt, %rcx = ct
 * Out:
 *	16 bytes stored at ct; %rax = 0.
 *	If Nr is not 10, 12 or 14 nothing is stored and %rax = -1 (the C
 *	prototype is void, so C callers do not see this value).
 * Preserved: %rbx, %rbp, %r12 (saved in a 32-byte stack frame that also
 *	holds the output pointer at (%rsp)).
 * Clobbered: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
 *
 * Key addressing: kptr ends up at ks - fofs + 16*Nr, so fk_ref(round, y)
 * addresses the round key 16*round bytes before the end of the schedule;
 * the rounds below are therefore numbered downwards, ending with round 0.
 */

/*
 * Encryption lookup tables. They are only ever read, so on ELF targets they
 * are placed in the read-only data section. .balign is used because its
 * argument is always a byte count.
 */
#ifdef __ELF__
.section .rodata
#else
.data
#endif
.balign 64
enc_tab:
	enc_vals(u8)
#ifdef LAST_ROUND_TABLES
	// Last Round Tables:
	enc_vals(w8)
#endif


ENTRY_NP(aes_encrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$[4*8], %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	// Explicit byte load: a memory source gives no size to infer
	movzbl	4*KS_LENGTH(kptr), %esi	// Get byte key length * 16

#else
	// OpenSolaris OS interface
	sub	$[4*8], %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: byte key length * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	sub	$fofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// Initial key addition with the first four schedule words
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	// Point kptr at the end of the schedule (see header comment)
	lea	(kptr,%rsi), kptr
	// Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d, 0)

	// Copy results (the last round leaves them in r9d..r12d)
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// rax = 0 (32-bit write clears the upper half)
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_encrypt_amd64)
791
/*
 * OpenSolaris OS:
 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])/
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_decrypt_ctx cx[1])/
 *
 * Decrypts one 16-byte block.
 *
 * In (OpenSolaris interface):
 *	%rdi = ks, %esi = Nr (10, 12 or 14), %rdx = ct, %rcx = pt
 * Out:
 *	16 bytes stored at pt; %rax = 0.
 *	If Nr is not 10, 12 or 14 nothing is stored and %rax = -1 (the C
 *	prototype is void, so C callers do not see this value).
 * Preserved: %rbx, %rbp, %r12 (saved in a 32-byte stack frame that also
 *	holds the output pointer at (%rsp)).
 * Clobbered: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
 *
 * Key addressing depends on AES_REV_DKS: when it is defined the initial key
 * is read from the start of the schedule and kptr is moved to its end, so
 * ik_ref() counts downwards; otherwise kptr stays at the start and the
 * initial key is read from 16*Nr bytes into the schedule.
 */

/*
 * Decryption lookup tables. They are only ever read, so on ELF targets they
 * are placed in the read-only data section. .balign is used because its
 * argument is always a byte count.
 */
#ifdef __ELF__
.section .rodata
#else
.data
#endif
.balign 64
dec_tab:
	dec_vals(v8)
#ifdef LAST_ROUND_TABLES
	// Last Round Tables:
	dec_vals(w8)
#endif


ENTRY_NP(aes_decrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$[4*8], %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	// Explicit byte load: a memory source gives no size to infer
	movzbl	4*KS_LENGTH(kptr), %esi	// Get byte key length * 16

#else
	// OpenSolaris OS interface
	sub	$[4*8], %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: byte key length * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	sub	$rofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// rdi = where the initial round key lives (input pointer is dead now)
#ifdef AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	// Initial key addition
	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	// Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
	il_rnd(%r9d, %r10d, %r11d, %r12d, 0)

	// Copy results (the last round leaves them in r9d..r12d)
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// rax = 0 (32-bit write clears the upper half)
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_decrypt_amd64)
902#endif /* lint || __lint */
903
/*
 * On ELF targets, emit an empty .note.GNU-stack section (no flags), the
 * conventional marker that this object does not need an executable stack.
 */
904#ifdef __ELF__
905.section .note.GNU-stack,"",%progbits
906#endif