]>
Commit | Line | Data |
---|---|---|
86e3c28a CIK |
1 | /* |
2 | * --------------------------------------------------------------------------- | |
3 | * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. | |
4 | * | |
5 | * LICENSE TERMS | |
6 | * | |
7 | * The free distribution and use of this software is allowed (with or without | |
8 | * changes) provided that: | |
9 | * | |
10 | * 1. source code distributions include the above copyright notice, this | |
11 | * list of conditions and the following disclaimer; | |
12 | * | |
13 | * 2. binary distributions include the above copyright notice, this list | |
14 | * of conditions and the following disclaimer in their documentation; | |
15 | * | |
16 | * 3. the name of the copyright holder is not used to endorse products | |
17 | * built using this software without specific written permission. | |
18 | * | |
19 | * DISCLAIMER | |
20 | * | |
21 | * This software is provided 'as is' with no explicit or implied warranties | |
22 | * in respect of its properties, including, but not limited to, correctness | |
23 | * and/or fitness for purpose. | |
24 | * --------------------------------------------------------------------------- | |
25 | * Issue 20/12/2007 | |
26 | * | |
27 | * I am grateful to Dag Arne Osvik for many discussions of the techniques that | |
28 | * can be used to optimise AES assembler code on AMD64/EM64T architectures. | |
29 | * Some of the techniques used in this implementation are the result of | |
30 | * suggestions made by him for which I am most grateful. | |
31 | * | |
32 | * An AES implementation for AMD64 processors using the YASM assembler. This | |
33 | * implementation provides only encryption, decryption and hence requires key | |
34 | * scheduling support in C. It uses 8k bytes of tables but its encryption and | |
35 | * decryption performance is very close to that obtained using large tables. | |
36 | * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, | |
37 | * which are as follows: | |
38 | * ms windows gnu/linux/opensolaris os | |
39 | * | |
40 | * in_blk rcx rdi | |
41 | * out_blk rdx rsi | |
42 | * context (cx) r8 rdx | |
43 | * | |
44 | * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 | |
45 | * registers rdi - on both | |
46 | * | |
47 | * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 | |
48 | * registers - rdi on both | |
49 | * | |
50 | * The convention used here is that for gnu/linux/opensolaris os. | |
51 | * | |
52 | * This code provides the standard AES block size (128 bits, 16 bytes) and the | |
53 | * three standard AES key sizes (128, 192 and 256 bits). It has the same call | |
54 | * interface as my C implementation. It uses the Microsoft C AMD64 calling | |
55 | * conventions in which the three parameters are placed in rcx, rdx and r8 | |
56 | * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. | |
57 | * | |
58 | * OpenSolaris Note: | |
59 | * Modified to use GNU/Linux/Solaris calling conventions. | |
60 | * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. | |
61 | * | |
62 | * AES_RETURN aes_encrypt(const unsigned char in_blk[], | |
63 | * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ | |
64 | * | |
65 | * AES_RETURN aes_decrypt(const unsigned char in_blk[], | |
66 | * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ | |
67 | * | |
68 | * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[], | |
69 | * const aes_encrypt_ctx cx[1])/ | |
70 | * | |
71 | * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[], | |
72 | * const aes_decrypt_ctx cx[1])/ | |
73 | * | |
74 | * AES_RETURN aes_encrypt_key(const unsigned char key[], | |
75 | * unsigned int len, const aes_decrypt_ctx cx[1])/ | |
76 | * | |
77 | * AES_RETURN aes_decrypt_key(const unsigned char key[], | |
78 | * unsigned int len, const aes_decrypt_ctx cx[1])/ | |
79 | * | |
80 | * where <NNN> is 128, 192 or 256. In the last two calls the length can be in |
81 | * either bits or bytes. | |
82 | * | |
83 | * Comment in/out the following lines to obtain the desired subroutines. These | |
84 | * selections MUST match those in the C header file aesopt.h | |
85 | */ | |
86 | #define AES_REV_DKS /* define if key decryption schedule is reversed (selects the rofs/ik_ref addressing used by the decrypt path) */ |
87 | ||
88 | #define LAST_ROUND_TABLES /* define for the faster version using extra tables (the w8 tables emitted after enc_tab/dec_tab) */ |
89 | ||
90 | /* | |
91 | * The encryption key schedule has the following in memory layout where N is the | |
92 | * number of rounds (10, 12 or 14): | |
93 | * | |
94 | * lo: | input key (round 0) | / each round is four 32-bit words | |
95 | * | encryption round 1 | | |
96 | * | encryption round 2 | | |
97 | * .... | |
98 | * | encryption round N-1 | | |
99 | * hi: | encryption round N | | |
100 | * | |
101 | * The decryption key schedule is normally set up so that it has the same | |
102 | * layout as above by actually reversing the order of the encryption key | |
103 | * schedule in memory (this happens when AES_REV_DKS is set): | |
104 | * | |
105 | * lo: | decryption round 0 | = | encryption round N | | |
106 | * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] | |
107 | * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] | |
108 | * .... .... | |
109 | * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] | |
110 | * hi: | decryption round N | = | input key (round 0) | | |
111 | * | |
112 | * with rounds except the first and last modified using inv_mix_column() | |
113 | * But if AES_REV_DKS is NOT set the order of keys is left as it is for | |
114 | * encryption so that it has to be accessed in reverse when used for | |
115 | * decryption (although the inverse mix column modifications are done) | |
116 | * | |
117 | * lo: | decryption round 0 | = | input key (round 0) | | |
118 | * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] | |
119 | * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] | |
120 | * .... .... | |
121 | * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] | |
122 | * hi: | decryption round N | = | encryption round N | | |
123 | * | |
124 | * This layout is faster when the assembler key scheduling provided here | |
125 | * is used. | |
126 | * | |
127 | * End of user defines | |
128 | */ | |
129 | ||
130 | /* | |
131 | * --------------------------------------------------------------------------- | |
132 | * OpenSolaris OS modifications | |
133 | * | |
134 | * This source originates from Brian Gladman file aes_amd64.asm | |
135 | * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip | |
136 | * with these changes: | |
137 | * | |
138 | * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and | |
139 | * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, | |
140 | * AES_128, AES_192, AES_256, AES_VAR ifdefs. | |
141 | * | |
142 | * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define | |
143 | * | |
144 | * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef | |
145 | * | |
146 | * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax | |
147 | * (operands reversed, literals prefixed with "$", registers prefixed with "%", | |
148 | * and "[register+offset]", addressing changed to "offset(register)", | |
149 | * parenthesis in constant expressions "()" changed to square brackets "[]", | |
150 | * "." removed from local (numeric) labels, and other changes. | |
151 | * Examples: | |
152 | * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax | |
153 | * mov rax,(4*20h) mov $[4*0x20],%rax | |
154 | * mov rax,[ebx+20h] mov 0x20(%ebx),%rax | |
155 | * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax | |
156 | * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax | |
157 | * | |
158 | * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from | |
159 | * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function | |
160 | * definitions for lint. | |
161 | * | |
162 | * 6. Renamed functions and reordered parameters to match OpenSolaris: | |
163 | * Original Gladman interface: | |
164 | * int aes_encrypt(const unsigned char *in, | |
165 | * unsigned char *out, const aes_encrypt_ctx cx[1])/ | |
166 | * int aes_decrypt(const unsigned char *in, | |
167 | * unsigned char *out, const aes_encrypt_ctx cx[1])/ | |
168 | * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, | |
169 | * and a union type, inf., containing inf.l, a uint32_t and | |
170 | * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is | |
171 | * used and contains the number of rounds * 16, where the number of rounds is |
172 | * 10, 12, or 14. |
173 | * | |
174 | * OpenSolaris OS interface: | |
175 | * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, | |
176 | * const uint32_t pt[4], uint32_t ct[4])/ | |
177 | * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, | |
178 | * const uint32_t ct[4], uint32_t pt[4])/ |
179 | * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ | |
180 | * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ | |
181 | * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, | |
182 | * ct is crypto text, and MAX_AES_NR is 14. | |
183 | * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. | |
184 | */ | |
185 | ||
186 | #if defined(lint) || defined(__lint) | |
187 | ||
188 | #include <sys/types.h> | |
189 | /* ARGSUSED */ |
190 | void |
191 | aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], |
192 | uint32_t ct[4]) { /* empty stub: only seen when lint or __lint is defined */ |
193 | } |
194 | /* ARGSUSED */ |
195 | void |
196 | aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], |
197 | uint32_t pt[4]) { /* empty stub: only seen when lint or __lint is defined */ |
198 | } |
199 | ||
200 | ||
201 | #else | |
202 | ||
203 | #define _ASM |
204 | #include <sys/asm_linkage.h> /* source of the ENTRY_NP and SET_SIZE macros used below */ |
205 | ||
206 | #define KS_LENGTH 60 /* key schedule length in 32-bit words; 4*KS_LENGTH is the byte offset read in the GLADMAN_INTERFACE path */ |
207 | ||
208 | #define raxd eax /* raxd..rdid: 32-bit register names; not referenced elsewhere in the visible source */ |
209 | #define rdxd edx |
210 | #define rcxd ecx |
211 | #define rbxd ebx |
212 | #define rsid esi |
213 | #define rdid edi |
214 | ||
215 | #define raxb al /* raxb..rdib: low-byte register names; not referenced elsewhere in the visible source */ |
216 | #define rdxb dl |
217 | #define rcxb cl |
218 | #define rbxb bl |
219 | #define rsib sil |
220 | #define rdib dil |
221 | ||
222 | // finite field multiplies by {02}, {04} and {08}: shift left, then XOR in 0x11b times each bit shifted out past bit 7 ([] act as parentheses) |
223 | ||
224 | #define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] /* x * {02} */ |
225 | #define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] /* x * {04} */ |
226 | #define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] /* x * {08} */ |
227 | ||
228 | // finite field multiplies required in table generation (XOR combinations of f8, f4, f2 and x) |
229 | ||
230 | #define f3(x) [[f2(x)] ^ [x]] /* x * {03} */ |
231 | #define f9(x) [[f8(x)] ^ [x]] /* x * {09} */ |
232 | #define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] /* x * {0b} */ |
233 | #define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] /* x * {0d} */ |
234 | #define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] /* x * {0e} */ |
235 | ||
236 | // macros for expanding S-box data: each turns one S-box byte into the 8 bytes of one table entry |
237 | ||
238 | #define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] /* enc_tab entry: the 4-byte pattern repeated twice; byte 1 is x itself (read via tab_f) */ |
239 | #define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] /* dec_tab entry: byte 7 is x itself (read via tab_i) */ |
240 | #define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 /* last-round table entry: x in bytes 0 and 4, zero elsewhere */ |
241 | ||
242 | #define enc_vals(x) /* forward S-box values, 256 entries in index order, each passed through macro x (u8 or w8) */ \ |
243 | .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ |
244 | .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ |
245 | .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ |
246 | .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ |
247 | .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ |
248 | .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ |
249 | .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ |
250 | .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ |
251 | .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ |
252 | .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ |
253 | .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ |
254 | .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ |
255 | .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ |
256 | .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ |
257 | .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ |
258 | .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ |
259 | .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ |
260 | .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ |
261 | .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ |
262 | .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ |
263 | .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ |
264 | .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ |
265 | .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ |
266 | .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ |
267 | .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ |
268 | .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ |
269 | .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ |
270 | .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ |
271 | .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ |
272 | .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ |
273 | .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ |
274 | .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) |
275 | ||
276 | #define dec_vals(x) /* inverse S-box values, 256 entries in index order, each passed through macro x (v8 or w8) */ \ |
277 | .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ |
278 | .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ |
279 | .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ |
280 | .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ |
281 | .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ |
282 | .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ |
283 | .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ |
284 | .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ |
285 | .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ |
286 | .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ |
287 | .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ |
288 | .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ |
289 | .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ |
290 | .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ |
291 | .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ |
292 | .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ |
293 | .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ |
294 | .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ |
295 | .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ |
296 | .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ |
297 | .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ |
298 | .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ |
299 | .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ |
300 | .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ |
301 | .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ |
302 | .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ |
303 | .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ |
304 | .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ |
305 | .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ |
306 | .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ |
307 | .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ |
308 | .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) |
309 | ||
310 | #define tptr %rbp /* table pointer */ |
311 | #define kptr %r8 /* key schedule pointer */ |
312 | #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ |
313 | #define fk_ref(x, y) -16*x+fofs+4*y(kptr) /* word y of the round key 16*x bytes below kptr+fofs (16 bytes per round, 4 per word) */ |
314 | ||
315 | #ifdef AES_REV_DKS |
316 | #define rofs 128 /* reversed decryption schedule: same bias and direction as fofs/fk_ref */ |
317 | #define ik_ref(x, y) -16*x+rofs+4*y(kptr) /* word y of the round key 16*x bytes below kptr+rofs */ |
318 | ||
319 | #else |
320 | #define rofs -128 /* schedule kept in encryption order: negative bias, keys addressed upwards */ |
321 | #define ik_ref(x, y) 16*x+rofs+4*y(kptr) /* word y of the round key 16*x bytes above kptr+rofs */ |
322 | #endif /* AES_REV_DKS */ |
323 | ||
324 | #define tab_0(x) (tptr,x,8) /* table entries are 8 bytes wide, indexed by x; operand at entry byte 0 */ |
325 | #define tab_1(x) 3(tptr,x,8) /* operand at entry byte 3 */ |
326 | #define tab_2(x) 2(tptr,x,8) /* operand at entry byte 2 */ |
327 | #define tab_3(x) 1(tptr,x,8) /* operand at entry byte 1 */ |
328 | #define tab_f(x) 1(tptr,x,8) /* entry byte 1; loaded as a single byte by the non-table fl_rnd */ |
329 | #define tab_i(x) 7(tptr,x,8) /* entry byte 7; loaded as a single byte by the non-table il_rnd */ |
330 | ||
331 | #define ff_rnd(p1, p2, p3, p4, round) /* normal forward round: state in eax,ebx,ecx,edx; result built in p1..p4 and copied back; uses rsi,rdi as index scratch */ \ |
332 | mov fk_ref(round,0), p1; /* start each output word from its round-key word */ \ |
333 | mov fk_ref(round,1), p2; \ |
334 | mov fk_ref(round,2), p3; \ |
335 | mov fk_ref(round,3), p4; \ |
336 | /* bytes 0..3 of eax feed p1, p4, p3, p2 */ \ |
337 | movzx %al, %esi; /* 32-bit destination also clears the upper half of rsi, so rsi is a valid index */ \ |
338 | movzx %ah, %edi; \ |
339 | shr $16, %eax; /* expose bytes 2 and 3 as al/ah; eax is consumed */ \ |
340 | xor tab_0(%rsi), p1; \ |
341 | xor tab_1(%rdi), p4; \ |
342 | movzx %al, %esi; \ |
343 | movzx %ah, %edi; \ |
344 | xor tab_2(%rsi), p3; \ |
345 | xor tab_3(%rdi), p2; \ |
346 | /* bytes 0..3 of ebx feed p2, p1, p4, p3 */ \ |
347 | movzx %bl, %esi; \ |
348 | movzx %bh, %edi; \ |
349 | shr $16, %ebx; \ |
350 | xor tab_0(%rsi), p2; \ |
351 | xor tab_1(%rdi), p1; \ |
352 | movzx %bl, %esi; \ |
353 | movzx %bh, %edi; \ |
354 | xor tab_2(%rsi), p4; \ |
355 | xor tab_3(%rdi), p3; \ |
356 | /* bytes 0..3 of ecx feed p3, p2, p1, p4 */ \ |
357 | movzx %cl, %esi; \ |
358 | movzx %ch, %edi; \ |
359 | shr $16, %ecx; \ |
360 | xor tab_0(%rsi), p3; \ |
361 | xor tab_1(%rdi), p2; \ |
362 | movzx %cl, %esi; \ |
363 | movzx %ch, %edi; \ |
364 | xor tab_2(%rsi), p1; \ |
365 | xor tab_3(%rdi), p4; \ |
366 | /* bytes 0..3 of edx feed p4, p3, p2, p1 */ \ |
367 | movzx %dl, %esi; \ |
368 | movzx %dh, %edi; \ |
369 | shr $16, %edx; \ |
370 | xor tab_0(%rsi), p4; \ |
371 | xor tab_1(%rdi), p3; \ |
372 | movzx %dl, %esi; \ |
373 | movzx %dh, %edi; \ |
374 | xor tab_2(%rsi), p2; \ |
375 | xor tab_3(%rdi), p1; \ |
376 | /* new state back into eax..edx for the next round */ \ |
377 | mov p1, %eax; \ |
378 | mov p2, %ebx; \ |
379 | mov p3, %ecx; \ |
380 | mov p4, %edx |
381 | ||
382 | #ifdef LAST_ROUND_TABLES | |
383 | ||
384 | #define fl_rnd(p1, p2, p3, p4, round) /* last forward round (table variant): state in eax..edx; result left in p1..p4, not copied back */ \ |
385 | add $2048, tptr; /* skip the first table (256 entries x 8 bytes) to reach the last-round table; tptr is not restored */ \ |
386 | mov fk_ref(round,0), p1; /* start each output word from its round-key word */ \ |
387 | mov fk_ref(round,1), p2; \ |
388 | mov fk_ref(round,2), p3; \ |
389 | mov fk_ref(round,3), p4; \ |
390 | /* bytes 0..3 of eax feed p1, p4, p3, p2 */ \ |
391 | movzx %al, %esi; \ |
392 | movzx %ah, %edi; \ |
393 | shr $16, %eax; \ |
394 | xor tab_0(%rsi), p1; \ |
395 | xor tab_1(%rdi), p4; \ |
396 | movzx %al, %esi; \ |
397 | movzx %ah, %edi; \ |
398 | xor tab_2(%rsi), p3; \ |
399 | xor tab_3(%rdi), p2; \ |
400 | /* bytes 0..3 of ebx feed p2, p1, p4, p3 */ \ |
401 | movzx %bl, %esi; \ |
402 | movzx %bh, %edi; \ |
403 | shr $16, %ebx; \ |
404 | xor tab_0(%rsi), p2; \ |
405 | xor tab_1(%rdi), p1; \ |
406 | movzx %bl, %esi; \ |
407 | movzx %bh, %edi; \ |
408 | xor tab_2(%rsi), p4; \ |
409 | xor tab_3(%rdi), p3; \ |
410 | /* bytes 0..3 of ecx feed p3, p2, p1, p4 */ \ |
411 | movzx %cl, %esi; \ |
412 | movzx %ch, %edi; \ |
413 | shr $16, %ecx; \ |
414 | xor tab_0(%rsi), p3; \ |
415 | xor tab_1(%rdi), p2; \ |
416 | movzx %cl, %esi; \ |
417 | movzx %ch, %edi; \ |
418 | xor tab_2(%rsi), p1; \ |
419 | xor tab_3(%rdi), p4; \ |
420 | /* bytes 0..3 of edx feed p4, p3, p2, p1 */ \ |
421 | movzx %dl, %esi; \ |
422 | movzx %dh, %edi; \ |
423 | shr $16, %edx; \ |
424 | xor tab_0(%rsi), p4; \ |
425 | xor tab_1(%rdi), p3; \ |
426 | movzx %dl, %esi; \ |
427 | movzx %dh, %edi; \ |
428 | xor tab_2(%rsi), p2; \ |
429 | xor tab_3(%rdi), p1 |
430 | ||
431 | #else | |
432 | ||
433 | #define fl_rnd(p1, p2, p3, p4, round) /* last forward round (no extra tables): state in eax..edx; result left in p1..p4; each table byte is loaded with movzbl and rotated into place */ \ |
434 | mov fk_ref(round,0), p1; /* start each output word from its round-key word */ \ |
435 | mov fk_ref(round,1), p2; \ |
436 | mov fk_ref(round,2), p3; \ |
437 | mov fk_ref(round,3), p4; \ |
438 | /* bytes 0..3 of eax feed p1, p4, p3, p2 (rotated by 0, 8, 16, 24 bits) */ \ |
439 | movzx %al, %esi; \ |
440 | movzx %ah, %edi; \ |
441 | shr $16, %eax; \ |
442 | movzbl tab_f(%rsi), %esi; /* explicit byte source: an unsuffixed movzx with a memory operand does not state its width */ \ |
443 | movzbl tab_f(%rdi), %edi; \ |
444 | xor %esi, p1; \ |
445 | rol $8, %edi; \ |
446 | xor %edi, p4; \ |
447 | movzx %al, %esi; \ |
448 | movzx %ah, %edi; \ |
449 | movzbl tab_f(%rsi), %esi; \ |
450 | movzbl tab_f(%rdi), %edi; \ |
451 | rol $16, %esi; \ |
452 | rol $24, %edi; \ |
453 | xor %esi, p3; \ |
454 | xor %edi, p2; \ |
455 | /* bytes 0..3 of ebx feed p2, p1, p4, p3 */ \ |
456 | movzx %bl, %esi; \ |
457 | movzx %bh, %edi; \ |
458 | shr $16, %ebx; \ |
459 | movzbl tab_f(%rsi), %esi; \ |
460 | movzbl tab_f(%rdi), %edi; \ |
461 | xor %esi, p2; \ |
462 | rol $8, %edi; \ |
463 | xor %edi, p1; \ |
464 | movzx %bl, %esi; \ |
465 | movzx %bh, %edi; \ |
466 | movzbl tab_f(%rsi), %esi; \ |
467 | movzbl tab_f(%rdi), %edi; \ |
468 | rol $16, %esi; \ |
469 | rol $24, %edi; \ |
470 | xor %esi, p4; \ |
471 | xor %edi, p3; \ |
472 | /* bytes 0..3 of ecx feed p3, p2, p1, p4 */ \ |
473 | movzx %cl, %esi; \ |
474 | movzx %ch, %edi; \ |
475 | movzbl tab_f(%rsi), %esi; \ |
476 | movzbl tab_f(%rdi), %edi; \ |
477 | shr $16, %ecx; \ |
478 | xor %esi, p3; \ |
479 | rol $8, %edi; \ |
480 | xor %edi, p2; \ |
481 | movzx %cl, %esi; \ |
482 | movzx %ch, %edi; \ |
483 | movzbl tab_f(%rsi), %esi; \ |
484 | movzbl tab_f(%rdi), %edi; \ |
485 | rol $16, %esi; \ |
486 | rol $24, %edi; \ |
487 | xor %esi, p1; \ |
488 | xor %edi, p4; \ |
489 | /* bytes 0..3 of edx feed p4, p3, p2, p1 */ \ |
490 | movzx %dl, %esi; \ |
491 | movzx %dh, %edi; \ |
492 | movzbl tab_f(%rsi), %esi; \ |
493 | movzbl tab_f(%rdi), %edi; \ |
494 | shr $16, %edx; \ |
495 | xor %esi, p4; \ |
496 | rol $8, %edi; \ |
497 | xor %edi, p3; \ |
498 | movzx %dl, %esi; \ |
499 | movzx %dh, %edi; \ |
500 | movzbl tab_f(%rsi), %esi; \ |
501 | movzbl tab_f(%rdi), %edi; \ |
502 | rol $16, %esi; \ |
503 | rol $24, %edi; \ |
504 | xor %esi, p2; \ |
505 | xor %edi, p1 |
506 | ||
507 | #endif /* LAST_ROUND_TABLES */ | |
508 | ||
509 | #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round: state in eax,ebx,ecx,edx; result built in p1..p4 and copied back; uses rsi,rdi as index scratch */ \ |
510 | mov ik_ref(round,0), p1; /* start each output word from its round-key word */ \ |
511 | mov ik_ref(round,1), p2; \ |
512 | mov ik_ref(round,2), p3; \ |
513 | mov ik_ref(round,3), p4; \ |
514 | /* bytes 0..3 of eax feed p1, p2, p3, p4 */ \ |
515 | movzx %al, %esi; /* 32-bit destination also clears the upper half of rsi, so rsi is a valid index */ \ |
516 | movzx %ah, %edi; \ |
517 | shr $16, %eax; /* expose bytes 2 and 3 as al/ah; eax is consumed */ \ |
518 | xor tab_0(%rsi), p1; \ |
519 | xor tab_1(%rdi), p2; \ |
520 | movzx %al, %esi; \ |
521 | movzx %ah, %edi; \ |
522 | xor tab_2(%rsi), p3; \ |
523 | xor tab_3(%rdi), p4; \ |
524 | /* bytes 0..3 of ebx feed p2, p3, p4, p1 */ \ |
525 | movzx %bl, %esi; \ |
526 | movzx %bh, %edi; \ |
527 | shr $16, %ebx; \ |
528 | xor tab_0(%rsi), p2; \ |
529 | xor tab_1(%rdi), p3; \ |
530 | movzx %bl, %esi; \ |
531 | movzx %bh, %edi; \ |
532 | xor tab_2(%rsi), p4; \ |
533 | xor tab_3(%rdi), p1; \ |
534 | /* bytes 0..3 of ecx feed p3, p4, p1, p2 */ \ |
535 | movzx %cl, %esi; \ |
536 | movzx %ch, %edi; \ |
537 | shr $16, %ecx; \ |
538 | xor tab_0(%rsi), p3; \ |
539 | xor tab_1(%rdi), p4; \ |
540 | movzx %cl, %esi; \ |
541 | movzx %ch, %edi; \ |
542 | xor tab_2(%rsi), p1; \ |
543 | xor tab_3(%rdi), p2; \ |
544 | /* bytes 0..3 of edx feed p4, p1, p2, p3 */ \ |
545 | movzx %dl, %esi; \ |
546 | movzx %dh, %edi; \ |
547 | shr $16, %edx; \ |
548 | xor tab_0(%rsi), p4; \ |
549 | xor tab_1(%rdi), p1; \ |
550 | movzx %dl, %esi; \ |
551 | movzx %dh, %edi; \ |
552 | xor tab_2(%rsi), p2; \ |
553 | xor tab_3(%rdi), p3; \ |
554 | /* new state back into eax..edx for the next round */ \ |
555 | mov p1, %eax; \ |
556 | mov p2, %ebx; \ |
557 | mov p3, %ecx; \ |
558 | mov p4, %edx |
559 | ||
560 | #ifdef LAST_ROUND_TABLES | |
561 | ||
562 | #define il_rnd(p1, p2, p3, p4, round) /* last inverse round (table variant): state in eax..edx; result left in p1..p4, not copied back */ \ |
563 | add $2048, tptr; /* skip the first table (256 entries x 8 bytes) to reach the last-round table; tptr is not restored */ \ |
564 | mov ik_ref(round,0), p1; /* start each output word from its round-key word */ \ |
565 | mov ik_ref(round,1), p2; \ |
566 | mov ik_ref(round,2), p3; \ |
567 | mov ik_ref(round,3), p4; \ |
568 | /* bytes 0..3 of eax feed p1, p2, p3, p4 */ \ |
569 | movzx %al, %esi; \ |
570 | movzx %ah, %edi; \ |
571 | shr $16, %eax; \ |
572 | xor tab_0(%rsi), p1; \ |
573 | xor tab_1(%rdi), p2; \ |
574 | movzx %al, %esi; \ |
575 | movzx %ah, %edi; \ |
576 | xor tab_2(%rsi), p3; \ |
577 | xor tab_3(%rdi), p4; \ |
578 | /* bytes 0..3 of ebx feed p2, p3, p4, p1 */ \ |
579 | movzx %bl, %esi; \ |
580 | movzx %bh, %edi; \ |
581 | shr $16, %ebx; \ |
582 | xor tab_0(%rsi), p2; \ |
583 | xor tab_1(%rdi), p3; \ |
584 | movzx %bl, %esi; \ |
585 | movzx %bh, %edi; \ |
586 | xor tab_2(%rsi), p4; \ |
587 | xor tab_3(%rdi), p1; \ |
588 | /* bytes 0..3 of ecx feed p3, p4, p1, p2 */ \ |
589 | movzx %cl, %esi; \ |
590 | movzx %ch, %edi; \ |
591 | shr $16, %ecx; \ |
592 | xor tab_0(%rsi), p3; \ |
593 | xor tab_1(%rdi), p4; \ |
594 | movzx %cl, %esi; \ |
595 | movzx %ch, %edi; \ |
596 | xor tab_2(%rsi), p1; \ |
597 | xor tab_3(%rdi), p2; \ |
598 | /* bytes 0..3 of edx feed p4, p1, p2, p3 */ \ |
599 | movzx %dl, %esi; \ |
600 | movzx %dh, %edi; \ |
601 | shr $16, %edx; \ |
602 | xor tab_0(%rsi), p4; \ |
603 | xor tab_1(%rdi), p1; \ |
604 | movzx %dl, %esi; \ |
605 | movzx %dh, %edi; \ |
606 | xor tab_2(%rsi), p2; \ |
607 | xor tab_3(%rdi), p3 |
608 | ||
609 | #else | |
610 | ||
611 | #define il_rnd(p1, p2, p3, p4, round) /* last inverse round (no extra tables): state in eax..edx; result left in p1..p4; each table byte is loaded with movzbl and rotated into place */ \ |
612 | mov ik_ref(round,0), p1; /* start each output word from its round-key word */ \ |
613 | mov ik_ref(round,1), p2; \ |
614 | mov ik_ref(round,2), p3; \ |
615 | mov ik_ref(round,3), p4; \ |
616 | /* bytes 0..3 of eax feed p1, p2, p3, p4 (rotated by 0, 8, 16, 24 bits) */ \ |
617 | movzx %al, %esi; \ |
618 | movzx %ah, %edi; \ |
619 | movzbl tab_i(%rsi), %esi; /* explicit byte source: an unsuffixed movzx with a memory operand does not state its width */ \ |
620 | movzbl tab_i(%rdi), %edi; \ |
621 | shr $16, %eax; \ |
622 | xor %esi, p1; \ |
623 | rol $8, %edi; \ |
624 | xor %edi, p2; \ |
625 | movzx %al, %esi; \ |
626 | movzx %ah, %edi; \ |
627 | movzbl tab_i(%rsi), %esi; \ |
628 | movzbl tab_i(%rdi), %edi; \ |
629 | rol $16, %esi; \ |
630 | rol $24, %edi; \ |
631 | xor %esi, p3; \ |
632 | xor %edi, p4; \ |
633 | /* bytes 0..3 of ebx feed p2, p3, p4, p1 */ \ |
634 | movzx %bl, %esi; \ |
635 | movzx %bh, %edi; \ |
636 | movzbl tab_i(%rsi), %esi; \ |
637 | movzbl tab_i(%rdi), %edi; \ |
638 | shr $16, %ebx; \ |
639 | xor %esi, p2; \ |
640 | rol $8, %edi; \ |
641 | xor %edi, p3; \ |
642 | movzx %bl, %esi; \ |
643 | movzx %bh, %edi; \ |
644 | movzbl tab_i(%rsi), %esi; \ |
645 | movzbl tab_i(%rdi), %edi; \ |
646 | rol $16, %esi; \ |
647 | rol $24, %edi; \ |
648 | xor %esi, p4; \ |
649 | xor %edi, p1; \ |
650 | /* bytes 0..3 of ecx feed p3, p4, p1, p2 */ \ |
651 | movzx %cl, %esi; \ |
652 | movzx %ch, %edi; \ |
653 | movzbl tab_i(%rsi), %esi; \ |
654 | movzbl tab_i(%rdi), %edi; \ |
655 | shr $16, %ecx; \ |
656 | xor %esi, p3; \ |
657 | rol $8, %edi; \ |
658 | xor %edi, p4; \ |
659 | movzx %cl, %esi; \ |
660 | movzx %ch, %edi; \ |
661 | movzbl tab_i(%rsi), %esi; \ |
662 | movzbl tab_i(%rdi), %edi; \ |
663 | rol $16, %esi; \ |
664 | rol $24, %edi; \ |
665 | xor %esi, p1; \ |
666 | xor %edi, p2; \ |
667 | /* bytes 0..3 of edx feed p4, p1, p2, p3 */ \ |
668 | movzx %dl, %esi; \ |
669 | movzx %dh, %edi; \ |
670 | movzbl tab_i(%rsi), %esi; \ |
671 | movzbl tab_i(%rdi), %edi; \ |
672 | shr $16, %edx; \ |
673 | xor %esi, p4; \ |
674 | rol $8, %edi; \ |
675 | xor %edi, p1; \ |
676 | movzx %dl, %esi; \ |
677 | movzx %dh, %edi; \ |
678 | movzbl tab_i(%rsi), %esi; \ |
679 | movzbl tab_i(%rdi), %edi; \ |
680 | rol $16, %esi; \ |
681 | rol $24, %edi; \ |
682 | xor %esi, p2; \ |
683 | xor %edi, p3 |
684 | ||
685 | #endif /* LAST_ROUND_TABLES */ | |
686 | ||
687 | /* | |
688 | * OpenSolaris OS: | |
689 | * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, | |
690 | * const uint32_t pt[4], uint32_t ct[4])/ | |
691 | * | |
692 | * Original interface: | |
693 | * int aes_encrypt(const unsigned char *in, | |
694 | * unsigned char *out, const aes_encrypt_ctx cx[1])/ | |
695 | */ | |
696 | .data |
697 | .align 64 |
698 | enc_tab: // forward lookup table: 256 entries of 8 bytes (2048 bytes) |
699 | enc_vals(u8) |
700 | #ifdef LAST_ROUND_TABLES |
701 | // Last Round Tables: second 2048-byte table, reached by fl_rnd via add $2048, tptr |
702 | enc_vals(w8) |
703 | #endif |
704 | ||
705 | ||
706 | ENTRY_NP(aes_encrypt_amd64) |
707 | #ifdef GLADMAN_INTERFACE |
708 | // Original interface: rdi = in, rsi = out, rdx = context |
709 | sub $[4*8], %rsp // gnu/linux/opensolaris binary interface |
710 | mov %rsi, (%rsp) // output pointer (P2) |
711 | mov %rdx, %r8 // context (P3) |
712 | ||
713 | mov %rbx, 1*8(%rsp) // P1: input pointer in rdi |
714 | mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) |
715 | mov %r12, 3*8(%rsp) // P3: context in r8 |
716 | movzbl 4*KS_LENGTH(kptr), %esi // esi = number of rounds * 16, one byte read at offset 4*KS_LENGTH of the context (explicit byte load) |
717 | ||
718 | #else |
719 | // OpenSolaris OS interface: rdi = ks, esi = Nr, rdx = input block, rcx = output block |
720 | sub $[4*8], %rsp // Make room on stack to save registers |
721 | mov %rcx, (%rsp) // Save output pointer (P4) on stack |
722 | mov %rdi, %r8 // context (P1) |
723 | mov %rdx, %rdi // P3: save input pointer |
724 | shl $4, %esi // P2: esi = number of rounds (Nr) * 16 |
725 | ||
726 | mov %rbx, 1*8(%rsp) // Save registers |
727 | mov %rbp, 2*8(%rsp) |
728 | mov %r12, 3*8(%rsp) |
729 | // P1: context in r8 |
730 | // P2: number of rounds * 16 in esi |
731 | // P3: input pointer in rdi |
732 | // P4: output pointer in (rsp) |
733 | #endif /* GLADMAN_INTERFACE */ |
734 | ||
735 | lea enc_tab(%rip), tptr // tptr = base of the forward table |
736 | sub $fofs, kptr // bias kptr so that fofs(kptr) is the first round key |
737 | ||
738 | // Load input block into registers |
739 | mov (%rdi), %eax |
740 | mov 1*4(%rdi), %ebx |
741 | mov 2*4(%rdi), %ecx |
742 | mov 3*4(%rdi), %edx |
743 | ||
744 | xor fofs(kptr), %eax // XOR the state with the first round key (round 0) |
745 | xor fofs+4(kptr), %ebx |
746 | xor fofs+8(kptr), %ecx |
747 | xor fofs+12(kptr), %edx |
748 | ||
749 | lea (kptr,%rsi), kptr // kptr += rounds * 16, so fk_ref(r, y) addresses the key r rounds before the last one |
750 | // Jump based on number of rounds * 16: 10 rounds -> 3f, 12 -> 2f, 14 -> 1f |
751 | cmp $[10*16], %esi |
752 | je 3f |
753 | cmp $[12*16], %esi |
754 | je 2f |
755 | cmp $[14*16], %esi |
756 | je 1f |
757 | mov $-1, %rax // error: unsupported round count; rax = -1 and the output block is not written |
758 | jmp 4f |
759 | ||
760 | // Perform normal forward rounds (entry label chosen above skips the rounds a shorter key does not need) |
761 | 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) |
762 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) |
763 | 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) |
764 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) |
765 | 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) |
766 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) |
767 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) |
768 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) |
769 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) |
770 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) |
771 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) |
772 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) |
773 | ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) |
774 | fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) |
775 | ||
776 | // Copy results from r9d..r12d to the output pointer saved at (%rsp) |
777 | mov (%rsp), %rbx |
778 | mov %r9d, (%rbx) |
779 | mov %r10d, 4(%rbx) |
780 | mov %r11d, 8(%rbx) |
781 | mov %r12d, 12(%rbx) |
782 | xor %rax, %rax // rax = 0 on the normal path |
783 | 4: // Restore registers (rbx, rbp and r12 were saved in stack slots 1..3) |
784 | mov 1*8(%rsp), %rbx |
785 | mov 2*8(%rsp), %rbp |
786 | mov 3*8(%rsp), %r12 |
787 | add $[4*8], %rsp |
788 | ret |
789 | ||
790 | SET_SIZE(aes_encrypt_amd64) |
791 | ||
792 | /* | |
793 | * OpenSolaris OS: | |
794 | * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, | |
795 | * const uint32_t ct[4], uint32_t pt[4])/ |
796 | * | |
797 | * Original interface: | |
798 | * int aes_decrypt(const unsigned char *in, | |
799 | * unsigned char *out, const aes_encrypt_ctx cx[1])/ | |
800 | */ | |
801 | .data |
802 | .align 64 |
803 | dec_tab: // inverse lookup table: 256 entries of 8 bytes (2048 bytes) |
804 | dec_vals(v8) |
805 | #ifdef LAST_ROUND_TABLES |
806 | // Last Round Tables: second 2048-byte table, reached by il_rnd via add $2048, tptr |
807 | dec_vals(w8) |
808 | #endif |
809 | ||
810 | ||
811 | ENTRY_NP(aes_decrypt_amd64) |
812 | #ifdef GLADMAN_INTERFACE |
813 | // Original interface: rdi = in, rsi = out, rdx = context |
814 | sub $[4*8], %rsp // gnu/linux/opensolaris binary interface |
815 | mov %rsi, (%rsp) // output pointer (P2) |
816 | mov %rdx, %r8 // context (P3) |
817 | ||
818 | mov %rbx, 1*8(%rsp) // P1: input pointer in rdi |
819 | mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) |
820 | mov %r12, 3*8(%rsp) // P3: context in r8 |
821 | movzbl 4*KS_LENGTH(kptr), %esi // esi = number of rounds * 16, one byte read at offset 4*KS_LENGTH of the context (explicit byte load) |
822 | ||
823 | #else |
824 | // OpenSolaris OS interface: rdi = ks, esi = Nr, rdx = input block, rcx = output block |
825 | sub $[4*8], %rsp // Make room on stack to save registers |
826 | mov %rcx, (%rsp) // Save output pointer (P4) on stack |
827 | mov %rdi, %r8 // context (P1) |
828 | mov %rdx, %rdi // P3: save input pointer |
829 | shl $4, %esi // P2: esi = number of rounds (Nr) * 16 |
830 | ||
831 | mov %rbx, 1*8(%rsp) // Save registers |
832 | mov %rbp, 2*8(%rsp) |
833 | mov %r12, 3*8(%rsp) |
834 | // P1: context in r8 |
835 | // P2: number of rounds * 16 in esi |
836 | // P3: input pointer in rdi |
837 | // P4: output pointer in (rsp) |
838 | #endif /* GLADMAN_INTERFACE */ |
839 | ||
840 | lea dec_tab(%rip), tptr // tptr = base of the inverse table |
841 | sub $rofs, kptr // bias kptr by rofs (128 or -128 depending on AES_REV_DKS) |
842 | ||
843 | // Load input block into registers (rdi is reused below, after these loads) |
844 | mov (%rdi), %eax |
845 | mov 1*4(%rdi), %ebx |
846 | mov 2*4(%rdi), %ecx |
847 | mov 3*4(%rdi), %edx |
848 | ||
849 | #ifdef AES_REV_DKS |
850 | mov kptr, %rdi // rdi -> lowest-address round key, used for the initial XOR |
851 | lea (kptr,%rsi), kptr // kptr += rounds * 16 for the ik_ref addressing |
852 | #else |
853 | lea (kptr,%rsi), %rdi // rdi -> highest-address round key, used for the initial XOR; kptr stays at the schedule base |
854 | #endif |
855 | ||
856 | xor rofs(%rdi), %eax // XOR the state with the first decryption round key |
857 | xor rofs+4(%rdi), %ebx |
858 | xor rofs+8(%rdi), %ecx |
859 | xor rofs+12(%rdi), %edx |
860 | ||
861 | // Jump based on number of rounds * 16: 10 rounds -> 3f, 12 -> 2f, 14 -> 1f |
862 | cmp $[10*16], %esi |
863 | je 3f |
864 | cmp $[12*16], %esi |
865 | je 2f |
866 | cmp $[14*16], %esi |
867 | je 1f |
868 | mov $-1, %rax // error: unsupported round count; rax = -1 and the output block is not written |
869 | jmp 4f |
870 | ||
871 | // Perform normal inverse rounds (entry label chosen above skips the rounds a shorter key does not need) |
872 | 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) |
873 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) |
874 | 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) |
875 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) |
876 | 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) |
877 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) |
878 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) |
879 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) |
880 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) |
881 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) |
882 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) |
883 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) |
884 | ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) |
885 | il_rnd(%r9d, %r10d, %r11d, %r12d, 0) |
886 | ||
887 | // Copy results from r9d..r12d to the output pointer saved at (%rsp) |
888 | mov (%rsp), %rbx |
889 | mov %r9d, (%rbx) |
890 | mov %r10d, 4(%rbx) |
891 | mov %r11d, 8(%rbx) |
892 | mov %r12d, 12(%rbx) |
893 | xor %rax, %rax // rax = 0 on the normal path |
894 | 4: // Restore registers (rbx, rbp and r12 were saved in stack slots 1..3) |
895 | mov 1*8(%rsp), %rbx |
896 | mov 2*8(%rsp), %rbp |
897 | mov 3*8(%rsp), %r12 |
898 | add $[4*8], %rsp |
899 | ret |
900 | ||
901 | SET_SIZE(aes_decrypt_amd64) |
902 | #endif /* lint || __lint */ | |
903 | ||
904 | #ifdef __ELF__ /* ELF targets only: emit an empty .note.GNU-stack section (GNU toolchain marker that this object does not need an executable stack) */ |
905 | .section .note.GNU-stack,"",%progbits |
906 | #endif |