]>
Commit | Line | Data |
---|---|---|
64b94cea JK |
1 | /* |
2 | * Blowfish Cipher Algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
5186e395 JK |
23 | #include <linux/linkage.h> |
24 | ||
64b94cea JK |
25 | .file "blowfish-x86_64-asm.S" |
26 | .text | |
27 | ||
/*
 * Byte offsets into the crypto context.  The round-key array starts at
 * offset 0 and occupies (16 + 2) 4-byte slots; the four lookup tables
 * follow it, spaced 256 * 4 bytes apart.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
34 | ||
/*
 * Register assignments.
 *
 * Note that RIO and RT1 are both %rsi: the I/O pointer is lost as soon
 * as F()/F4() writes RT1, so the entry points below keep the destination
 * pointer in a spare register and reload RIO before storing the result.
 */

/* crypto context base and block I/O pointer */
#define CTX	%r12
#define RIO	%rsi

/* block register 0: 64-bit, 32-bit, low-byte and high-byte views */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

/* block register 1 */
#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

/* block register 2 */
#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

/* block register 3 */
#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* temporaries: 64-bit and 32-bit views */
#define RT0	%rdi
#define RT0d	%edi
#define RT1	%rsi
#define RT1d	%esi
#define RT2	%r8
#define RT2d	%r8d
#define RT3	%r9
#define RT3d	%r9d

/* preloaded round-key pair (4-way code only) */
#define RKEY	%r10
64b94cea JK |
70 | |
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one round step on the block held in RX0.
 *
 * The four bytes a..d of the low 32 bits of RX0 (a = most significant)
 * index s0..s3 and are combined as ((s0[a] + s1[b]) ^ s2[c]) + s3[d].
 * The 32-bit halves of RX0 are swapped and the result is XORed into the
 * new low half.
 *
 * High-byte sources are only paired with RT0d/RT1d (%edi/%esi), since
 * %ah cannot be encoded together with the REX-only %r8d.
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define F() \
	rorq $16,		RX0;	/* bring bits 31..16 into %ah:%al */ \
	movzbl RX0bh,		RT0d;	/* a = bits 31..24 */ \
	movzbl RX0bl,		RT1d;	/* b = bits 23..16 */ \
	rolq $16,		RX0;	/* undo the rotate */ \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d;	/* c = bits 15..8 */ \
	movzbl RX0bl,		RT2d;	/* d = bits 7..0 */ \
	rolq $32,		RX0;	/* swap the 32-bit halves */ \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;	/* upper 32 bits of RT0 are zero */
64b94cea JK |
87 | |
/* XOR the 64-bit pair of round keys starting at index n into RX0. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Two encryption rounds: add round keys n and n+1, then run F() twice. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); \
	F();
64b94cea JK |
96 | |
/*
 * XOR round keys n and n-1 into RX0.  The 64-bit pair is loaded from
 * index n-1 and its halves are swapped, so that key n lands in the low
 * half and key n-1 in the high half.
 *
 * The argument is parenthesized (as in preload_roundkey_dec) so that an
 * expression argument keeps its meaning.
 *
 * Clobbers: RT0, flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * Two decryption rounds: add round keys n and n-1, then run F() twice.
 * No line continuation after the last F(), so the macro body cannot
 * absorb whatever line happens to follow the definition.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	F(); \
	F();
64b94cea JK |
107 | |
/*
 * Load the 8-byte block at (RIO) into RX0.  The rotate by 32 followed by
 * the 64-bit byte swap leaves each 32-bit word byte-reversed, with the
 * word from the lower address in the low half of RX0.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Byte-swap RX0 and store it to (RIO); the high half of RX0 ends up at
 * the lower address.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Same as write_block(), but XOR into the 8 bytes already at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
120 | ||
ENTRY(__blowfish_enc_blk)
	/*
	 * Encrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * CTX lives in the callee-saved %r12, whose old value is parked in
	 * %r11 for the duration of the function.  dst is parked in %r10
	 * because RIO/RT1 (%rsi) is overwritten by F().  %rcx is not touched
	 * by the 1-way macros, so the xor flag can be tested in place.
	 */
	movq %rsi,		%r10;	/* dst */
	movq %rdx,		RIO;	/* src */
	movq %r12,		%r11;	/* save caller's %r12 */
	movq %rdi,		CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r10,		RIO;	/* point RIO at dst */
	movq %r11,		%r12;	/* restore caller's %r12 */

	testb %cl,		%cl;
	jnz .L__enc_xor;

	write_block();
	ret;

.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)
64b94cea | 158 | |
ENTRY(blowfish_dec_blk)
	/*
	 * Decrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * CTX lives in the callee-saved %r12, whose old value is parked in
	 * %r11.  dst is parked in %r10 because RIO/RT1 (%rsi) is overwritten
	 * by F().
	 */
	movq %rsi,		%r10;	/* dst */
	movq %rdx,		RIO;	/* src */
	movq %r12,		%r11;	/* save caller's %r12 */
	movq %rdi,		CTX;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r11,		%r12;	/* restore caller's %r12 */

	movq %r10,		RIO;	/* point RIO at dst */
	write_block();

	ret;
ENDPROC(blowfish_dec_blk)
64b94cea JK |
190 | |
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F4(x): the F() round step for the 4-way code, applied to block
 * register x.
 *
 * Slower than F() when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * All four table indices are extracted first; the two 16-bit rotates add
 * up to a swap of the 32-bit halves of x.  The combined table value is
 * then XORed into the new low half.
 *
 * High-byte sources are only paired with RT0d/RT1d (%edi/%esi), since
 * high-byte registers cannot be encoded together with the REX-only
 * %r8d/%r9d.
 *
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d;	/* c = bits 15..8 */ \
	movzbl x ## bl,		RT3d;	/* d = bits 7..0 */ \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d;	/* a = bits 31..24 */ \
	movzbl x ## bl,		RT2d;	/* b = bits 23..16 */ \
	rorq $16,		x;	/* halves are now swapped */ \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;	/* upper 32 bits of RT0 are zero */
210 | ||
/* XOR the round-key pair held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* Load the 64-bit pair of round keys starting at index n into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the preloaded key pair to all four blocks, then fetch the pair
 * for the next double round (index n + 2).  The argument is
 * parenthesized so that an expression argument keeps its meaning.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * Two encryption rounds on four blocks: add the preloaded keys for
 * index n, preload the next pair, then run F4() twice over every block.
 * The F4() steps of the four blocks are kept in this interleaved order.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
64b94cea JK |
236 | |
/*
 * Load round keys n and n-1 into RKEY.  The 64-bit pair is read from
 * index n-1 and its halves are swapped, so that key n lands in the low
 * half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the preloaded key pair to all four blocks, then fetch the pair
 * for the next double round (index n - 2).  The argument is
 * parenthesized so that an expression argument keeps its meaning.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * Two decryption rounds on four blocks: add the preloaded keys for
 * index n, preload the next pair, then run F4() twice over every block.
 * The F4() steps of the four blocks are kept in this interleaved order.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
64b94cea JK |
257 | |
/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3.  For each
 * block, the rotate by 32 followed by the 64-bit byte swap leaves each
 * 32-bit word byte-reversed, with the word from the lower address in the
 * low half of the register.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them as four consecutive blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* Same as write_block4(), but XOR into the 32 bytes already at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);
300 | ||
ENTRY(__blowfish_enc_blk_4way)
	/*
	 * Encrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * %r12 (CTX) and %rbx (RX1) are callee-saved and are preserved on
	 * the stack.  %rcx doubles as RX2, so the xor flag is pushed too and
	 * popped back once the rounds are done.  dst is parked in %r11
	 * because RIO/RT1 (%rsi) is overwritten by F4() and %r10 is RKEY.
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;			/* xor flag */

	movq %rsi,		%r11;	/* dst */
	movq %rdx,		RIO;	/* src */
	movq %rdi,		CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	movq %r11,		RIO;	/* point RIO at dst */
	popq %r12;			/* xor flag; CTX is no longer needed */

	testb %r12b,		%r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)
64b94cea | 349 | |
ENTRY(blowfish_dec_blk_4way)
	/*
	 * Decrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 (CTX) and %rbx (RX1) are callee-saved and are preserved on
	 * the stack.  dst is parked in %r11 because RIO/RT1 (%rsi) is
	 * overwritten by F4() and %r10 is RKEY.
	 */
	pushq %r12;
	pushq %rbx;

	movq %rsi,		%r11;	/* dst */
	movq %rdx,		RIO;	/* src */
	movq %rdi,		CTX;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11,		RIO;	/* point RIO at dst */
	write_block4();

	popq %rbx;
	popq %r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)