]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - arch/x86/crypto/blowfish-x86_64-asm_64.S
crypto: aesni - handle zero length dst buffer
[mirror_ubuntu-bionic-kernel.git] / arch / x86 / crypto / blowfish-x86_64-asm_64.S
CommitLineData
64b94cea
JK
1/*
2 * Blowfish Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
5186e395
JK
23#include <linux/linkage.h>
24
64b94cea
JK
25.file "blowfish-x86_64-asm.S"
26.text
27
/*
 * Structure of the crypto context, expressed as byte offsets from CTX.
 *
 * The offsets below describe (16 + 2) 32-bit words at offset `p`, followed
 * by four consecutive tables of 256 32-bit words each (`s0`..`s3`).
 * NOTE(review): this has to mirror the C-side context layout -- confirm
 * against the glue code whenever that structure changes.
 */
#define BF_WORD_BYTES	4
#define BF_P_WORDS	(16 + 2)
#define BF_SBOX_WORDS	256
#define BF_SBOX_OFF(i)	((BF_P_WORDS + ((i) * BF_SBOX_WORDS)) * BF_WORD_BYTES)

#define p	0
#define s0	BF_SBOX_OFF(0)
#define s1	BF_SBOX_OFF(1)
#define s2	BF_SBOX_OFF(2)
#define s3	BF_SBOX_OFF(3)
34
/*
 * Register macros.
 *
 * CTX is the context base used by every table/round-key access.
 * RIO is the current input/output pointer. Note that RIO and RT1 are the
 * same register (%rsi): the round macros overwrite RT1, so RIO is only valid
 * until the first F()/F4() and has to be reloaded before results are stored.
 */
#define CTX	%r12
#define RIO	%rsi

/*
 * Block registers. For each RXn: the full 64-bit register, its 32-bit view
 * (d), its low byte (bl) and its second byte (bh).
 */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* Scratch registers used by the round macros, with their 32-bit views. */
#define RT0	%rdi
#define RT0d	%edi

#define RT1	%rsi
#define RT1d	%esi

#define RT2	%r8
#define RT2d	%r8d

#define RT3	%r9
#define RT3d	%r9d

/*
 * Preloaded round key for the 4-way macros. The 1-way routines do not use
 * RKEY and instead use %r10 directly to hold the destination pointer.
 */
#define RKEY	%r10
64b94cea
JK
70
71/***********************************************************************
72 * 1-way blowfish
73 ***********************************************************************/
e827bb09
JK
/*
 * F(): one Feistel step on the 64-bit block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables s0..s3 (most
 * significant byte -> s0, least significant byte -> s3) and are combined as
 * ((s0 + s1) ^ s2) + s3. RX0 is then left with its 32-bit halves swapped and
 * the result XORed into the new low half. RT0d is written with 32-bit
 * operations, so the upper half of RT0 is zero when it is XORed into RX0.
 *
 * Clobbers: RT0, RT1 (same register as RIO), RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR the 8 bytes at p + 4*n into RX0, i.e. round-key words n (low half)
 * and n + 1 (high half) in a single operation.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Two encryption rounds: apply key words n and n + 1, then two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Decryption counterpart of add_roundkey_enc(): loads key words n - 1 and n,
 * swaps them with a 32-bit rotate so that word n lands in the low half, and
 * XORs the pair into RX0.
 *
 * The argument is parenthesised, matching preload_roundkey_dec(), so that an
 * expression argument cannot change the computed offset.
 *
 * Clobbers: RT0, flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * Two decryption rounds. The macro ends on its last statement (no trailing
 * line continuation), so it cannot absorb whatever line follows it.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO) into RX0 so that the first 4 bytes, read as a
 * big-endian word, end up in the low half and the next 4 bytes in the high
 * half.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-reverse RX0 and store all 8 bytes to (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Byte-reverse RX0 and XOR all 8 bytes into the memory at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
120
ENTRY(__blowfish_enc_blk)
	/*
	 * Encrypt a single 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 while %r12
	 * serves as CTX, and dst is parked in %r10 because %rsi doubles as
	 * RIO and as the RT1 scratch register. %rcx is not touched by the
	 * 1-way macros, so the xor flag is still in %cl at the end.
	 */
	movq %rsi, %r10;		/* remember dst */
	movq %rdx, RIO;			/* RIO = src */
	movq %r12, %r11;		/* preserve caller's %r12 */
	movq %rdi, CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* Neither move alters flags; CTX is no longer needed from here on. */
	movq %r11, %r12;
	movq %r10, RIO;			/* RIO = dst */

	test %cl, %cl;
	jnz .L__enc_blk_xor;

	/* Plain store of the result. */
	write_block();
	ret;

.L__enc_blk_xor:
	/* XOR the result into the existing contents of dst. */
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)
64b94cea 158
ENTRY(blowfish_dec_blk)
	/*
	 * Decrypt a single 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * As in the 1-way encrypt routine, no stack is used: %r11 holds the
	 * caller's %r12 while %r12 is CTX, and %r10 holds dst while %rsi is
	 * used as RIO / RT1.
	 */
	movq %rsi, %r10;		/* remember dst */
	movq %rdx, RIO;			/* RIO = src */
	movq %r12, %r11;		/* preserve caller's %r12 */
	movq %rdi, CTX;

	read_block();

	/* Round keys are applied in descending order. */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;			/* RIO = dst */
	write_block();

	movq %r11, %r12;		/* hand %r12 back to the caller */

	ret;
ENDPROC(blowfish_dec_blk)
64b94cea
JK
190
191/**********************************************************************
192 4-way blowfish, four blocks parallel
193 **********************************************************************/
e827bb09
JK
194
/*
 * F4(blk): F() variant for the 4-way code. Slower when used alone/1-way,
 * but faster when used parallel/4-way (tested on AMD Phenom II & Intel
 * Xeon E7330).
 *
 * `blk` must be one of RX0..RX3; the byte views are derived from it by
 * token pasting (blk ## bh, blk ## bl). The two 16-bit rotates leave `blk`
 * with its 32-bit halves swapped before the table result is XORed in.
 *
 * Clobbers: RT0, RT1 (same register as RIO), RT2, RT3, flags.
 */
#define F4(blk) \
	movzbl blk ## bh,	RT1d; \
	movzbl blk ## bl,	RT3d; \
	rorq $16,		blk; \
	movzbl blk ## bh,	RT0d; \
	movzbl blk ## bl,	RT2d; \
	rorq $16,		blk; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		blk;

/* One F4() step on each of the four blocks, in RX0..RX3 order. */
#define F4_all_blocks() \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* XOR the round-key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = the 8 bytes at p + 4*n (key words n and n + 1). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the preloaded key pair, then fetch the pair for the next round. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Two encryption rounds on all four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4_all_blocks() \
	\
	F4_all_blocks()

/* RKEY = key words n - 1 and n, rotated so that word n is in the low half. */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the preloaded key pair, then fetch the pair two words further down. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Two decryption rounds on all four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4_all_blocks() \
	\
	F4_all_blocks()

/*
 * Per-block helpers for the 32 bytes at RIO; `off` is the byte offset of the
 * block and `blk` its register. Each one performs the same load/store
 * sequence as the corresponding 1-way block macro.
 */
#define load_block4(off, blk) \
	movq off(RIO),		blk; \
	rorq $32,		blk; \
	bswapq			blk;

#define store_block4(off, blk) \
	bswapq			blk; \
	movq blk,		off(RIO);

#define xor_store_block4(off, blk) \
	bswapq			blk; \
	xorq blk,		off(RIO);

/* Load four consecutive 8-byte blocks from RIO into RX0..RX3. */
#define read_block4() \
	load_block4(0, RX0) \
	\
	load_block4(8, RX1) \
	\
	load_block4(16, RX2) \
	\
	load_block4(24, RX3)

/* Store RX0..RX3 as four consecutive 8-byte blocks at RIO. */
#define write_block4() \
	store_block4(0, RX0) \
	\
	store_block4(8, RX1) \
	\
	store_block4(16, RX2) \
	\
	store_block4(24, RX3)

/* XOR RX0..RX3 into the four consecutive 8-byte blocks at RIO. */
#define xor_block4() \
	xor_store_block4(0, RX0) \
	\
	xor_store_block4(8, RX1) \
	\
	xor_store_block4(16, RX2) \
	\
	xor_store_block4(24, RX3)
300
ENTRY(__blowfish_enc_blk_4way)
	/*
	 * Encrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * %r12 (CTX) and %rbx (RX1) are saved on the stack. %rcx is pushed as
	 * well because it doubles as RX2 and is overwritten by read_block4();
	 * the flag is popped back into %r12 once CTX is no longer needed.
	 * dst is parked in %r11 since %rsi serves as RIO / RT1.
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;			/* xor flag, reloaded after the rounds */

	movq %rsi, %r11;		/* remember dst */
	movq %rdx, RIO;			/* RIO = src */
	movq %rdi, CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	movq %r11, RIO;			/* RIO = dst */
	popq %r12;			/* %r12 = xor flag pushed from %rcx */

	test %r12b, %r12b;
	jnz .L__enc_blk4_xor;

	/* Plain store of the four result blocks. */
	write_block4();

	popq %rbx;
	popq %r12;
	ret;

.L__enc_blk4_xor:
	/* XOR the four result blocks into the existing contents of dst. */
	xor_block4();

	popq %rbx;
	popq %r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)
64b94cea 349
ENTRY(blowfish_dec_blk_4way)
	/*
	 * Decrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 (CTX) and %rbx (RX1) are saved on the stack; dst is parked in
	 * %r11 since %rsi serves as RIO / RT1.
	 */
	pushq %r12;
	pushq %rbx;

	movq %rsi, %r11;		/* remember dst */
	movq %rdx, RIO;			/* RIO = src */
	movq %rdi, CTX;

	preload_roundkey_dec(17);
	read_block4();

	/* Round keys are applied in descending order. */
	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;			/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)