vpunpcklqdq t1, t0, x0; \
vpunpckhqdq t1, t0, x1;
-#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
- vmovdqu (0*4*4)(in), x0; \
- vmovdqu (1*4*4)(in), x1; \
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
\
transpose_2x4(x0, x1, t0, t1)
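
For reference, inpack_blocks() now works purely on registers: each of the two input registers carries two 64-bit big-endian blocks, and after the byteswap plus transpose_2x4() one register holds the four 32-bit left halves and the other the four right halves. A scalar sketch of that layout change (a hypothetical model, not part of the patch; dword indexing assumed):

#include <stdint.h>

/* xl/xr: on entry, two 64-bit big-endian blocks each (as dwords); on exit,
 * xl = left halves and xr = right halves of all four blocks, interleaved
 * between the two source registers. */
static void inpack_blocks_model(uint32_t xl[4], uint32_t xr[4])
{
	uint32_t a[4], b[4];
	int i;

	/* vpshufb with .Lbswap_mask: byteswap every 32-bit word */
	for (i = 0; i < 4; i++) {
		a[i] = __builtin_bswap32(xl[i]);
		b[i] = __builtin_bswap32(xr[i]);
	}

	/* transpose_2x4: even dwords (left halves) to xl, odd to xr */
	xl[0] = a[0]; xl[1] = b[0]; xl[2] = a[2]; xl[3] = b[2];
	xr[0] = a[1]; xr[1] = b[1]; xr[2] = a[3]; xr[3] = b[3];
}

The reworked outunpack_blocks() below applies the inverse transform.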
-#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
transpose_2x4(x0, x1, t0, t1) \
\
vpshufb rmask, x0, x0; \
- vpshufb rmask, x1, x1; \
- vmovdqu x0, (0*4*4)(out); \
- vmovdqu x1, (1*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
- transpose_2x4(x0, x1, t0, t1) \
- \
- vpshufb rmask, x0, x0; \
- vpshufb rmask, x1, x1; \
- vpxor (0*4*4)(out), x0, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor (1*4*4)(out), x1, x1; \
- vmovdqu x1, (1*4*4)(out);
+ vpshufb rmask, x1, x1;
.data

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
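
The new .Lbswap_iv_mask feeds the CTR code added below: vpshufb with this mask takes the big-endian 64-bit IV sitting in the low quadword, byteswaps it, and broadcasts the result into both lanes. In scalar terms (a sketch only, with lo/hi naming the two quadwords):

	hi = lo = __builtin_bswap64(lo);	/* le IV in both lanes */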
.text
.align 16
-.global __cast5_enc_blk_16way
-.type __cast5_enc_blk_16way,@function;
-__cast5_enc_blk_16way:
+.type __cast5_enc_blk16,@function;
+__cast5_enc_blk16:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: bool, if true: xor output
+ * RL1: blocks 1 and 2
+ * RR1: blocks 3 and 4
+ * RL2: blocks 5 and 6
+ * RR2: blocks 7 and 8
+ * RL3: blocks 9 and 10
+ * RR3: blocks 11 and 12
+ * RL4: blocks 13 and 14
+ * RR4: blocks 15 and 16
+ * output:
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
*/
pushq %rbp;
pushq %rbx;
- pushq %rcx;
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
enc_preload_rkr();
- leaq 1*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
- inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
round(RL, RR, 0, 1);
round(RR, RL, 1, 2);
round(RR, RL, 15, 1);
__skip_enc:
- popq %rcx;
popq %rbx;
popq %rbp;
vmovdqa .Lbswap_mask, RKM;
- leaq 1*(2*4*4)(%r11), %rax;
- testb %cl, %cl;
- jnz __enc_xor16;
-
- outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
-
- ret;
-
-__enc_xor16:
- outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
ret;
.align 16
-.global cast5_dec_blk_16way
-.type cast5_dec_blk_16way,@function;
-cast5_dec_blk_16way:
+.type __cast5_dec_blk16,@function;
+__cast5_dec_blk16:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ * output:
+ * RL1: decrypted blocks 1 and 2
+ * RR1: decrypted blocks 3 and 4
+ * RL2: decrypted blocks 5 and 6
+ * RR2: decrypted blocks 7 and 8
+ * RL3: decrypted blocks 9 and 10
+ * RR3: decrypted blocks 11 and 12
+ * RL4: decrypted blocks 13 and 14
+ * RR4: decrypted blocks 15 and 16
*/
pushq %rbp;
vmovd .L32_mask, R32;
dec_preload_rkr();
- leaq 1*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
- inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
movzbl rr(CTX), %eax;
testl %eax, %eax;
popq %rbx;
popq %rbp;
- leaq 1*(2*4*4)(%r11), %rax;
- outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
ret;
__skip_dec:
vpsrldq $4, RKR, RKR;
jmp __dec_tail;
+
+.align 16
+.global cast5_ecb_enc_16way
+.type cast5_ecb_enc_16way,@function;
+
+cast5_ecb_enc_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_enc_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ ret;
+
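
Apart from the swapped RR/RL store order (CAST5 ends with its halves swapped after the last round), cast5_ecb_enc_16way() amounts to sixteen independent block encryptions. A scalar equivalent (a sketch; it reuses the generic __cast5_encrypt() helper that the glue code below also calls, and assumes <crypto/cast5.h> for the context type):

#include <crypto/cast5.h>

static void ecb_enc_16way_model(struct cast5_ctx *ctx, u8 *dst,
				const u8 *src)
{
	int i;

	for (i = 0; i < 16; i++)
		__cast5_encrypt(ctx, dst + i * CAST5_BLOCK_SIZE,
				src + i * CAST5_BLOCK_SIZE);
}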
+.align 16
+.global cast5_ecb_dec_16way
+.type cast5_ecb_dec_16way,@function;
+
+cast5_ecb_dec_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ ret;
+
+.align 16
+.global cast5_cbc_dec_16way
+.type cast5_cbc_dec_16way,@function;
+
+cast5_cbc_dec_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vmovdqu (0*16)(%rdx), RL1;
+ vmovdqu (1*16)(%rdx), RR1;
+ vmovdqu (2*16)(%rdx), RL2;
+ vmovdqu (3*16)(%rdx), RR2;
+ vmovdqu (4*16)(%rdx), RL3;
+ vmovdqu (5*16)(%rdx), RR3;
+ vmovdqu (6*16)(%rdx), RL4;
+ vmovdqu (7*16)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ /* xor with src */
+ vmovq (%r12), RX;
+ vpshufd $0x4f, RX, RX;
+ vpxor RX, RR1, RR1;
+ vpxor 0*16+8(%r12), RL1, RL1;
+ vpxor 1*16+8(%r12), RR2, RR2;
+ vpxor 2*16+8(%r12), RL2, RL2;
+ vpxor 3*16+8(%r12), RR3, RR3;
+ vpxor 4*16+8(%r12), RL3, RL3;
+ vpxor 5*16+8(%r12), RR4, RR4;
+ vpxor 6*16+8(%r12), RL4, RL4;
+
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r12;
+
+ ret;
+
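
The xor sequence above performs the CBC chaining for blocks 1..15 of the batch: vmovq zero-extends ciphertext block 0 into RX, and vpshufd $0x4f moves it into the high quadword (the low lane stays zero, so plaintext block 0 is left untouched), while the remaining vpxor instructions read the source 8 bytes back, i.e. the preceding ciphertext block. Chaining block 0 to the previous ciphertext (or the IV) stays with the C glue. A scalar model (a sketch; the ct[] copy matters because dst may alias src):

#include <crypto/cast5.h>
#include <linux/string.h>

static void cbc_dec_16way_model(struct cast5_ctx *ctx, u8 *dst,
				const u8 *src)
{
	u64 ct[16];
	int i;

	memcpy(ct, src, sizeof(ct));

	for (i = 0; i < 16; i++)
		__cast5_decrypt(ctx, dst + i * CAST5_BLOCK_SIZE,
				src + i * CAST5_BLOCK_SIZE);

	/* chain blocks 15..1 to the preceding ciphertext block;
	 * block 0 is the caller's job (see the glue code below) */
	for (i = 15; i >= 1; i--)
		((u64 *)dst)[i] ^= ct[i - 1];
}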
+.align 16
+.global cast5_ctr_16way
+.type cast5_ctr_16way,@function;
+
+cast5_ctr_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (big endian, 64bit)
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vpcmpeqd RTMP, RTMP, RTMP;
+ vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+ vpcmpeqd RKR, RKR, RKR;
+ vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+ vmovdqa .Lbswap_iv_mask, R1ST;
+ vmovdqa .Lbswap128_mask, RKM;
+
+ /* load IV and byteswap */
+ vmovq (%rcx), RX;
+ vpshufb R1ST, RX, RX;
+
+ /* construct IVs */
+ vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+ vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+ /* store last IV */
+ vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+ vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+ vmovq RX, (%rcx);
+
+ call __cast5_enc_blk16;
+
+ /* dst = src ^ iv */
+ vpxor (0*16)(%r12), RR1, RR1;
+ vpxor (1*16)(%r12), RL1, RL1;
+ vpxor (2*16)(%r12), RR2, RR2;
+ vpxor (3*16)(%r12), RL2, RL2;
+ vpxor (4*16)(%r12), RR3, RR3;
+ vpxor (5*16)(%r12), RL3, RL3;
+ vpxor (6*16)(%r12), RR4, RR4;
+ vpxor (7*16)(%r12), RL4, RL4;
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r12;
+
+ ret;
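
Net effect: the function encrypts the counter values IV, IV+1, ..., IV+15 (as big-endian 64-bit blocks), xors that keystream into the source, and writes IV+16 back through %rcx. A scalar model (a sketch; kernel byteorder helpers assumed):

#include <crypto/cast5.h>
#include <asm/byteorder.h>

static void ctr_16way_model(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
			    __be64 *iv)
{
	u64 ctr = be64_to_cpu(*iv);
	int i;

	for (i = 0; i < 16; i++) {
		u64 ks;

		*(__be64 *)&ks = cpu_to_be64(ctr + i);
		__cast5_encrypt(ctx, (u8 *)&ks, (u8 *)&ks);
		((u64 *)dst)[i] = ((const u64 *)src)[i] ^ ks;
	}

	*iv = cpu_to_be64(ctr + 16);
}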
#define CAST5_PARALLEL_BLOCKS 16
-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+ __be64 *iv);
static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{

static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
		     bool enc)
{
	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes;
+ void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
int err;
+ fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
- if (enc)
- cast5_enc_blk_xway(ctx, wdst, wsrc);
- else
- cast5_dec_blk_xway(ctx, wdst, wsrc);
+ fn(ctx, wdst, wsrc);
wsrc += bsize * CAST5_PARALLEL_BLOCKS;
wdst += bsize * CAST5_PARALLEL_BLOCKS;
goto done;
}
+ fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
/* Handle leftovers */
do {
- if (enc)
- __cast5_encrypt(ctx, wdst, wsrc);
- else
- __cast5_decrypt(ctx, wdst, wsrc);
+ fn(ctx, wdst, wsrc);
wsrc += bsize;
wdst += bsize;

static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
	unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
u64 last_iv;
- int i;
/* Start of the last block. */
src += nbytes / bsize - 1;
src -= CAST5_PARALLEL_BLOCKS - 1;
dst -= CAST5_PARALLEL_BLOCKS - 1;
- for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
- ivs[i] = src[i];
-
- cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
- *(dst + (i + 1)) ^= *(ivs + i);
+ cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
nbytes -= bsize;
if (nbytes < bsize)

static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
				struct blkcipher_walk *walk)
{
	unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
- int i;
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
- /* create ctrblks for parallel encrypt */
- for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- ctrblocks[i] = cpu_to_be64(ctrblk++);
- }
-
- cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
- (u8 *)ctrblocks);
+ cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+ (__be64 *)walk->iv);
src += CAST5_PARALLEL_BLOCKS;
dst += CAST5_PARALLEL_BLOCKS;
/* Handle leftovers */
do {
+ u64 ctrblk;
+
if (dst != src)
*dst = *src;
- ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblk = *(u64 *)walk->iv;
+ be64_add_cpu((__be64 *)walk->iv, 1);
- __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
- *dst ^= ctrblocks[0];
+ __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ *dst ^= ctrblk;
src += 1;
dst += 1;
} while (nbytes >= bsize);
done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
return nbytes;
}