]>
Commit | Line | Data |
---|---|---|
a4397635 AB |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* | |
3 | * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> | |
4 | */ | |
5 | ||
6 | #ifdef CONFIG_ARM64 | |
7 | #include <asm/neon-intrinsics.h> | |
8 | ||
9 | #define AES_ROUND "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b" | |
10 | #else | |
11 | #include <arm_neon.h> | |
12 | ||
13 | #define AES_ROUND "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0" | |
14 | #endif | |
15 | ||
16 | #define AEGIS_BLOCK_SIZE 16 | |
17 | ||
18 | #include <stddef.h> | |
19 | ||
20 | void *memcpy(void *dest, const void *src, size_t n); | |
21 | void *memset(void *s, int c, size_t n); | |
22 | ||
23 | struct aegis128_state { | |
24 | uint8x16_t v[5]; | |
25 | }; | |
26 | ||
27 | static struct aegis128_state aegis128_load_state_neon(const void *state) | |
28 | { | |
29 | return (struct aegis128_state){ { | |
30 | vld1q_u8(state), | |
31 | vld1q_u8(state + 16), | |
32 | vld1q_u8(state + 32), | |
33 | vld1q_u8(state + 48), | |
34 | vld1q_u8(state + 64) | |
35 | } }; | |
36 | } | |
37 | ||
38 | static void aegis128_save_state_neon(struct aegis128_state st, void *state) | |
39 | { | |
40 | vst1q_u8(state, st.v[0]); | |
41 | vst1q_u8(state + 16, st.v[1]); | |
42 | vst1q_u8(state + 32, st.v[2]); | |
43 | vst1q_u8(state + 48, st.v[3]); | |
44 | vst1q_u8(state + 64, st.v[4]); | |
45 | } | |
46 | ||
47 | static inline __attribute__((always_inline)) | |
48 | uint8x16_t aegis_aes_round(uint8x16_t w) | |
49 | { | |
50 | uint8x16_t z = {}; | |
51 | ||
52 | /* | |
53 | * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics | |
54 | * to force the compiler to issue the aese/aesmc instructions in pairs. | |
55 | * This is much faster on many cores, where the instruction pair can | |
56 | * execute in a single cycle. | |
57 | */ | |
58 | asm(AES_ROUND : "+w"(w) : "w"(z)); | |
59 | return w; | |
60 | } | |
61 | ||
62 | static inline __attribute__((always_inline)) | |
63 | struct aegis128_state aegis128_update_neon(struct aegis128_state st, | |
64 | uint8x16_t m) | |
65 | { | |
66 | m ^= aegis_aes_round(st.v[4]); | |
67 | st.v[4] ^= aegis_aes_round(st.v[3]); | |
68 | st.v[3] ^= aegis_aes_round(st.v[2]); | |
69 | st.v[2] ^= aegis_aes_round(st.v[1]); | |
70 | st.v[1] ^= aegis_aes_round(st.v[0]); | |
71 | st.v[0] ^= m; | |
72 | ||
73 | return st; | |
74 | } | |
75 | ||
76 | void crypto_aegis128_update_neon(void *state, const void *msg) | |
77 | { | |
78 | struct aegis128_state st = aegis128_load_state_neon(state); | |
79 | ||
80 | st = aegis128_update_neon(st, vld1q_u8(msg)); | |
81 | ||
82 | aegis128_save_state_neon(st, state); | |
83 | } | |
84 | ||
85 | void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, | |
86 | unsigned int size) | |
87 | { | |
88 | struct aegis128_state st = aegis128_load_state_neon(state); | |
89 | uint8x16_t msg; | |
90 | ||
91 | while (size >= AEGIS_BLOCK_SIZE) { | |
92 | uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; | |
93 | ||
94 | msg = vld1q_u8(src); | |
95 | st = aegis128_update_neon(st, msg); | |
96 | vst1q_u8(dst, msg ^ s); | |
97 | ||
98 | size -= AEGIS_BLOCK_SIZE; | |
99 | src += AEGIS_BLOCK_SIZE; | |
100 | dst += AEGIS_BLOCK_SIZE; | |
101 | } | |
102 | ||
103 | if (size > 0) { | |
104 | uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; | |
105 | uint8_t buf[AEGIS_BLOCK_SIZE] = {}; | |
106 | ||
107 | memcpy(buf, src, size); | |
108 | msg = vld1q_u8(buf); | |
109 | st = aegis128_update_neon(st, msg); | |
110 | vst1q_u8(buf, msg ^ s); | |
111 | memcpy(dst, buf, size); | |
112 | } | |
113 | ||
114 | aegis128_save_state_neon(st, state); | |
115 | } | |
116 | ||
117 | void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, | |
118 | unsigned int size) | |
119 | { | |
120 | struct aegis128_state st = aegis128_load_state_neon(state); | |
121 | uint8x16_t msg; | |
122 | ||
123 | while (size >= AEGIS_BLOCK_SIZE) { | |
124 | msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; | |
125 | st = aegis128_update_neon(st, msg); | |
126 | vst1q_u8(dst, msg); | |
127 | ||
128 | size -= AEGIS_BLOCK_SIZE; | |
129 | src += AEGIS_BLOCK_SIZE; | |
130 | dst += AEGIS_BLOCK_SIZE; | |
131 | } | |
132 | ||
133 | if (size > 0) { | |
134 | uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; | |
135 | uint8_t buf[AEGIS_BLOCK_SIZE]; | |
136 | ||
137 | vst1q_u8(buf, s); | |
138 | memcpy(buf, src, size); | |
139 | msg = vld1q_u8(buf) ^ s; | |
140 | vst1q_u8(buf, msg); | |
141 | memcpy(dst, buf, size); | |
142 | ||
143 | st = aegis128_update_neon(st, msg); | |
144 | } | |
145 | ||
146 | aegis128_save_state_neon(st, state); | |
147 | } |