/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

	.text
	.align		6

ENTRY(chacha20_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i

	//
	// This function encrypts one ChaCha20 block by loading the state
	// matrix in four NEON registers. It performs the matrix operations on
	// four words in parallel, but requires shuffling to rearrange the
	// words after each round.
	//

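	// For reference, the quarter-round as defined in RFC 7539, section 2.1:
	//   a += b; d ^= a; d = rotl32(d, 16)
	//   c += d; b ^= c; b = rotl32(b, 12)
	//   a += b; d ^= a; d = rotl32(d, 8)
	//   c += d; b ^= c; b = rotl32(b, 7)
	// Here a..d are the four rows of the state (v0..v3), so each vector
	// instruction below performs one quarter-round step on four columns
	// (or, after the shuffles, four diagonals) at once.
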
	// x0..3 = s0..3
	adr		x3, ROT8
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
	ld1		{v12.4s}, [x3]
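	// v0-v3 hold the working state, v8-v11 an unmodified copy of the
	// input state for the final feed-forward addition, and v12 the ROT8
	// byte permutation table used by tbl for the rotate-by-8 steps.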

	mov		x3, #10

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h
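	// (rev32 on .8h elements swaps the 16-bit halves of every 32-bit
	// word, which is exactly a rotate by 16.)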

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20
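	// (shl/sri compose the rotate: shl writes x << 12, then sri inserts
	// x >> 20 into the vacated low bits, so no separate OR is needed.)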

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b
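	// (tbl permutes the bytes of each word according to the ROT8 table,
	// implementing the rotate by 8 as a single byte shuffle.)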

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

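	// Rotate rows 1-3 left by one, two and three words respectively, so
	// that the second half of the double round operates on the diagonals
	// of the state matrix.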
	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

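	// Rotate the rows back so the next double round starts from the
	// column arrangement again.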
	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		x3, x3, #1
	b.ne		.Ldoubleround

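	// Feed forward: add the saved input state to the working state, then
	// XOR with the input data block to produce the output block.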
	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ret
ENDPROC(chacha20_block_xor_neon)

	.align		6
ENTRY(chacha20_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i

	//
	// This function encrypts four consecutive ChaCha20 blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, and
	// hence requires no word shuffling. For the final XOR step we transpose
	// the matrix by interleaving 32- and then 64-bit words, which allows us
	// to do the XOR in NEON registers.
	//
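	// Register layout: vN holds word N of the state, with lane j
	// belonging to block j. All four lanes start out identical, except
	// that the counter word (x12) has the per-block increments from
	// CTRINC added to it below.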
	adr		x3, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x3]
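	// v30 = CTRINC (counter increments 0-3), v31 = ROT8 byte permutation
	// table; both fit in a single ld1 because ROT8 immediately follows
	// CTRINC in memory.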

	// x0..15[0-3] = s0..3[0..3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]
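	// ld4r loads a structure of four 32-bit words and broadcasts each
	// word to every lane of its destination register, so each of the 16
	// state words ends up replicated across its own vector.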

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	mov		x3, #10

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25

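	// The second half of the double round works on the diagonals of the
	// state simply by choosing the corresponding register combinations
	// (x0/x5/x10/x15 and so on), so no data shuffling is required.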
	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		x3, x3, #1
	b.ne		.Ldoubleround4

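	// Feed forward: reload the input state in word-broadcast form and add
	// it to each block. The counter increments are added to x12 again so
	// that every block's feed-forward uses its own per-block counter.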
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s

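	// At this point vN lane j holds word N of keystream block j. Transpose
	// via two rounds of zip1/zip2 so that v0-v3 end up holding block 0,
	// v4-v7 block 1, v8-v11 block 2 and v12-v15 block 3, matching the
	// layout of the input and output buffers.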
	// interleave 32-bit words in state n, n+1
	zip1		v16.4s, v0.4s, v1.4s
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s
	zip1		v20.4s, v4.4s, v5.4s
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s
	zip1		v24.4s, v8.4s, v9.4s
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s
	zip1		v28.4s, v12.4s, v13.4s
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	ld1		{v28.16b-v31.16b}, [x2]

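	// v0-v15 now hold the four keystream blocks and v16-v31 the four
	// input blocks, loaded in between the zip groups once the registers
	// they overwrite were no longer needed.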
	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha20_4block_xor_neon)

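	// CTRINC supplies the per-block counter offsets for the 4-block path.
	// ROT8 is a tbl index table selecting bytes (3, 0, 1, 2) of every
	// 32-bit word, i.e. a rotate left by 8 of each little-endian word.
	// The two constants must stay adjacent: chacha20_4block_xor_neon
	// loads both with a single ld1 from the CTRINC label.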
CTRINC:	.word		0, 1, 2, 3
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f