/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

	.text
	.align	6

ENTRY(chacha20_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i

	//
	// This function encrypts one ChaCha20 block by loading the state
	// matrix into four NEON registers. It performs the matrix operations
	// on four words in parallel, but requires shuffling to rearrange the
	// words after each round.
	//

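	// For reference, the RFC7539 quarter-round on words (a, b, c, d),
	// which the per-step comments below spell out:
	//
	//	a += b; d ^= a; d = rotl32(d, 16);
	//	c += d; b ^= c; b = rotl32(b, 12);
	//	a += b; d ^= a; d = rotl32(d, 8);
	//	c += d; b ^= c; b = rotl32(b, 7);
	//
	// Here each of a..d is a whole NEON register (one row of the state),
	// so four quarter-rounds run in parallel: columns in the first half
	// of the double round, diagonals in the second. The rotates are
	// implemented with rev32 (by 16), shl+sri pairs (by 12 and 7) and a
	// tbl byte permutation (by 8).
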
	// x0..3 = s0..3
	adr	x3, ROT8
	ld1	{v0.4s-v3.4s}, [x0]
	ld1	{v8.4s-v11.4s}, [x0]
	ld1	{v12.4s}, [x3]
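
	// v0-v3 hold the working copy of the state, v8-v11 keep the original
	// state for the final addition, and v12 holds the ROT8 byte
	// permutation used by tbl to rotate each 32-bit word left by 8 bits.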

	mov	x3, #10

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext	v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext	v3.16b, v3.16b, v3.16b, #12
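
	// The ext instructions above rotate the words of rows 1-3 so that
	// the second half of the double round operates on the diagonals of
	// the state matrix; the mirrored ext sequence at the end of the loop
	// restores column order.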

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext	v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext	v3.16b, v3.16b, v3.16b, #4

	subs	x3, x3, #1
	b.ne	.Ldoubleround

	ld1	{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add	v0.4s, v0.4s, v8.4s
	eor	v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add	v1.4s, v1.4s, v9.4s
	eor	v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add	v2.4s, v2.4s, v10.4s
	eor	v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add	v3.4s, v3.4s, v11.4s
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1]

	ret
ENDPROC(chacha20_block_xor_neon)

	.align	6
ENTRY(chacha20_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i

	//
	// This function encrypts four consecutive ChaCha20 blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. For the final XORing step
	// we transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do the XOR in NEON registers.
	//
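	// For reference, one double round in this layout consists of the
	// column quarter-rounds on (x0,x4,x8,x12) .. (x3,x7,x11,x15),
	// followed by the diagonal quarter-rounds on (x0,x5,x10,x15),
	// (x1,x6,x11,x12), (x2,x7,x8,x13) and (x3,x4,x9,x14), exactly as
	// the per-step comments below spell out.
	//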
	adr	x3, CTRINC		// ... and ROT8
	ld1	{v30.4s-v31.4s}, [x3]

	// x0..15[0-3] = s0..3[0..3]
	mov	x4, x0
	ld4r	{ v0.4s- v3.4s}, [x4], #16
	ld4r	{ v4.4s- v7.4s}, [x4], #16
	ld4r	{ v8.4s-v11.4s}, [x4], #16
	ld4r	{v12.4s-v15.4s}, [x4]
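
	// ld4r loads a four-element structure (four consecutive 32-bit
	// words) and replicates each element across all lanes of one
	// destination register, so after the four loads above register vN
	// holds state word N in all four lanes, one lane per block.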

	// x12 += counter values 0-3
	add	v12.4s, v12.4s, v30.4s

	mov	x3, #10

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add	v0.4s, v0.4s, v4.4s
	add	v1.4s, v1.4s, v5.4s
	add	v2.4s, v2.4s, v6.4s
	add	v3.4s, v3.4s, v7.4s

	eor	v12.16b, v12.16b, v0.16b
	eor	v13.16b, v13.16b, v1.16b
	eor	v14.16b, v14.16b, v2.16b
	eor	v15.16b, v15.16b, v3.16b

	rev32	v12.8h, v12.8h
	rev32	v13.8h, v13.8h
	rev32	v14.8h, v14.8h
	rev32	v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add	v8.4s, v8.4s, v12.4s
	add	v9.4s, v9.4s, v13.4s
	add	v10.4s, v10.4s, v14.4s
	add	v11.4s, v11.4s, v15.4s

	eor	v16.16b, v4.16b, v8.16b
	eor	v17.16b, v5.16b, v9.16b
	eor	v18.16b, v6.16b, v10.16b
	eor	v19.16b, v7.16b, v11.16b

	shl	v4.4s, v16.4s, #12
	shl	v5.4s, v17.4s, #12
	shl	v6.4s, v18.4s, #12
	shl	v7.4s, v19.4s, #12

	sri	v4.4s, v16.4s, #20
	sri	v5.4s, v17.4s, #20
	sri	v6.4s, v18.4s, #20
	sri	v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add	v0.4s, v0.4s, v4.4s
	add	v1.4s, v1.4s, v5.4s
	add	v2.4s, v2.4s, v6.4s
	add	v3.4s, v3.4s, v7.4s

	eor	v12.16b, v12.16b, v0.16b
	eor	v13.16b, v13.16b, v1.16b
	eor	v14.16b, v14.16b, v2.16b
	eor	v15.16b, v15.16b, v3.16b

	tbl	v12.16b, {v12.16b}, v31.16b
	tbl	v13.16b, {v13.16b}, v31.16b
	tbl	v14.16b, {v14.16b}, v31.16b
	tbl	v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add	v8.4s, v8.4s, v12.4s
	add	v9.4s, v9.4s, v13.4s
	add	v10.4s, v10.4s, v14.4s
	add	v11.4s, v11.4s, v15.4s

	eor	v16.16b, v4.16b, v8.16b
	eor	v17.16b, v5.16b, v9.16b
	eor	v18.16b, v6.16b, v10.16b
	eor	v19.16b, v7.16b, v11.16b

	shl	v4.4s, v16.4s, #7
	shl	v5.4s, v17.4s, #7
	shl	v6.4s, v18.4s, #7
	shl	v7.4s, v19.4s, #7

	sri	v4.4s, v16.4s, #25
	sri	v5.4s, v17.4s, #25
	sri	v6.4s, v18.4s, #25
	sri	v7.4s, v19.4s, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add	v0.4s, v0.4s, v5.4s
	add	v1.4s, v1.4s, v6.4s
	add	v2.4s, v2.4s, v7.4s
	add	v3.4s, v3.4s, v4.4s

	eor	v15.16b, v15.16b, v0.16b
	eor	v12.16b, v12.16b, v1.16b
	eor	v13.16b, v13.16b, v2.16b
	eor	v14.16b, v14.16b, v3.16b

	rev32	v15.8h, v15.8h
	rev32	v12.8h, v12.8h
	rev32	v13.8h, v13.8h
	rev32	v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add	v10.4s, v10.4s, v15.4s
	add	v11.4s, v11.4s, v12.4s
	add	v8.4s, v8.4s, v13.4s
	add	v9.4s, v9.4s, v14.4s

	eor	v16.16b, v5.16b, v10.16b
	eor	v17.16b, v6.16b, v11.16b
	eor	v18.16b, v7.16b, v8.16b
	eor	v19.16b, v4.16b, v9.16b

	shl	v5.4s, v16.4s, #12
	shl	v6.4s, v17.4s, #12
	shl	v7.4s, v18.4s, #12
	shl	v4.4s, v19.4s, #12

	sri	v5.4s, v16.4s, #20
	sri	v6.4s, v17.4s, #20
	sri	v7.4s, v18.4s, #20
	sri	v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add	v0.4s, v0.4s, v5.4s
	add	v1.4s, v1.4s, v6.4s
	add	v2.4s, v2.4s, v7.4s
	add	v3.4s, v3.4s, v4.4s

	eor	v15.16b, v15.16b, v0.16b
	eor	v12.16b, v12.16b, v1.16b
	eor	v13.16b, v13.16b, v2.16b
	eor	v14.16b, v14.16b, v3.16b

	tbl	v15.16b, {v15.16b}, v31.16b
	tbl	v12.16b, {v12.16b}, v31.16b
	tbl	v13.16b, {v13.16b}, v31.16b
	tbl	v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add	v10.4s, v10.4s, v15.4s
	add	v11.4s, v11.4s, v12.4s
	add	v8.4s, v8.4s, v13.4s
	add	v9.4s, v9.4s, v14.4s

	eor	v16.16b, v5.16b, v10.16b
	eor	v17.16b, v6.16b, v11.16b
	eor	v18.16b, v7.16b, v8.16b
	eor	v19.16b, v4.16b, v9.16b

	shl	v5.4s, v16.4s, #7
	shl	v6.4s, v17.4s, #7
	shl	v7.4s, v18.4s, #7
	shl	v4.4s, v19.4s, #7

	sri	v5.4s, v16.4s, #25
	sri	v6.4s, v17.4s, #25
	sri	v7.4s, v18.4s, #25
	sri	v4.4s, v19.4s, #25

	subs	x3, x3, #1
	b.ne	.Ldoubleround4

	ld4r	{v16.4s-v19.4s}, [x0], #16
	ld4r	{v20.4s-v23.4s}, [x0], #16

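	// The final state addition below uses the freshly reloaded,
	// un-incremented s3[0], so the per-lane counter increments are
	// applied once more here, while v30 still holds CTRINC (it is
	// overwritten with s3 words shortly afterwards).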
	// x12 += counter values 0-3
	add	v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s

	ld4r	{v24.4s-v27.4s}, [x0], #16
	ld4r	{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v26.4s
	add	v11.4s, v11.4s, v27.4s

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add	v12.4s, v12.4s, v28.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

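	// At this point vN still holds word N of all four blocks, one block
	// per lane. The two zip passes below transpose this layout: first
	// 32-bit words of neighbouring registers are interleaved, then
	// 64-bit halves, after which each register holds 16 consecutive
	// bytes of a single block, ready to be XORed with the input.
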
	// interleave 32-bit words in state n, n+1
	zip1	v16.4s, v0.4s, v1.4s
	zip2	v17.4s, v0.4s, v1.4s
	zip1	v18.4s, v2.4s, v3.4s
	zip2	v19.4s, v2.4s, v3.4s
	zip1	v20.4s, v4.4s, v5.4s
	zip2	v21.4s, v4.4s, v5.4s
	zip1	v22.4s, v6.4s, v7.4s
	zip2	v23.4s, v6.4s, v7.4s
	zip1	v24.4s, v8.4s, v9.4s
	zip2	v25.4s, v8.4s, v9.4s
	zip1	v26.4s, v10.4s, v11.4s
	zip2	v27.4s, v10.4s, v11.4s
	zip1	v28.4s, v12.4s, v13.4s
	zip2	v29.4s, v12.4s, v13.4s
	zip1	v30.4s, v14.4s, v15.4s
	zip2	v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1	v0.2d, v16.2d, v18.2d
	zip2	v4.2d, v16.2d, v18.2d
	zip1	v8.2d, v17.2d, v19.2d
	zip2	v12.2d, v17.2d, v19.2d
	ld1	{v16.16b-v19.16b}, [x2], #64

	zip1	v1.2d, v20.2d, v22.2d
	zip2	v5.2d, v20.2d, v22.2d
	zip1	v9.2d, v21.2d, v23.2d
	zip2	v13.2d, v21.2d, v23.2d
	ld1	{v20.16b-v23.16b}, [x2], #64

	zip1	v2.2d, v24.2d, v26.2d
	zip2	v6.2d, v24.2d, v26.2d
	zip1	v10.2d, v25.2d, v27.2d
	zip2	v14.2d, v25.2d, v27.2d
	ld1	{v24.16b-v27.16b}, [x2], #64

	zip1	v3.2d, v28.2d, v30.2d
	zip2	v7.2d, v28.2d, v30.2d
	zip1	v11.2d, v29.2d, v31.2d
	zip2	v15.2d, v29.2d, v31.2d
	ld1	{v28.16b-v31.16b}, [x2]
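
	// After the transpose, v0-v3 hold the 64 keystream bytes of the
	// first block, v4-v7 the second, v8-v11 the third and v12-v15 the
	// fourth, while v16-v31 hold the 256 bytes of input loaded above.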

	// xor with corresponding input, write to output
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	st1	{v16.16b-v19.16b}, [x1], #64
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	st1	{v20.16b-v23.16b}, [x1], #64
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	st1	{v24.16b-v27.16b}, [x1], #64
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b
	st1	{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha20_4block_xor_neon)

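	// CTRINC is added to the counter word (x12) of the 4-block path so
	// that the four lanes process consecutive block counters. ROT8 holds
	// the tbl byte indices that rotate every 32-bit lane left by 8 bits.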
CTRINC:	.word	0, 1, 2, 3
ROT8:	.word	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f