arch/arm/crypto/ghash-ce-core.S
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	.text
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
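	/*
	 * For reference, the operation emulated below is a plain
	 * 64x64 -> 128 bit carry-less (polynomial) multiplication. A rough
	 * C sketch of that operation, for illustration only (clmul64() is
	 * not a kernel helper, just a name used here):
	 *
	 *	#include <linux/types.h>
	 *
	 *	static void clmul64(u64 a, u64 b, u64 r[2])
	 *	{
	 *		int i;
	 *
	 *		r[0] = r[1] = 0;
	 *		for (i = 0; i < 64; i++) {
	 *			if ((b >> i) & 1) {
	 *				r[0] ^= a << i;		// low 64 bits
	 *				if (i)
	 *					r[1] ^= a >> (64 - i); // high 64 bits
	 *			}
	 *		}
	 *	}
	 *
	 * The macro below produces the same 128-bit product using eight
	 * vmull.p8 (8x8 -> 16) instructions plus shifts, masks and XORs,
	 * as described in the paper referenced above.
	 */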
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
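	// GHASH works in GF(2^128) defined by the polynomial
	// g(x) = x^128 + x^7 + x^2 + x + 1, so a 256-bit product
	// c1*x^128 + c0 folds back as c0 + c1*(x^7 + x^2 + x + 1). In the
	// bit order used by GHASH this fold amounts to two carry-less
	// multiplications by MASK, i.e. 0xe1 shifted left by 57
	// (0xc2 << 56), which is set up in pmull_ghash_update_p64 below.
	//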
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
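	// Same fold as above, but the multiplications by the reduction
	// constant are replaced with shift-and-XOR sequences: left shifts
	// by 57, 62 and 63, and (effectively) right shifts by 1, 2 and 7
	// for the bits that cross the 64-bit halves.
	//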
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

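	//
	// Per-block GHASH update: for each 16-byte block X the digest
	// becomes dg = (dg ^ X) * H in GF(2^128). The 128x128 bit
	// multiplication is done Karatsuba-style with three 64x64
	// multiplications,
	//
	//	a1*b1, a0*b0 and (a1 + a0)*(b1 + b0),
	//
	// where the middle product yields the cross terms a1*b0 + a0*b1
	// once a1*b1 and a0*b0 have been XORed back into it.
	//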
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
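	/*
	 * Per AAPCS, blocks/dg/src/k arrive in r0-r3; the fifth argument,
	 * head, is passed on the stack and is loaded from [sp] at the top
	 * of the ghash_update macro.
	 */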
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

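	/*
	 * Precompute byte-rotated copies of the two key halves (the B1..B4
	 * operands used by __pmull_p8) and the 16/32/48-bit masks, so the
	 * per-block multiplications by the key do not have to regenerate
	 * them.
	 */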
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)