/**********************************************************************
  Copyright(c) 2020 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
	.arch	armv8.2-a
	.text
	.align	2
	.p2align 3,,7

.macro declare_var_vector_reg name:req,reg:req
	q\name\()	.req	q\reg
	v\name\()	.req	v\reg
	s\name\()	.req	s\reg
.endm
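
// For reference, "declare_var_vector_reg msg0,0" expands to:
//	qmsg0	.req	q0
//	vmsg0	.req	v0
//	smsg0	.req	s0
// giving 128-bit (q), vector (v) and 32-bit scalar (s) views of the
// same ASIMD register under one logical name.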

job0		.req	x0
job1		.req	x1
job2		.req	x2
job3		.req	x3
len		.req	x4

job0_data	.req	x5
job1_data	.req	x6
job2_data	.req	x7
job3_data	.req	x9

// x0-x3 are reused: once the prologue's post-incremented loads advance
// each job pointer past the 64-byte job header, they address the jobs'
// result digests
job0_digest	.req	x0
job1_digest	.req	x1
job2_digest	.req	x2
job3_digest	.req	x3
job0_tmp	.req	x10
job1_tmp	.req	x11
job2_tmp	.req	x12
job3_tmp	.req	x13
const_adr	.req	x14


declare_var_vector_reg	msg0,0
declare_var_vector_reg	msg1,1
declare_var_vector_reg	msg2,2
declare_var_vector_reg	msg3,3
declare_var_vector_reg	msg4,4
declare_var_vector_reg	msg5,5
declare_var_vector_reg	msg6,6
declare_var_vector_reg	msg7,7
declare_var_vector_reg	msg8,8
declare_var_vector_reg	msg9,9
declare_var_vector_reg	msg10,10
declare_var_vector_reg	msg11,11
declare_var_vector_reg	msg12,12
declare_var_vector_reg	msg13,13
declare_var_vector_reg	msg14,14
declare_var_vector_reg	msg15,15
declare_var_vector_reg	msg16,16


declare_var_vector_reg	dig_A,24
declare_var_vector_reg	dig_B,25
declare_var_vector_reg	dig_C,26
declare_var_vector_reg	dig_D,27
declare_var_vector_reg	dig_E,28
declare_var_vector_reg	dig_F,29
declare_var_vector_reg	dig_G,30
declare_var_vector_reg	dig_H,31

declare_var_vector_reg	TT1,17
declare_var_vector_reg	TT2,18
declare_var_vector_reg	SS1,19
declare_var_vector_reg	SS2,20
declare_var_vector_reg	tmp0,21
declare_var_vector_reg	word_pair,23
declare_var_vector_reg	Tj,22


.macro rol32 target:req,reg:req,bit:req
	ushr	v\target\().4s,v\reg\().4s,32 - \bit
	sli	v\target\().4s,v\reg\().4s,\bit
.endm
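
// ASIMD has no per-lane rotate instruction, so rol32 synthesizes
// ROTL32(reg,bit) in each 32-bit lane from a logical shift right by
// (32 - bit) followed by a shift-left-and-insert of the low bits;
// e.g. "rol32 SS1,TT1,7" expands to:
//	ushr	vSS1.4s,vTT1.4s,25
//	sli	vSS1.4s,vTT1.4s,7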

// rounds 0-11 use the XOR forms of FF/GG; this variant (rounds 0-3)
// also byte-swaps both the round word and the lookahead word wp
.macro sm3_round_0 round:req,wp:req

	ushr	vtmp0.4s,vdig_A.4s,32 - 12

	add	vSS1.4s,vdig_E.4s,vTj.4s
	sli	vtmp0.4s,vdig_A.4s,12
	rev32	vmsg\round\().16b,vmsg\round\().16b
	rev32	vmsg\wp\().16b,vmsg\wp\().16b
	add	vTT1.4s,vSS1.4s,vtmp0.4s		//SS1 Done
	rol32	SS1,TT1,7
	eor	vSS2.16b,vSS1.16b,vtmp0.16b		//SS2 Done
	eor	vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b

	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
	eor	vTT1.16b,vTT1.16b,vdig_C.16b
	eor	vTT2.16b,vTT2.16b,vdig_G.16b

	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
	add	vSS2.4s,vSS2.4s,vword_pair.4s
	add	vTT1.4s,vTT1.4s,vdig_D.4s
	add	vTT2.4s,vTT2.4s,vdig_H.4s
	ushr	vtmp0.4s,vTj.4s,32 - 1
	add	vTT1.4s,vTT1.4s,vSS2.4s			//TT1 Done
	sli	vtmp0.4s,vTj.4s,1
	add	vTT2.4s,vTT2.4s,vSS1.4s			//TT2 Done
	mov	vTj.16b,vtmp0.16b			//Tj = ROTL32(Tj,1)
	//D = C
	mov	vdig_D.16b,vdig_C.16b
	//C = ROTL32(B,9)
	ushr	vdig_C.4s,vdig_B.4s,32 - 9
	sli	vdig_C.4s,vdig_B.4s,9
	//B = A
	mov	vdig_B.16b,vdig_A.16b
	//A = TT1
	mov	vdig_A.16b,vTT1.16b
	//H = G
	mov	vdig_H.16b,vdig_G.16b
	//G = ROTL32(F,19)
	rol32	dig_G,dig_F,19
	//F = E
	mov	vdig_F.16b,vdig_E.16b
	//E = P0(TT2); TT1, SS1 and SS2 are free as scratch
	ushr	vSS2.4s,vTT2.4s,32 - 9
	ushr	vSS1.4s,vTT2.4s,32 - 17
	sli	vSS2.4s,vTT2.4s,9
	sli	vSS1.4s,vTT2.4s,17
	eor	vdig_E.16b,vTT2.16b,vSS1.16b
	eor	vdig_E.16b,vdig_E.16b,vSS2.16b

.endm
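
// For reference, the scalar form of one SM3 round for j = 0..15
// (GB/T 32905-2016), computed here in each of the four 32-bit lanes,
// one lane per job:
//	SS1 = ROTL32(ROTL32(A,12) + E + ROTL32(Tj,j), 7)
//	SS2 = SS1 ^ ROTL32(A,12)
//	TT1 = (A ^ B ^ C) + D + SS2 + (W[j] ^ W[j+4])
//	TT2 = (E ^ F ^ G) + H + SS1 + W[j]
//	D = C; C = ROTL32(B,9); B = A; A = TT1
//	H = G; G = ROTL32(F,19); F = E; E = P0(TT2)
// where P0(X) = X ^ ROTL32(X,9) ^ ROTL32(X,17), and the vTj register
// carries ROTL32(Tj,j), rotated left by one every round.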


// rounds 4-11: the round word was byte-swapped in rounds 0-3, only
// the lookahead word wp still needs it
.macro sm3_round_4 round:req,wp:req

	ushr	vtmp0.4s,vdig_A.4s,32 - 12
	add	vSS1.4s,vdig_E.4s,vTj.4s
	sli	vtmp0.4s,vdig_A.4s,12
	rev32	vmsg\wp\().16b,vmsg\wp\().16b
	add	vTT1.4s,vSS1.4s,vtmp0.4s		//SS1 Done
	rol32	SS1,TT1,7
	eor	vSS2.16b,vSS1.16b,vtmp0.16b		//SS2 Done
	eor	vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
	eor	vTT1.16b,vTT1.16b,vdig_C.16b
	eor	vTT2.16b,vTT2.16b,vdig_G.16b
	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
	add	vSS2.4s,vSS2.4s,vword_pair.4s
	add	vTT1.4s,vTT1.4s,vdig_D.4s
	add	vTT2.4s,vTT2.4s,vdig_H.4s
	ushr	vtmp0.4s,vTj.4s,32 - 1
	add	vTT1.4s,vTT1.4s,vSS2.4s			//TT1 Done
	sli	vtmp0.4s,vTj.4s,1
	add	vTT2.4s,vTT2.4s,vSS1.4s			//TT2 Done
	mov	vTj.16b,vtmp0.16b			//Tj = ROTL32(Tj,1)
	//D = C
	mov	vdig_D.16b,vdig_C.16b
	//C = ROTL32(B,9)
	ushr	vdig_C.4s,vdig_B.4s,32 - 9
	sli	vdig_C.4s,vdig_B.4s,9
	//B = A
	mov	vdig_B.16b,vdig_A.16b
	//A = TT1
	mov	vdig_A.16b,vTT1.16b
	//H = G
	mov	vdig_H.16b,vdig_G.16b
	//G = ROTL32(F,19)
	rol32	dig_G,dig_F,19
	//F = E
	mov	vdig_F.16b,vdig_E.16b
	//E = P0(TT2); TT1, SS1 and SS2 are free as scratch
	ushr	vSS2.4s,vTT2.4s,32 - 9
	ushr	vSS1.4s,vTT2.4s,32 - 17
	sli	vSS2.4s,vTT2.4s,9
	sli	vSS1.4s,vTT2.4s,17
	eor	vdig_E.16b,vTT2.16b,vSS1.16b
	eor	vdig_E.16b,vdig_E.16b,vSS2.16b

.endm

// rounds 12-15: XOR-form round fused with message expansion
.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
	rol32	msg\plus_4,msg\m2,15
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
	rol32	tmp0,msg\plus_4,15
	rol32	word_pair,msg\plus_4,23
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
	rol32	tmp0,msg\m3,7
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	ushr	vtmp0.4s,vdig_A.4s,32 - 12
	sli	vtmp0.4s,vdig_A.4s,12
	add	vSS1.4s,vdig_E.4s,vTj.4s
	add	vSS2.4s,vSS1.4s,vtmp0.4s		//SS1 Done
	rol32	SS1,SS2,7
	eor	vSS2.16b,vSS1.16b,vtmp0.16b		//SS2 Done
	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
	eor	vTT1.16b,vTT1.16b,vdig_C.16b
	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
	eor	vTT2.16b,vTT2.16b,vdig_G.16b
	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
	add	vSS2.4s,vSS2.4s,vword_pair.4s
	add	vTT1.4s,vTT1.4s,vdig_D.4s
	add	vTT2.4s,vTT2.4s,vdig_H.4s
	ushr	vtmp0.4s,vTj.4s,32 - 1
	add	vTT1.4s,vTT1.4s,vSS2.4s			//TT1 Done
	sli	vtmp0.4s,vTj.4s,1
	add	vTT2.4s,vTT2.4s,vSS1.4s			//TT2 Done
	mov	vTj.16b,vtmp0.16b			//Tj = ROTL32(Tj,1)
	//D = C
	mov	vdig_D.16b,vdig_C.16b
	//C = ROTL32(B,9)
	ushr	vdig_C.4s,vdig_B.4s,32 - 9
	sli	vdig_C.4s,vdig_B.4s,9
	//B = A
	mov	vdig_B.16b,vdig_A.16b
	//A = TT1
	mov	vdig_A.16b,vTT1.16b
	//H = G
	mov	vdig_H.16b,vdig_G.16b
	//G = ROTL32(F,19)
	rol32	dig_G,dig_F,19
	//F = E
	mov	vdig_F.16b,vdig_E.16b
	//E = P0(TT2); TT1, SS1 and SS2 are free as scratch
	ushr	vSS2.4s,vTT2.4s,32 - 9
	ushr	vSS1.4s,vTT2.4s,32 - 17
	sli	vSS2.4s,vTT2.4s,9
	sli	vSS1.4s,vTT2.4s,17
	eor	vdig_E.16b,vTT2.16b,vSS1.16b
	eor	vdig_E.16b,vdig_E.16b,vSS2.16b
.endm
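
// Message expansion reference for the fused rounds (n = round + 4;
// all W indices live in the 17-register ring msg0..msg16 and are
// supplied via m0..m4):
//	W[n]  = P1(W[n-16] ^ W[n-9] ^ ROTL32(W[n-3],15))
//	        ^ ROTL32(W[n-13],7) ^ W[n-6]
//	P1(X) = X ^ ROTL32(X,15) ^ ROTL32(X,23)
// e.g. "sm3_round_12 12,16, 0, 7,13, 3,10" performs round 12 and
// derives W16 from W0, W7, W13, W3 and W10.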

// rounds 16-62: majority/choose forms of FF/GG, fused with expansion
.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
	rol32	msg\plus_4,msg\m2,15
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
	rol32	tmp0,msg\plus_4,15
	rol32	word_pair,msg\plus_4,23
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
	rol32	tmp0,msg\m3,7
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	ushr	vtmp0.4s,vdig_A.4s,32 - 12
	sli	vtmp0.4s,vdig_A.4s,12
	add	vSS1.4s,vdig_E.4s,vTj.4s
	add	vSS2.4s,vSS1.4s,vtmp0.4s		//SS1 Done
	rol32	SS1,SS2,7
	eor	vSS2.16b,vSS1.16b,vtmp0.16b		//SS2 Done
	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
	mov	vTT2.16b,vdig_E.16b
	orr	vTT1.16b,vdig_B.16b,vdig_C.16b
	and	vtmp0.16b,vdig_B.16b,vdig_C.16b
	bsl	vTT2.16b,vdig_F.16b,vdig_G.16b		//GG Done
	and	vTT1.16b,vTT1.16b,vdig_A.16b
	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
	orr	vTT1.16b,vTT1.16b,vtmp0.16b		//FF Done
	add	vSS2.4s,vSS2.4s,vword_pair.4s
	add	vTT1.4s,vTT1.4s,vdig_D.4s
	add	vTT2.4s,vTT2.4s,vdig_H.4s
	ushr	vtmp0.4s,vTj.4s,32 - 1
	add	vTT1.4s,vTT1.4s,vSS2.4s			//TT1 Done
	sli	vtmp0.4s,vTj.4s,1
	add	vTT2.4s,vTT2.4s,vSS1.4s			//TT2 Done
	mov	vTj.16b,vtmp0.16b			//Tj = ROTL32(Tj,1)
	//D = C
	mov	vdig_D.16b,vdig_C.16b
	//C = ROTL32(B,9)
	ushr	vdig_C.4s,vdig_B.4s,32 - 9
	sli	vdig_C.4s,vdig_B.4s,9
	//B = A
	mov	vdig_B.16b,vdig_A.16b
	//A = TT1
	mov	vdig_A.16b,vTT1.16b
	//H = G
	mov	vdig_H.16b,vdig_G.16b
	//G = ROTL32(F,19)
	rol32	dig_G,dig_F,19
	//F = E
	mov	vdig_F.16b,vdig_E.16b
	//E = P0(TT2); TT1, SS1 and SS2 are free as scratch
	ushr	vSS2.4s,vTT2.4s,32 - 9
	ushr	vSS1.4s,vTT2.4s,32 - 17
	sli	vSS2.4s,vTT2.4s,9
	sli	vSS1.4s,vTT2.4s,17
	eor	vdig_E.16b,vTT2.16b,vSS1.16b
	eor	vdig_E.16b,vdig_E.16b,vSS2.16b
.endm
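
// Rounds 16..63 switch to the majority/choose boolean functions:
//	FF(A,B,C) = (A & B) | (A & C) | (B & C)
//	          = (A & (B | C)) | (B & C)	-> the orr/and/and/orr above
//	GG(E,F,G) = (E & F) | (~E & G)		-> a single bsl with E as mask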

// round 63: final round, fused with the digest feed-forward
.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
	rol32	msg\plus_4,msg\m2,15
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
	rol32	tmp0,msg\plus_4,15
	rol32	word_pair,msg\plus_4,23
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
	rol32	tmp0,msg\m3,7
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
	ushr	vtmp0.4s,vdig_A.4s,32 - 12
	sli	vtmp0.4s,vdig_A.4s,12
	add	vSS1.4s,vdig_E.4s,vTj.4s
	add	vSS2.4s,vSS1.4s,vtmp0.4s		//SS1 Done
	rol32	SS1,SS2,7
	eor	vSS2.16b,vSS1.16b,vtmp0.16b		//SS2 Done
	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b

	ldp	qmsg0,qmsg1,[sp,dig_off]
	mov	vTT2.16b,vdig_E.16b
	ldp	qmsg2,qmsg3,[sp,dig_off+32]
	orr	vTT1.16b,vdig_B.16b,vdig_C.16b
	ldp	qmsg4,qmsg5,[sp,dig_off+64]
	and	vtmp0.16b,vdig_B.16b,vdig_C.16b
	bsl	vTT2.16b,vdig_F.16b,vdig_G.16b		//GG Done
	ldp	qmsg6,qmsg7,[sp,dig_off+96]
	and	vTT1.16b,vTT1.16b,vdig_A.16b
	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
	orr	vTT1.16b,vTT1.16b,vtmp0.16b		//FF Done
	add	vSS2.4s,vSS2.4s,vword_pair.4s
	add	vTT1.4s,vTT1.4s,vdig_D.4s
	add	vTT2.4s,vTT2.4s,vdig_H.4s
	add	vTT1.4s,vTT1.4s,vSS2.4s			//TT1 Done
	add	vTT2.4s,vTT2.4s,vSS1.4s			//TT2 Done
	//D = C ^ saved D
	eor	vdig_D.16b,vdig_C.16b,vmsg3.16b
	//C = ROTL32(B,9) ^ saved C
	ushr	vdig_C.4s,vdig_B.4s,32 - 9
	sli	vdig_C.4s,vdig_B.4s,9
	eor	vdig_C.16b,vdig_C.16b,vmsg2.16b
	//B = A ^ saved B
	eor	vdig_B.16b,vdig_A.16b,vmsg1.16b
	stp	qdig_C,qdig_D,[sp,dig_off+32]
	//A = TT1 ^ saved A
	eor	vdig_A.16b,vTT1.16b,vmsg0.16b
	//H = G ^ saved H
	eor	vdig_H.16b,vdig_G.16b,vmsg7.16b
	stp	qdig_A,qdig_B,[sp,dig_off]
	//G = ROTL32(F,19) ^ saved G
	rol32	dig_G,dig_F,19
	eor	vdig_G.16b,vdig_G.16b,vmsg6.16b
	//F = E ^ saved F
	eor	vdig_F.16b,vdig_E.16b,vmsg5.16b
	stp	qdig_G,qdig_H,[sp,dig_off+96]
	//E = P0(TT2) ^ saved E; TT1, SS1 and SS2 are free as scratch
	ushr	vSS2.4s,vTT2.4s,32 - 9
	ushr	vSS1.4s,vTT2.4s,32 - 17
	sli	vSS2.4s,vTT2.4s,9
	sli	vSS1.4s,vTT2.4s,17
	eor	vdig_E.16b,vTT2.16b,vSS1.16b
	eor	vdig_E.16b,vdig_E.16b,vSS2.16b
	eor	vdig_E.16b,vdig_E.16b,vmsg4.16b
	stp	qdig_E,qdig_F,[sp,dig_off+64]
.endm
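
// SM3 chains blocks as V(i+1) = ABCDEFGH ^ V(i), so the final round
// reloads the digest saved at [sp,dig_off] into msg0..msg7 (free by
// this point), XORs it into the updated working variables, and stores
// the result back for the next 64-byte block.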

.set	dig_off, 80

#define STACK_SIZE 224
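
// Stack frame layout (STACK_SIZE = 224 bytes):
//	[sp,  0]	x29, x30
//	[sp, 16]	d8-d15 (callee-saved per AAPCS64)
//	[sp, 80]	8 x 16B digest vectors A..H (dig_off)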
	.global	sm3_mb_asimd_x4
	.type	sm3_mb_asimd_x4, %function
sm3_mb_asimd_x4:
	stp	x29,x30,[sp,-STACK_SIZE]!
	cmp	len,0
	//save callee-saved d8-d15; each post-incremented load advances
	//the job pointer past the 64-byte job header, leaving x0-x3
	//addressing the jobs' result digests
	ldr	job0_data,[job0],64
	stp	d8,d9,[sp,16]
	ldr	job1_data,[job1],64
	stp	d10,d11,[sp,32]
	ldr	job2_data,[job2],64
	stp	d12,d13,[sp,48]
	ldr	job3_data,[job3],64
	stp	d14,d15,[sp,64]
	ble	.exit_func

	mov	job0_tmp,job0_digest
	mov	job1_tmp,job1_digest
	mov	job2_tmp,job2_digest
	mov	job3_tmp,job3_digest
	//load digests
	ld4	{vdig_A.s-vdig_D.s}[0],[job0_tmp],16
	ld4	{vdig_A.s-vdig_D.s}[1],[job1_tmp],16
	ld4	{vdig_A.s-vdig_D.s}[2],[job2_tmp],16
	adrp	const_adr,.consts
	ld4	{vdig_A.s-vdig_D.s}[3],[job3_tmp],16
	add	const_adr,const_adr,#:lo12:.consts
	ld4	{vdig_E.s-vdig_H.s}[0],[job0_tmp]
	rev32	vdig_A.16b,vdig_A.16b
	ld4	{vdig_E.s-vdig_H.s}[1],[job1_tmp]
	rev32	vdig_B.16b,vdig_B.16b
	ld4	{vdig_E.s-vdig_H.s}[2],[job2_tmp]
	rev32	vdig_C.16b,vdig_C.16b
	ld4	{vdig_E.s-vdig_H.s}[3],[job3_tmp]
	rev32	vdig_D.16b,vdig_D.16b
	stp	qdig_A,qdig_B,[sp,dig_off]
	rev32	vdig_E.16b,vdig_E.16b
	rev32	vdig_F.16b,vdig_F.16b
	stp	qdig_C,qdig_D,[sp,dig_off+32]
	rev32	vdig_G.16b,vdig_G.16b
	rev32	vdig_H.16b,vdig_H.16b
	stp	qdig_E,qdig_F,[sp,dig_off+64]
	stp	qdig_G,qdig_H,[sp,dig_off+96]
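
	// ld4 de-interleaves the digest words, so lane i of
	// vdig_A..vdig_H holds job i's state; rev32 converts the
	// big-endian in-memory digests to native order, and a copy is
	// parked at [sp,dig_off] for the feed-forward in sm3_round_63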

.start_loop:
	ld4	{vmsg0.s-vmsg3.s}[0],[job0_data],16
	ld4	{vmsg0.s-vmsg3.s}[1],[job1_data],16
	ld4	{vmsg0.s-vmsg3.s}[2],[job2_data],16
	ld4	{vmsg0.s-vmsg3.s}[3],[job3_data],16
	ld4	{vmsg4.s-vmsg7.s}[0],[job0_data],16
	ld4	{vmsg4.s-vmsg7.s}[1],[job1_data],16
	ld4	{vmsg4.s-vmsg7.s}[2],[job2_data],16
	ld4	{vmsg4.s-vmsg7.s}[3],[job3_data],16
	ld4	{vmsg8.s-vmsg11.s}[0],[job0_data],16
	ldr	qTj,[const_adr]

	sm3_round_0	0, 4

	ld4	{vmsg8.s-vmsg11.s}[1],[job1_data],16
	sm3_round_0	1, 5

	ld4	{vmsg8.s-vmsg11.s}[2],[job2_data],16
	sm3_round_0	2, 6
	ld4	{vmsg8.s-vmsg11.s}[3],[job3_data],16
	sm3_round_0	3, 7

	ld4	{vmsg12.s-vmsg15.s}[0],[job0_data],16

	sm3_round_4	4, 8
	ld4	{vmsg12.s-vmsg15.s}[1],[job1_data],16
	sm3_round_4	5, 9
	ld4	{vmsg12.s-vmsg15.s}[2],[job2_data],16
	sm3_round_4	6,10
	ld4	{vmsg12.s-vmsg15.s}[3],[job3_data],16
	sm3_round_4	7,11
	sm3_round_4	8,12
	sm3_round_4	9,13
	sm3_round_4	10,14
	sm3_round_4	11,15

	sm3_round_12	12,16, 0, 7,13, 3,10	//12
	sm3_round_12	13, 0, 1, 8,14, 4,11	//13
	sm3_round_12	14, 1, 2, 9,15, 5,12	//14
	sm3_round_12	15, 2, 3,10,16, 6,13	//15

	ldr	qTj,[const_adr,16]
	sm3_round_16	16, 3, 4,11, 0, 7,14	//16
#if 0
	//debug aid: store job 0's digest (the lane-0 scalar views)
	//and exit after round 16
	stp	sdig_A,sdig_B,[job0_digest]
	stp	sdig_C,sdig_D,[job0_digest,8]
	stp	sdig_E,sdig_F,[job0_digest,16]
	stp	sdig_G,sdig_H,[job0_digest,24]
	b	.exit_func
#endif
	sm3_round_16	0, 4, 5,12, 1, 8,15	//17

	sm3_round_16	1, 5, 6,13, 2, 9,16	//18
	sm3_round_16	2, 6, 7,14, 3,10, 0	//19
	sm3_round_16	3, 7, 8,15, 4,11, 1	//20
	sm3_round_16	4, 8, 9,16, 5,12, 2	//21
	sm3_round_16	5, 9,10, 0, 6,13, 3	//22
	sm3_round_16	6,10,11, 1, 7,14, 4	//23
	sm3_round_16	7,11,12, 2, 8,15, 5	//24
	sm3_round_16	8,12,13, 3, 9,16, 6	//25
	sm3_round_16	9,13,14, 4,10, 0, 7	//26
	sm3_round_16	10,14,15, 5,11, 1, 8	//27
	sm3_round_16	11,15,16, 6,12, 2, 9	//28
	sm3_round_16	12,16, 0, 7,13, 3,10	//29
	sm3_round_16	13, 0, 1, 8,14, 4,11	//30
	sm3_round_16	14, 1, 2, 9,15, 5,12	//31
	sm3_round_16	15, 2, 3,10,16, 6,13	//32
	sm3_round_16	16, 3, 4,11, 0, 7,14	//33
	sm3_round_16	0, 4, 5,12, 1, 8,15	//34
	sm3_round_16	1, 5, 6,13, 2, 9,16	//35
	sm3_round_16	2, 6, 7,14, 3,10, 0	//36
	sm3_round_16	3, 7, 8,15, 4,11, 1	//37
	sm3_round_16	4, 8, 9,16, 5,12, 2	//38
	sm3_round_16	5, 9,10, 0, 6,13, 3	//39
	sm3_round_16	6,10,11, 1, 7,14, 4	//40
	sm3_round_16	7,11,12, 2, 8,15, 5	//41
	sm3_round_16	8,12,13, 3, 9,16, 6	//42
	sm3_round_16	9,13,14, 4,10, 0, 7	//43
	sm3_round_16	10,14,15, 5,11, 1, 8	//44
	sm3_round_16	11,15,16, 6,12, 2, 9	//45
	sm3_round_16	12,16, 0, 7,13, 3,10	//46
	sm3_round_16	13, 0, 1, 8,14, 4,11	//47
	sm3_round_16	14, 1, 2, 9,15, 5,12	//48
	sm3_round_16	15, 2, 3,10,16, 6,13	//49
	sm3_round_16	16, 3, 4,11, 0, 7,14	//50
	sm3_round_16	0, 4, 5,12, 1, 8,15	//51
	sm3_round_16	1, 5, 6,13, 2, 9,16	//52
	sm3_round_16	2, 6, 7,14, 3,10, 0	//53
	sm3_round_16	3, 7, 8,15, 4,11, 1	//54
	sm3_round_16	4, 8, 9,16, 5,12, 2	//55
	sm3_round_16	5, 9,10, 0, 6,13, 3	//56
	sm3_round_16	6,10,11, 1, 7,14, 4	//57
	sm3_round_16	7,11,12, 2, 8,15, 5	//58
	sm3_round_16	8,12,13, 3, 9,16, 6	//59
	sm3_round_16	9,13,14, 4,10, 0, 7	//60
	sm3_round_16	10,14,15, 5,11, 1, 8	//61
	sm3_round_16	11,15,16, 6,12, 2, 9	//62
	sm3_round_63	12,16, 0, 7,13, 3,10	//63

	subs	len,len,1
	bne	.start_loop

	//store digests back in big-endian byte order
	rev32	vdig_A.16b,vdig_A.16b
	rev32	vdig_B.16b,vdig_B.16b
	rev32	vdig_C.16b,vdig_C.16b
	rev32	vdig_D.16b,vdig_D.16b
	st4	{vdig_A.s-vdig_D.s}[0],[job0_digest],16
	rev32	vdig_E.16b,vdig_E.16b
	rev32	vdig_F.16b,vdig_F.16b
	st4	{vdig_A.s-vdig_D.s}[1],[job1_digest],16
	rev32	vdig_G.16b,vdig_G.16b
	rev32	vdig_H.16b,vdig_H.16b
	st4	{vdig_A.s-vdig_D.s}[2],[job2_digest],16
	st4	{vdig_A.s-vdig_D.s}[3],[job3_digest],16
	st4	{vdig_E.s-vdig_H.s}[0],[job0_digest]
	st4	{vdig_E.s-vdig_H.s}[1],[job1_digest]
	st4	{vdig_E.s-vdig_H.s}[2],[job2_digest]
	st4	{vdig_E.s-vdig_H.s}[3],[job3_digest]

.exit_func:
	ldp	d8,d9,[sp,16]
	ldp	d10,d11,[sp,32]
	ldp	d12,d13,[sp,48]
	ldp	d14,d15,[sp,64]
	ldp	x29,x30,[sp],STACK_SIZE
	ret
.consts:
	.word	0x79cc4519
	.word	0x79cc4519
	.word	0x79cc4519
	.word	0x79cc4519
	.word	0x9d8a7a87
	.word	0x9d8a7a87
	.word	0x9d8a7a87
	.word	0x9d8a7a87
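// The two 4-lane Tj seeds above: 0x79cc4519 is T(j) for rounds 0-15
// (loaded at .start_loop), and 0x9d8a7a87 = ROTL32(0x7a879d8a,16) is
// ROTL32(T(j),j) at round 16 (loaded just before that round), after
// which the per-round ROTL32(vTj,1) keeps it in step.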
	.size	sm3_mb_asimd_x4, .-sm3_mb_asimd_x4