1 /**********************************************************************
2 Copyright(c) 2020 Arm Corporation All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 * Neither the name of Arm Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
// declare_var_vector_reg name, reg
//   name : symbolic variable name (both arguments are required)
//   reg  : SIMD register number to bind the name to
// NOTE(review): the macro body is not visible in this view. The rest of the
// file refers to every declared name with a _q, _v or _s suffix (for example
// lane0_abcd_q, lane0_abcd_v, lane0_abcd_s), so the macro presumably creates
// those three aliases for register number reg -- confirm against the body.
38 .macro declare_var_vector_reg name:req,reg:req
// Message schedule: four lanes x four registers (msg_0..msg_3), numbers 0-15.
50 declare_var_vector_reg lane0_msg_0, 0
51 declare_var_vector_reg lane1_msg_0, 1
52 declare_var_vector_reg lane2_msg_0, 2
53 declare_var_vector_reg lane3_msg_0, 3
54 declare_var_vector_reg lane0_msg_1, 4
55 declare_var_vector_reg lane1_msg_1, 5
56 declare_var_vector_reg lane2_msg_1, 6
57 declare_var_vector_reg lane3_msg_1, 7
58 declare_var_vector_reg lane0_msg_2, 8
59 declare_var_vector_reg lane1_msg_2, 9
60 declare_var_vector_reg lane2_msg_2,10
61 declare_var_vector_reg lane3_msg_2,11
62 declare_var_vector_reg lane0_msg_3,12
63 declare_var_vector_reg lane1_msg_3,13
64 declare_var_vector_reg lane2_msg_3,14
65 declare_var_vector_reg lane3_msg_3,15
// Per-lane hash state (abcd) and two per-lane scratch sets (tmp0, tmp1),
// numbers 16-27.
67 declare_var_vector_reg lane0_abcd ,16
68 declare_var_vector_reg lane1_abcd ,17
69 declare_var_vector_reg lane2_abcd ,18
70 declare_var_vector_reg lane3_abcd ,19
71 declare_var_vector_reg lane0_tmp0 ,20
72 declare_var_vector_reg lane1_tmp0 ,21
73 declare_var_vector_reg lane2_tmp0 ,22
74 declare_var_vector_reg lane3_tmp0 ,23
75 declare_var_vector_reg lane0_tmp1 ,24
76 declare_var_vector_reg lane1_tmp1 ,25
77 declare_var_vector_reg lane2_tmp1 ,26
78 declare_var_vector_reg lane3_tmp1 ,27
// Variables shared by all lanes, numbers 28-31: e0 and e1 each pack one
// 32-bit value per lane in elements 0..3; key is added to the message words;
// tmp is declared here but not referenced in the visible part of the file.
81 declare_var_vector_reg e0 ,28
82 declare_var_vector_reg e1 ,29
83 declare_var_vector_reg key ,30
84 declare_var_vector_reg tmp ,31
// sha1_4_rounds: issues one round instruction per lane (lane0..lane3) and
// advances the message schedule for the following steps. The callers label
// each invocation as four SHA-1 rounds.
//   inst       : round instruction to emit (callers pass sha1c, sha1p or sha1m)
//   msg0..msg3 : suffixes selecting the per-lane schedule registers
//   abcd       : suffix of the per-lane hash state register
//   e0         : packed vector that receives the four new sha1h results
//   tmp0       : per-lane scratch suffix; element 0 feeds the round
//                instruction, then the registers are reloaded from memory
//   e1         : packed vector supplying the value moved into tmp0 element 0
//   tmp1       : per-lane suffix used as the 4s operand of the round
//                instruction, then overwritten with msg3 + key and stored
// Also reads key_v and the address registers named <tmp0>_adr / <tmp1>_adr.
// NOTE(review): the tmp1 operand is presumably the schedule-plus-constant
// vector prepared by the previous step -- confirm with the callers.
97 .macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req
// sha1h (SHA1 fixed rotate) of element 0 of each lane state, result in tmp0
98 sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s
99 sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s
100 sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s
101 sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s
// gather the four sha1h results into elements 0..3 of the e0 vector
102 mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0]
103 mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0]
104 mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0]
105 mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0]
// scatter elements 0..3 of the e1 vector into element 0 of each lane tmp0
106 mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0]
107 mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1]
108 mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2]
109 mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3]
// round instruction per lane: state in abcd_q, scalar from tmp0_s,
// 4s operand from tmp1
110 \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s
111 \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s
112 \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s
113 \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s
// reload the four tmp0 vectors (4 x 16 bytes) from the buffer at tmp0_adr
114 ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr]
// tmp1 = msg3 + key for every lane, then store the four vectors at tmp1_adr
115 add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s
116 add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s
117 add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s
118 add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s
119 st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr]
// schedule update 1: msg0 is updated in place from msg3
120 sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s
121 sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s
122 sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s
123 sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s
// schedule update 0: msg1 is updated in place from msg2 and msg3
124 sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s
125 sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s
126 sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s
127 sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s
133 void mh_sha1_block_ce(const uint8_t * input_data,
134 uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
135 uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
// mh_sha1_block_ce: body of the exported function. Four lanes are processed
// together, each lane taking four consecutive 32-bit words per load step.
// NOTE(review): the register aliases used below (input_data, digests,
// frame_buffer, num_blocks, msg_adr, digest_adr, block_cnt, offs, tmp0_adr,
// tmp1_adr), the entry label, the loop labels and branches, the loads of
// key_v and the return instruction are not visible in this view.
145 .global mh_sha1_block_ce
146 .type mh_sha1_block_ce, %function
148 //save temp vector registers
// Allocate a 128-byte frame and save d8-d15 at offsets 0..63. These are the
// low halves of registers 8-15, which are used here for msg_2 and msg_3.
149 stp d8, d9, [sp, -128]!
151 stp d10, d11, [sp, 16]
152 stp d12, d13, [sp, 32]
153 stp d14, d15, [sp, 48]
// Two spill areas inside frame_buffer: tmp0_adr at offset 0 and tmp1_adr at
// offset 128. Each st1/ld1 below transfers four 16-byte vectors (64 bytes).
154 mov tmp0_adr,frame_buffer
155 add tmp1_adr,tmp0_adr,128
160 mov msg_adr,input_data
// Load the message. Each ld4 reads four consecutive 32-bit words at msg_adr
// into element [i] of the four lane registers (one word per lane), then
// advances msg_adr by offs. After four loads each lane register holds four
// words taken offs bytes apart.
165 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs
166 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs
167 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs
168 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs
170 ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs
171 ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs
172 ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs
173 ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs
175 ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs
176 ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs
177 ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs
178 ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs
180 ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs
181 ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs
182 ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs
183 ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs
// Load the current digest for these four lanes starting at digests +
// block_cnt: four rows go into elements 0..3 of each lane abcd register, and
// the next 16 bytes (one word per lane) go into e0.
// NOTE(review): presumably offs equals the byte stride of one digests row --
// confirm where offs is set.
185 add digest_adr,digests,block_cnt
186 ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
187 ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
188 ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
189 ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
190 ldr e0_q,[digest_adr]
// Reverse the byte order inside every 32-bit message word.
195 rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b
196 rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b
197 rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b
198 rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b
199 rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b
200 rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b
201 rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b
202 rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b
203 rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b
204 rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b
205 rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b
206 rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b
207 rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b
208 rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b
209 rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b
210 rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b
// tmp1 = msg_1 + key, spilled to tmp1_adr because the tmp1 registers are
// reused as scratch by the first round block below.
212 add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s
213 add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s
214 add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s
215 add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s
216 st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
// tmp0 = msg_0 + key, the 4s operand of the first sha1c below.
218 add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s
219 add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s
220 add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s
221 add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s
// First round block (rounds 0-3, going by the labels on the macro calls
// below), written out by hand with the same shape as sha1_4_rounds:
// sha1h results are packed into e1, and the e0 elements feed sha1c.
224 sha1h lane0_tmp1_s,lane0_abcd_s
225 sha1h lane1_tmp1_s,lane1_abcd_s
226 sha1h lane2_tmp1_s,lane2_abcd_s
227 sha1h lane3_tmp1_s,lane3_abcd_s
228 mov e1_v.S[0],lane0_tmp1_v.S[0]
229 mov e1_v.S[1],lane1_tmp1_v.S[0]
230 mov e1_v.S[2],lane2_tmp1_v.S[0]
231 mov e1_v.S[3],lane3_tmp1_v.S[0]
232 mov lane0_tmp1_v.S[0],e0_v.S[0]
233 mov lane1_tmp1_v.S[0],e0_v.S[1]
234 mov lane2_tmp1_v.S[0],e0_v.S[2]
235 mov lane3_tmp1_v.S[0],e0_v.S[3]
236 sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s
237 sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s
238 sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s
239 sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s
// Reload the spilled msg_1 + key into tmp1, compute tmp0 = msg_2 + key and
// spill it to tmp0_adr, and start the schedule update of msg_0.
240 ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
241 add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s
242 sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s
243 add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s
244 sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s
245 add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s
246 sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s
247 add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s
248 sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s
249 st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr]
// Sixteen macro steps. The msg_0..msg_3 arguments rotate by one position per
// step, and the (e0,tmp0) / (e1,tmp1) pairs swap roles on every step.
// NOTE(review): key_v is presumably reloaded between some of these steps on
// lines not visible here -- confirm.
251 sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */
252 sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 /* rounds 8-11 */
257 sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */
258 sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 /* rounds 16-19 */
259 sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */
260 sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 /* rounds 24-27 */
261 sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 28-31 */
265 sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 /* rounds 32-35 */
266 sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */
267 sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 /* rounds 40-43 */
268 sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 44-47 */
269 sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 /* rounds 48-51 */
273 sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */
274 sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 /* rounds 56-59 */
275 sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 60-63 */
276 sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 /* rounds 64-67 */
// Last three round blocks (68-79 by the numbering above), written out by
// hand. The msg_1 and msg_2 registers are reused as scalar scratch here.
278 //msg2 and msg1 are free
// unpack the e1 elements into element 0 of each lane msg_2
279 mov lane0_msg_2_v.S[0],e1_v.S[0]
280 mov lane1_msg_2_v.S[0],e1_v.S[1]
281 mov lane2_msg_2_v.S[0],e1_v.S[2]
282 mov lane3_msg_2_v.S[0],e1_v.S[3]
// rounds 68-71: sha1h result kept in msg_1_s, sha1p uses msg_2_s and tmp1
285 sha1h lane0_msg_1_s,lane0_abcd_s
286 sha1h lane1_msg_1_s,lane1_abcd_s
287 sha1h lane2_msg_1_s,lane2_abcd_s
288 sha1h lane3_msg_1_s,lane3_abcd_s
289 sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
290 sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
291 sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
292 sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
// tmp1 = msg_3 + key, consumed by the last sha1p group below
293 add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s
294 add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s
295 add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s
296 add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s
297 sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s
298 sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s
299 sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s
300 sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s
// rounds 72-75: sha1h result kept in msg_2_s, sha1p uses msg_1_s and tmp0
303 sha1h lane0_msg_2_s,lane0_abcd_s
304 sha1h lane1_msg_2_s,lane1_abcd_s
305 sha1h lane2_msg_2_s,lane2_abcd_s
306 sha1h lane3_msg_2_s,lane3_abcd_s
307 sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s
308 sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s
309 sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s
310 sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s
// rounds 76-79: the sha1h result left in msg_1_s is the value added to the
// fifth digest word below
313 sha1h lane0_msg_1_s,lane0_abcd_s
314 sha1h lane1_msg_1_s,lane1_abcd_s
315 sha1h lane2_msg_1_s,lane2_abcd_s
316 sha1h lane3_msg_1_s,lane3_abcd_s
317 sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
318 sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
319 sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
320 sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
// Reload the previous digest: the first four rows go into msg_0, and the
// fifth row goes into element 0 of each lane msg_3.
321 add digest_adr,digests,block_cnt
322 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs
323 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs
324 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs
325 ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs
326 ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr]
// Add the previous digest to the new state (abcd rows).
328 add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S
329 add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S
330 add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S
331 add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S
// Same for the fifth word; only element 0 of each msg_1 is stored below.
333 add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S
334 add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S
335 add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S
336 add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S
// Write the updated digest back to the same five rows it was loaded from.
338 add digest_adr,digests,block_cnt
339 st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
340 st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
341 st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
342 st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
343 st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr]
// Step to the next group of four lanes: 16 bytes is the four 32-bit words
// consumed by one ld4. The compare and branch that close this inner loop
// are not visible in this view.
345 add block_cnt,block_cnt,16
347 add msg_adr,input_data,block_cnt
348 add digest_adr,digests,block_cnt
// One input block is finished: decrement the block counter (subs sets the
// flags) and advance input_data by 1024 bytes. The branch that consumes the
// flags is not visible in this view.
351 subs num_blocks,num_blocks,1
352 add input_data,input_data,1024
355 //restore temp register
// Reload d8-d15 from the frame and release the 128 bytes with the final
// post-indexed ldp.
356 ldp d10, d11, [sp, 16]
357 ldp d12, d13, [sp, 32]
358 ldp d14, d15, [sp, 48]
359 ldp d8, d9, [sp], 128
362 .size mh_sha1_block_ce, .-mh_sha1_block_ce
363 .section .rodata.cst16,"aM",@progbits,16