/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
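/*
 * Illustrative sketch only (not the kernel's generic C version; the
 * pseudo-code below is made up to explain the flow):
 *
 *	acc = sum;                         64-bit accumulator, carries kept via adde
 *	acc += a short halfword prologue to reach 8-byte alignment;
 *	acc += every aligned 64-bit word of buff;
 *	acc += the 4/2/1-byte tail, the last byte padded to 16 bits on BE;
 *	return high 32 bits of (acc + rotl64(acc, 32));    64->32 bit fold
 */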
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

/*
 * If only halfword aligned, align to a double word. Since odd
 * aligned addresses should be rare and they would require more
 * work to calculate the correct checksum, we ignore that case
 * and take the potential slowdown of unaligned loads.
 */
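/*
 * Worked example (illustrative): if buff sits at a doubleword offset of
 * 6 bytes, then (buff >> 1) & 0x3 = 3, so 4 - 3 = 1 halfword is summed
 * here before the 8-byte aligned code below takes over.
 */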
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
/*
 * We unroll the loop such that each iteration is 64 bytes with an
 * entry and exit limb of 64 bytes, meaning a minimum size of
 * 128 bytes.
 */
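/*
 * Worked example (illustrative): for len = 256, ctr is set to
 * 256/64 - 1 = 3, so the bdnz loop below sums three 64-byte chunks and
 * the exit limb after the loop sums the fourth; the first 32 bytes are
 * preloaded before the loop is entered.
 */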
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

/*
 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 * because of the XER dependency. This means the fastest this loop can
 * go is 16 cycles per iteration. The scheduling of the loop below has
 * been shown to hit this on both POWER6 and POWER7.
 */
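/*
 * Arithmetic behind that figure (illustrative): each iteration sums 8
 * doublewords with 8 adde instructions, and with every adde serialised
 * on the carry in XER at 2 cycles apiece, the carry chain alone costs
 * 8 * 2 = 16 cycles per 64-byte iteration.
 */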
        .align  5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

.Lcsum_finish:
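/*
 * Illustrative note: addze folds the last pending carry into r0, then the
 * rotate/add/shift sequence below adds the high and low 32-bit halves of
 * the 64-bit accumulator with end-around carry, leaving the 32-bit partial
 * checksum in r3.
 */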
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)

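/*
 * Each macro below plants a local label plus an EX_TABLE entry so that a
 * fault on the load or store immediately following its use is diverted to
 * the matching handler. The "source"/"dest" variants are used inside the
 * unrolled loop and land on handlers that first restore r14-r16 and pop
 * the stack frame; the "nr" variants are used outside the loop, where no
 * such restore is needed.
 */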
        .macro srcnr
100:
        EX_TABLE(100b,.Lsrc_error_nr)
        .endm

        .macro source
150:
        EX_TABLE(150b,.Lsrc_error)
        .endm

        .macro dstnr
200:
        EX_TABLE(200b,.Ldest_error_nr)
        .endm

        .macro dest
250:
        EX_TABLE(250b,.Ldest_error)
        .endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
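/*
 * Note: as the error paths at the end show, src_err and dst_err may each
 * be NULL; the fault handlers check for a zero pointer and simply skip
 * the -EFAULT store in that case.
 */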
_GLOBAL(csum_partial_copy_generic)
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

/*
 * If only halfword aligned, align to a double word. Since odd
 * aligned addresses should be rare and they would require more
 * work to calculate the correct checksum, we ignore that case
 * and take the potential slowdown of unaligned loads.
 *
 * If the source and destination are relatively unaligned we only
 * align the source. This keeps things simple.
 */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
/*
 * We unroll the loop such that each iteration is 64 bytes with an
 * entry and exit limb of 64 bytes, meaning a minimum size of
 * 128 bytes.
 */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

/*
 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 * because of the XER dependency. This means the fastest this loop can
 * go is 16 cycles per iteration. The scheduling of the loop below has
 * been shown to hit this on both POWER6 and POWER7.
 */
        .align  5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

.Lsrc_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
        cmpdi   0,r7,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
        blr

.Ldest_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
        cmpdi   0,r8,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
        blr
EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *                         const struct in6_addr *daddr,
 *                         __u32 len, __u8 proto, __wsum sum)
 */
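/*
 * Summary (illustrative note): the code below accumulates the two 128-bit
 * addresses, the length/protocol word (byte-order adjusted on little-endian)
 * and the incoming sum with carry, folds the 64-bit total down to 32 and
 * then 16 bits, and returns the one's complement of that 16-bit value in r3.
 */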

_GLOBAL(csum_ipv6_magic)
        ld      r8, 0(r3)
        ld      r9, 8(r3)
        add     r5, r5, r6
        addc    r0, r8, r9
        ld      r10, 0(r4)
        ld      r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
        rotldi  r5, r5, 8
#endif
        adde    r0, r0, r10
        add     r5, r5, r7
        adde    r0, r0, r11
        adde    r0, r0, r5
        addze   r0, r0
        rotldi  r3, r0, 32              /* fold two 32 bit halves together */
        add     r3, r0, r3
        srdi    r0, r3, 32
        rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
        add     r3, r0, r3
        not     r3, r3
        rlwinm  r3, r3, 16, 16, 31
        blr
EXPORT_SYMBOL(csum_ipv6_magic)