/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
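/*
 * Roughly, in C (an illustrative sketch only -- the code below is
 * unrolled and drives the carry bit directly):
 *
 *      u64 acc = (u32)sum;
 *      for each chunk c of buff (8/4/2-byte loads, plus a padded
 *          trailing byte):
 *              acc += c;                       // adde folds the carry back in
 *      u64 swapped = (acc << 32) | (acc >> 32);
 *      return (u32)((acc + swapped) >> 32);    // 64->32 end-around fold
 */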
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

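        /*
         * ctr = 4 - ((buff >> 1) & 0x3): the number of halfwords (1-3)
         * needed to bring r3 up to an 8-byte boundary.  Each halfword is
         * added into the running sum as it is stepped over, and len (r4)
         * is reduced to match.
         */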
1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
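        /*
         * ctr is therefore set to len/64 - 1: the loads ahead of the loop
         * prime the first 32 bytes, each iteration folds in one full
         * 64-byte block while loading ahead, and the exit limb after the
         * bdnz folds in the final block.
         */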
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
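        /*
         * Eight adde instructions at 2 cycles each give that 16-cycle
         * floor; the eight loads per iteration are interleaved between
         * them so they execute in the shadow of the XER dependency
         * instead of adding to it.
         */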
        .align  5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


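        /*
         * Exit limb: r6, r9, r10 and r11 already hold the first 32 bytes
         * of the final 64-byte block, so load the remaining 32 bytes and
         * fold the whole block into the sum.
         */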
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9

.Lcsum_finish:
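        /*
         * r0 now holds the full 64-bit accumulation plus a possible
         * pending carry.  Fold it to 32 bits with end-around carry;
         * in C terms (illustrative only):
         *
         *      acc += carry;                           // addze
         *      swapped = (acc << 32) | (acc >> 32);    // rldicl ...,32,0
         *      return (u32)((acc + swapped) >> 32);    // add; srdi
         */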
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr


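/*
 * The macros below mark the load or store that follows them as an access
 * that may fault: each one emits a local label plus an __ex_table entry
 * pointing the fault handler at the matching fix-up.  For example
 *
 *      srcnr;  lhz     r6,0(r3)
 *
 * places a "100:" label on the lhz and routes a fault on that load to
 * .Lsrc_error_nr.  The "source"/"dest" variants are used inside the
 * unrolled copy loop, where r14-r16 and a stack frame must be unwound
 * first; the "nr" variants are used on paths with no frame to undo.
 */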
        .macro srcnr
100:
        .section __ex_table,"a"
        .align 3
        .llong 100b,.Lsrc_error_nr
        .previous
        .endm

        .macro source
150:
        .section __ex_table,"a"
        .align 3
        .llong 150b,.Lsrc_error
        .previous
        .endm

        .macro dstnr
200:
        .section __ex_table,"a"
        .align 3
        .llong 200b,.Ldest_error_nr
        .previous
        .endm

        .macro dest
250:
        .section __ex_table,"a"
        .align 3
        .llong 250b,.Ldest_error
        .previous
        .endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating the partial
 * checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
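/*
 * Seen from C the calling convention is roughly (an illustrative sketch;
 * the authoritative prototype lives in asm/checksum.h):
 *
 *      csum = csum_partial_copy_generic(src, dst, len, sum,
 *                                       &src_err, &dst_err);
 *
 * On a fault the routine stores -EFAULT through the relevant error
 * pointer (either may be NULL) and returns early, so neither csum nor
 * the partially written dst can be trusted until both error words have
 * been checked.
 */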
_GLOBAL(csum_partial_copy_generic)
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         *
         * If the source and destination are relatively unaligned we only
         * align the source. This keeps things simple.
         */
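        /*
         * Only the loads are aligned by the code below; when src and dst
         * have different offsets, the stores to dst simply stay unaligned
         * for the rest of the copy.
         */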
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
        srcnr;  lhz     r6,0(r3)        /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
        dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        source; ld      r6,0(r3)
        source; ld      r9,8(r3)

        source; ld      r10,16(r3)
        source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
        .align  5
2:
        adde    r0,r0,r6
        source; ld      r12,32(r3)
        source; ld      r14,40(r3)

        adde    r0,r0,r9
        source; ld      r15,48(r3)
        source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        dest;   std     r6,0(r4)
        dest;   std     r9,8(r4)

        adde    r0,r0,r11
        dest;   std     r10,16(r4)
        dest;   std     r11,24(r4)

        adde    r0,r0,r12
        dest;   std     r12,32(r4)
        dest;   std     r14,40(r4)

        adde    r0,r0,r14
        dest;   std     r15,48(r4)
        dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        source; ld      r6,0(r3)
        source; ld      r9,8(r3)

        adde    r0,r0,r16
        source; ld      r10,16(r3)
        source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        source; ld      r12,32(r3)
        source; ld      r14,40(r3)

        adde    r0,r0,r9
        source; ld      r15,48(r3)
        source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        dest;   std     r6,0(r4)
        dest;   std     r9,8(r4)

        adde    r0,r0,r11
        dest;   std     r10,16(r4)
        dest;   std     r11,24(r4)

        adde    r0,r0,r12
        dest;   std     r12,32(r4)
        dest;   std     r14,40(r4)

        adde    r0,r0,r14
        dest;   std     r15,48(r4)
        dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
        srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

        srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

        srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

        srcnr;  lbz     r6,0(r3)
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
        dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

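/*
 * Fault fix-ups.  .Lsrc_error and .Ldest_error are reached from the
 * unrolled loop, so they first restore r14-r16 and pop the stack frame;
 * the _nr entry points are reached from code that never set one up.
 * Both then report -EFAULT if the corresponding error pointer was
 * supplied.
 */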
.Lsrc_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
        cmpdi   0,r7,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
        blr

.Ldest_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
        cmpdi   0,r8,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
        blr