/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
14 | ||
15 | #include <linux/sys.h> | |
16 | #include <asm/processor.h> | |
17 | #include <asm/errno.h> | |
18 | #include <asm/ppc_asm.h> | |
19 | ||
/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
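
/*
 * For reference, a hedged C sketch of the fold-and-complement sequence
 * above (illustrative only, not kernel API; the helper name is made up):
 *
 *	static inline unsigned short fold_csum64(unsigned long sum)
 *	{
 *		sum = (sum & 0xffffffffUL) + (sum >> 32); // 64 -> 33 bits
 *		while (sum >> 16)                         // 33 -> 16 bits
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;                     // 1's complement
 *	}
 *
 * The assembly gets the same effect branch-free, using a rotate, add
 * and shift for each fold and letting the adds propagate the carries.
 */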
46 | ||
47 | /* | |
48 | * Compute checksum of TCP or UDP pseudo-header: | |
49 | * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) | |
50 | * No real gain trying to do this specially for 64 bit, but | |
51 | * the 32 bit addition may spill into the upper bits of | |
52 | * the doubleword so we still must fold it down from 64. | |
53 | */ | |
54 | _GLOBAL(csum_tcpudp_magic) | |
55 | rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ | |
56 | addc r0,r3,r4 /* add 4 32-bit words together */ | |
57 | adde r0,r0,r5 | |
58 | adde r0,r0,r7 | |
59 | rldicl r4,r0,32,0 /* fold 64 bit value */ | |
60 | add r0,r4,r0 | |
61 | srdi r0,r0,32 | |
62 | rlwinm r3,r0,16,0,31 /* fold two halves together */ | |
63 | add r3,r0,r3 | |
64 | not r3,r3 | |
65 | srwi r3,r3,16 | |
66 | blr | |
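
/*
 * A hedged C equivalent of the pseudo-header sum formed above (a
 * sketch; the variable names mirror the register comments):
 *
 *	unsigned long s = saddr + daddr        // 32-bit addresses
 *		+ ((proto << 16) | len)        // rlwimi packs proto|len
 *		+ sum;                         // caller-supplied partial sum
 *	// then fold 64 -> 16 bits and complement, as in ip_fast_csum
 */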
67 | ||
68 | /* | |
69 | * Computes the checksum of a memory block at buff, length len, | |
70 | * and adds in "sum" (32-bit). | |
71 | * | |
14cf11af PM |
72 | * csum_partial(r3=buff, r4=len, r5=sum) |
73 | */ | |
74 | _GLOBAL(csum_partial) | |
9b83ecb0 AB |
75 | addic r0,r5,0 /* clear carry */ |
76 | ||
77 | srdi. r6,r4,3 /* less than 8 bytes? */ | |
78 | beq .Lcsum_tail_word | |
79 | ||
	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b
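
	/*
	 * Hedged aside: the count set above is, in C terms,
	 *
	 *	halfwords = 4 - ((buff >> 1) & 0x3);
	 *
	 * i.e. the number of 2-byte loads needed to reach the next
	 * doubleword boundary (odd byte alignment is ignored, as noted).
	 */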
99 | ||
100 | .Lcsum_aligned: | |
101 | /* | |
102 | * We unroll the loop such that each iteration is 64 bytes with an | |
103 | * entry and exit limb of 64 bytes, meaning a minimum size of | |
104 | * 128 bytes. | |
105 | */ | |
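	/*
	 * (Hence the ctr below is set to len/64 - 1, leaving one full
	 * 64-byte limb for the exit sequence after the loop falls through.)
	 */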
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
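	/*
	 * (Worked out: the body below has 8 addes, so 8 x 2 cycles =
	 * 16 cycles per 64-byte iteration, a ceiling of 4 bytes/cycle.)
	 */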
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b
157 | ||
158 | ||
159 | adde r0,r0,r6 | |
160 | ld r12,32(r3) | |
161 | ld r14,40(r3) | |
162 | ||
163 | adde r0,r0,r9 | |
164 | ld r15,48(r3) | |
165 | ld r16,56(r3) | |
166 | addi r3,r3,64 | |
167 | ||
168 | adde r0,r0,r10 | |
169 | adde r0,r0,r11 | |
170 | adde r0,r0,r12 | |
171 | adde r0,r0,r14 | |
172 | adde r0,r0,r15 | |
173 | adde r0,r0,r16 | |
174 | ||
c75df6f9 MN |
175 | ld r14,STK_REG(R14)(r1) |
176 | ld r15,STK_REG(R15)(r1) | |
177 | ld r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
178 | addi r1,r1,STACKFRAMESIZE |
179 | ||
180 | andi. r4,r4,63 | |
181 | ||
.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned
271 | ||
272 | li r7,4 | |
273 | sub r6,r7,r6 | |
274 | mtctr r6 | |
275 | ||
276 | 1: | |
277 | source; lhz r6,0(r3) /* align to doubleword */ | |
14cf11af | 278 | subi r5,r5,2 |
14cf11af | 279 | addi r3,r3,2 |
fdd374b6 AB |
280 | adde r0,r0,r6 |
281 | dest; sth r6,0(r4) | |
14cf11af | 282 | addi r4,r4,2 |
fdd374b6 AB |
283 | bdnz 1b |
284 | ||
.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

352 | ||
14cf11af | 353 | adde r0,r0,r6 |
fdd374b6 AB |
354 | source; ld r12,32(r3) |
355 | source; ld r14,40(r3) | |
356 | ||
357 | adde r0,r0,r9 | |
358 | source; ld r15,48(r3) | |
359 | source; ld r16,56(r3) | |
360 | addi r3,r3,64 | |
361 | ||
362 | adde r0,r0,r10 | |
363 | dest; std r6,0(r4) | |
364 | dest; std r9,8(r4) | |
365 | ||
366 | adde r0,r0,r11 | |
367 | dest; std r10,16(r4) | |
368 | dest; std r11,24(r4) | |
369 | ||
370 | adde r0,r0,r12 | |
371 | dest; std r12,32(r4) | |
372 | dest; std r14,40(r4) | |
373 | ||
374 | adde r0,r0,r14 | |
375 | dest; std r15,48(r4) | |
376 | dest; std r16,56(r4) | |
377 | addi r4,r4,64 | |
378 | ||
379 | adde r0,r0,r15 | |
380 | adde r0,r0,r16 | |
381 | ||
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
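
/*
 * Hedged caller-side sketch (names illustrative only): the caller
 * passes int pointers for the error flags and inspects them after the
 * call, e.g.
 *
 *	int src_err = 0, dst_err = 0;
 *	sum = csum_partial_copy_generic(src, dst, len, sum,
 *					&src_err, &dst_err);
 *	if (src_err || dst_err)
 *		handle_efault();	// zero dst, redo checksum, etc.
 */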