arch/xtensa/lib/checksum.S
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- and 4-byte aligned buffers; an
 * odd-aligned buffer is still handled, but only by a much slower
 * byte-oriented path (see the odd-byte-aligned case below).
 */
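
/*
 * For reference, a minimal C sketch of what csum_partial computes
 * (illustrative only, not the kernel's implementation; ref_csum_partial
 * is a made-up name): 16-bit words are accumulated into a 32-bit sum
 * using ones-complement arithmetic, and the caller later folds the
 * result down to 16 bits.
 *
 *      static unsigned int ref_csum_partial(const unsigned char *buf,
 *                                           int len, unsigned int sum)
 *      {
 *              unsigned short w;
 *
 *              while (len > 1) {
 *                      w = *(const unsigned short *)buf;  // assumes 2-byte alignment, as above
 *                      sum += w;
 *                      if (sum < w)            // 32-bit add overflowed:
 *                              sum++;          // fold the carry back in
 *                      buf += 2;
 *                      len -= 2;
 *              }
 *              if (len)                        // trailing odd byte
 *                      sum += *buf;            // (byte lane depends on endianness)
 *              return sum;
 *      }
 */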

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;
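
/*
 * Roughly the C equivalent of ONES_ADD (a sketch, not generated code):
 * the carry out of the 32-bit add is detected with an unsigned compare
 * and wrapped back into bit 0, which is what turns an ordinary
 * twos-complement add into a ones-complement (end-around carry) add.
 *
 *      sum += val;
 *      if (sum < val)          // unsigned overflow, i.e. carry out of bit 31
 *              sum += 1;       // end-around carry
 */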

.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        entry   sp, 32
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if 2-byte aligned */
        /* Fall-through on common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        retw

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
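
        /*
         * Each iteration of the loop below assembles, from one byte, one
         * aligned halfword and one more byte, the same 32-bit value an
         * aligned l32i would have produced, then feeds it to ONES_ADD.
         * Roughly, in C (illustrative sketch; load16() is a made-up helper
         * for an aligned 16-bit load, and p is the current odd address):
         *
         *      unsigned int h = load16(p + 1);    // p is odd, so p + 1 is aligned
         * #ifdef __XTENSA_EB__
         *      unsigned int w = (p[0] << 24) | (h << 8) | p[3];
         * #else
         *      unsigned int w = p[0] | (h << 8) | ((unsigned int)p[3] << 24);
         * #endif
         *      ONES_ADD(sum, w);
         *      p += 4;
         */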
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0..7  */
#ifdef __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
                                        int sum, int *src_err_ptr, int *dst_err_ptr)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a8  = temp
        a9  = temp
        a10 = temp
        a11 = original len for exception handling
        a12 = original dst for exception handling

        This function is optimized for 4-byte aligned addresses.  Other
        alignments work, but not nearly as efficiently.
 */
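
/*
 * A rough C-level view of the contract (an illustrative sketch only;
 * ref_copy_and_csum and ref_csum_partial are made-up names): copy len
 * bytes from src to dst, return the ones-complement partial sum of the
 * copied data accumulated on top of sum, and on a faulting access store
 * -EFAULT through src_err_ptr or dst_err_ptr (the fault handling itself
 * lives in the .fixup section at the end of this file, not in C).
 *
 *      unsigned int ref_copy_and_csum(const char *src, char *dst, int len,
 *                                     unsigned int sum,
 *                                     int *src_err_ptr, int *dst_err_ptr)
 *      {
 *              int i;
 *
 *              for (i = 0; i < len; i++)
 *                      dst[i] = src[i];        // either access may fault
 *              return ref_csum_partial((const unsigned char *)dst, len, sum);
 *      }
 */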

ENTRY(csum_partial_copy_generic)

        entry   sp, 32
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
           aligned case.  Two bbsi.l instructions might seem more optimal
           (commented out below).  However, both labels 5: and 3: are out
           of the imm8 range, so the assembler relaxes them into
           equivalent bbci.l, j combinations, which is actually
           slower. */

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(11f) s32i    a9, a3, 0
EX(11f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(11f) s32i    a9, a3, 8
EX(11f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(11f) s32i    a9, a3, 16
EX(11f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(11f) s32i    a9, a3, 24
EX(11f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(11f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
           Control reaches this point in two cases: (1) it falls through
           from the 4-byte-aligned case above to process, at most, one
           2-byte chunk.  (2) It is branched to from above when either src
           or dst is 2-byte aligned, and then all remaining bytes are
           processed here, except perhaps a trailing odd byte.  This path
           is inefficient, so align your addresses to 4-byte boundaries.

           a2 = src
           a3 = dst
           a4 = len
           a5 = sum
        */
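
        /*
         * One iteration of the 2-byte loop below, as an illustrative C
         * sketch (load16()/store16() are made-up helpers for aligned
         * 16-bit accesses):
         *
         *      unsigned int w = load16(src);
         *      store16(dst, w);
         *      ONES_ADD(sum, w);
         *      src += 2;
         *      dst += 2;
         */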
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(11f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
EX(10f) l8ui    a9, a2, 0
EX(11f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        retw

5:
        /* Control branches here when either src or dst is odd.  We
           process all bytes using 8-bit accesses.  Grossly inefficient,
           so don't feed us an odd address. */
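
        /*
         * One iteration of the byte-pair loop below, as an illustrative C
         * sketch: two bytes are copied individually and then combined into
         * a single 16-bit value for the checksum, with the byte order
         * chosen to match the machine's endianness.
         *
         *      unsigned int b0 = src[0], b1 = src[1];
         *      dst[0] = b0;
         *      dst[1] = b1;
         * #ifdef __XTENSA_EB__
         *      ONES_ADD(sum, (b0 << 8) | b1);
         * #else
         *      ONES_ADD(sum, b0 | (b1 << 8));
         * #endif
         *      src += 2;
         *      dst += 2;
         */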

        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(11f) s8i     a9, a3, 0
EX(11f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a11 = original len for exception handling
        a12 = original dst for exception handling
*/
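
/*
 * In C terms, the two fixup handlers below behave roughly as follows
 * (illustrative sketch only; the real mechanism is the EX()/.fixup
 * exception table, and original_dst/original_len stand for the values
 * saved in a12/a11):
 *
 *      // 10: source fault - report it and wipe the whole destination,
 *      // since computing the rest is too much work
 *      *src_err_ptr = -EFAULT;
 *      memset(original_dst, 0, original_len);
 *
 *      // 11: destination fault - just report it
 *      *dst_err_ptr = -EFAULT;
 */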

10:
        _movi   a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        retw

11:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        retw

.previous