arch/xtensa/lib/checksum.S
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- and 4-byte aligned buffers; an
 * odd-aligned buffer is still handled, but only by a much slower
 * byte-oriented path (see the odd-byte-aligned case below).
 */
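
/*
 * For reference, a minimal C sketch of what csum_partial computes
 * (illustrative only, not the kernel's implementation; ref_csum_partial
 * is a made-up name): 16-bit words are accumulated into a 32-bit sum
 * using ones-complement arithmetic, and the caller later folds the
 * result down to 16 bits.
 *
 *      static unsigned int ref_csum_partial(const unsigned char *buf,
 *                                           int len, unsigned int sum)
 *      {
 *              unsigned short w;
 *
 *              while (len > 1) {
 *                      w = *(const unsigned short *)buf;  // assumes 2-byte alignment, as above
 *                      sum += w;
 *                      if (sum < w)            // 32-bit add overflowed:
 *                              sum++;          // fold the carry back in
 *                      buf += 2;
 *                      len -= 2;
 *              }
 *              if (len)                        // trailing odd byte
 *                      sum += *buf;            // (byte lane depends on endianness)
 *              return sum;
 *      }
 */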

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;
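
/*
 * Roughly the C equivalent of ONES_ADD (a sketch, not generated code):
 * the carry out of the 32-bit add is detected with an unsigned compare
 * and wrapped back into bit 0, which is what turns an ordinary
 * twos-complement add into a ones-complement (end-around carry) add.
 *
 *      sum += val;
 *      if (sum < val)          // unsigned overflow, i.e. carry out of bit 31
 *              sum += 1;       // end-around carry
 */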

.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        entry   sp, 32
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if 2-byte aligned */
        /* Fall-through on common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        retw

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
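
        /*
         * Each iteration of the loop below assembles, from one byte, one
         * aligned halfword and one more byte, the same 32-bit value an
         * aligned l32i would have produced, then feeds it to ONES_ADD.
         * Roughly, in C (illustrative sketch; load16() is a made-up helper
         * for an aligned 16-bit load, and p is the current odd address):
         *
         *      unsigned int h = load16(p + 1);    // p is odd, so p + 1 is aligned
         * #ifdef __XTENSA_EB__
         *      unsigned int w = (p[0] << 24) | (h << 8) | p[3];
         * #else
         *      unsigned int w = p[0] | (h << 8) | ((unsigned int)p[3] << 24);
         * #endif
         *      ONES_ADD(sum, w);
         *      p += 4;
         */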
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0..7  */
#ifdef __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
                                        int sum, int *src_err_ptr, int *dst_err_ptr)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a8  = temp
        a9  = temp
        a10 = temp
        a11 = original len for exception handling
        a12 = original dst for exception handling

        This function is optimized for 4-byte aligned addresses.  Other
        alignments work, but not nearly as efficiently.
 */
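
/*
 * A rough C-level view of the contract (an illustrative sketch only;
 * ref_copy_and_csum and ref_csum_partial are made-up names): copy len
 * bytes from src to dst, return the ones-complement partial sum of the
 * copied data accumulated on top of sum, and on a faulting access store
 * -EFAULT through src_err_ptr or dst_err_ptr (the fault handling itself
 * lives in the .fixup section at the end of this file, not in C).
 *
 *      unsigned int ref_copy_and_csum(const char *src, char *dst, int len,
 *                                     unsigned int sum,
 *                                     int *src_err_ptr, int *dst_err_ptr)
 *      {
 *              int i;
 *
 *              for (i = 0; i < len; i++)
 *                      dst[i] = src[i];        // either access may fault
 *              return ref_csum_partial((const unsigned char *)dst, len, sum);
 *      }
 */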

ENTRY(csum_partial_copy_generic)

        entry   sp, 32
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
           aligned case.  Two bbsi.l instructions might seem more optimal
           (commented out below).  However, both labels 5: and 3: are out
           of the imm8 range, so the assembler relaxes them into
           equivalent bbci.l, j combinations, which is actually
           slower. */

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(11f) s32i    a9, a3, 0
EX(11f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(11f) s32i    a9, a3, 8
EX(11f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(11f) s32i    a9, a3, 16
EX(11f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(11f) s32i    a9, a3, 24
EX(11f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(11f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
           Control reaches this point in two cases: (1) it falls through
           from the 4-byte-aligned case above to process, at most, one
           2-byte chunk.  (2) It is branched to from above when either src
           or dst is 2-byte aligned, and then all remaining bytes are
           processed here, except perhaps a trailing odd byte.  This path
           is inefficient, so align your addresses to 4-byte boundaries.

           a2 = src
           a3 = dst
           a4 = len
           a5 = sum
        */
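
        /*
         * One iteration of the 2-byte loop below, as an illustrative C
         * sketch (load16()/store16() are made-up helpers for aligned
         * 16-bit accesses):
         *
         *      unsigned int w = load16(src);
         *      store16(dst, w);
         *      ONES_ADD(sum, w);
         *      src += 2;
         *      dst += 2;
         */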
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(11f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
EX(10f) l8ui    a9, a2, 0
EX(11f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        retw

5:
        /* Control branches here when either src or dst is odd.  We
           process all bytes using 8-bit accesses.  Grossly inefficient,
           so don't feed us an odd address. */
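
        /*
         * One iteration of the byte-pair loop below, as an illustrative C
         * sketch: two bytes are copied individually and then combined into
         * a single 16-bit value for the checksum, with the byte order
         * chosen to match the machine's endianness.
         *
         *      unsigned int b0 = src[0], b1 = src[1];
         *      dst[0] = b0;
         *      dst[1] = b1;
         * #ifdef __XTENSA_EB__
         *      ONES_ADD(sum, (b0 << 8) | b1);
         * #else
         *      ONES_ADD(sum, b0 | (b1 << 8));
         * #endif
         *      src += 2;
         *      dst += 2;
         */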

        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(11f) s8i     a9, a3, 0
EX(11f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a11 = original len for exception handling
        a12 = original dst for exception handling
*/
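
/*
 * In C terms, the two fixup handlers below behave roughly as follows
 * (illustrative sketch only; the real mechanism is the EX()/.fixup
 * exception table, and original_dst/original_len stand for the values
 * saved in a12/a11):
 *
 *      // 10: source fault - report it and wipe the whole destination,
 *      // since computing the rest is too much work
 *      *src_err_ptr = -EFAULT;
 *      memset(original_dst, 0, original_len);
 *
 *      // 11: destination fault - just report it
 *      *dst_err_ptr = -EFAULT;
 */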

10:
        _movi   a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        retw

11:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        retw

.previous