/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007 Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * Since we share this code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the registers from the
 * n64 ABI naming back to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\
	.set	pop

#define ADDC32(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\
	.set	pop

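/*
 * ADDC is an end-around-carry add: sltu detects unsigned overflow of the
 * addition and the carry is folded back into the low bits, which is what
 * a ones'-complement sum requires.  A rough C equivalent, for illustration
 * only (csum_addc is a hypothetical name, not a kernel API):
 *
 *	static inline unsigned long csum_addc(unsigned long sum,
 *					      unsigned long reg)
 *	{
 *		sum += reg;
 *		sum += (sum < reg);	// fold the carry back in
 *		return sum;
 *	}
 */
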
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(_t0, _t1);						\
	ADDC(_t2, _t3);						\
	ADDC(sum, _t0);						\
	ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif

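/*
 * Each CSUM_BIGCHUNK1 folds four NBYTES units into the running sum.
 * Pairing the adds (t0+t1 and t2+t3, then both into sum) shortens the
 * dependency chains and exposes instruction-level parallelism.  Roughly,
 * in illustrative C (reusing the hypothetical csum_addc sketched above,
 * with p pointing at the chunk):
 *
 *	sum = csum_addc(sum, csum_addc(p[0], p[1]));
 *	sum = csum_addc(sum, csum_addc(p[2], p[3]));
 */
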
/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */

#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	 move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	 andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	 sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	 move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	 andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	 andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	 LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	 andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder			/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	 andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	 andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	 andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder			/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	 andi	t0, a1, 2

	/* Still a full word to go */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32		/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	 andi	t0, a1, 1

	/* Still a halfword to go */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	 sll	t1, t1, 16

	lbu	t2, (src)
	 nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif
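	/*
	 * The fold above narrows the 64-bit running sum to 32 bits without
	 * losing carries.  Roughly, in illustrative C (names hypothetical):
	 *
	 *	static inline u32 csum_fold64(u64 sum)
	 *	{
	 *		u64 hi = sum << 32;		// dsll32
	 *		sum += hi;			// daddu
	 *		u32 carry = (sum < hi);		// sltu
	 *		return (u32)(sum >> 32) + carry; // dsra32 + addu
	 *	}
	 */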

	/* odd buffer alignment? */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, t7
	.set	pop
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
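	/*
	 * The byte swap above is needed because a buffer that starts on an
	 * odd address accumulates every byte in the opposite lane of its
	 * 16-bit word.  wsbh swaps bytes within halfwords directly on
	 * MIPSR2; the fallback builds the same swap by hand.  Roughly, in
	 * illustrative C:
	 *
	 *	sum = ((sum & 0x00ff00ff) << 8) | ((sum >> 8) & 0x00ff00ff);
	 */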
	.set	reorder
	/* Add the passed partial csum. */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)

/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_kernel(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 */
#define EXC(insn, type, reg, addr, handler)	\
	.if \mode == LEGACY_MODE;		\
9:		insn reg, addr;			\
		.section __ex_table,"a";	\
		PTR	9b, handler;		\
		.previous;			\
	/* This is enabled in EVA mode */	\
	.else;					\
		/* If loading from user or storing to user */	\
		.if ((\from == USEROP) && (type == LD_INSN)) || \
		    ((\to == USEROP) && (type == ST_INSN));	\
9:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
			.section __ex_table,"a";	\
			PTR	9b, handler;		\
			.previous;			\
		.else;					\
			/* EVA without exception */	\
			insn reg, addr;			\
		.endif;					\
	.endif

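/*
 * Each guarded instruction gets a local label plus a (faulting insn,
 * fixup) pair in the __ex_table section; on a bad access the kernel
 * looks up the faulting PC there and resumes at the handler.  As a
 * hypothetical illustration, in the 32-bit LEGACY_MODE case
 *	LOAD(t0, 0(src), .Ll_exc)
 * expands to roughly:
 *	9:	lw	t0, 0(src)
 *		.section __ex_table,"a"
 *		PTR	9b, .Ll_exc
 *		.previous
 */
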
#undef LOAD

#ifdef USE_DOUBLE

#define LOADK	ld /* No exception */
#define LOAD(reg, addr, handler)	EXC(ld, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)	EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(ldl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(ldr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(sdl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(sdr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sd, ST_INSN, reg, addr, handler)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK	lw /* No exception */
#define LOAD(reg, addr, handler)	EXC(lw, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)	EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(lwl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(lwr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(swl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(swr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sw, ST_INSN, reg, addr, handler)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

	.macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck

	PTR_ADDU	AT, src, len	/* See (1) above. */
	/* initialize __nocheck if this is the first time we execute this
	 * macro
	 */
#ifdef CONFIG_64BIT
	move	errptr, a4
#else
	lw	errptr, 16(sp)
#endif
	.if \__nocheck == 1
	FEXPORT(csum_partial_copy_nocheck)
	EXPORT_SYMBOL(csum_partial_copy_nocheck)
	.endif
	move	sum, zero
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen\@
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1		/* odd buffer? */
	bnez	t1, .Ldst_unaligned\@
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned\@
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned\@:
	 SRL	t0, len, LOG_NBYTES+3	# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
	.align	4
1:
	LOAD(t0, UNIT(0)(src), .Ll_exc\@)
	LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
	LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
	LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
	LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
	LOAD(t5, UNIT(5)(src), .Ll_exc_copy\@)
	LOAD(t6, UNIT(6)(src), .Ll_exc_copy\@)
	LOAD(t7, UNIT(7)(src), .Ll_exc_copy\@)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
	STORE(t0, UNIT(0)(dst), .Ls_exc\@)
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst), .Ls_exc\@)
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst), .Ls_exc\@)
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst), .Ls_exc\@)
	ADDC(sum, t2)
	STORE(t4, UNIT(4)(dst), .Ls_exc\@)
	ADDC(t4, t5)
	STORE(t5, UNIT(5)(dst), .Ls_exc\@)
	ADDC(sum, t4)
	STORE(t6, UNIT(6)(dst), .Ls_exc\@)
	ADDC(t6, t7)
	STORE(t7, UNIT(7)(dst), .Ls_exc\@)
	ADDC(sum, t6)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES		# revert len (see above)

531 /*
532 * len == the number of bytes left to copy < 8*NBYTES
533 */
534 .Lcleanup_both_aligned\@:
535 #define rem t7
536 beqz len, .Ldone\@
537 sltu t0, len, 4*NBYTES
538 bnez t0, .Lless_than_4units\@
539 and rem, len, (NBYTES-1) # rem = len % NBYTES
540 /*
541 * len >= 4*NBYTES
542 */
543 LOAD(t0, UNIT(0)(src), .Ll_exc\@)
544 LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
545 LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
546 LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
547 SUB len, len, 4*NBYTES
548 ADD src, src, 4*NBYTES
549 STORE(t0, UNIT(0)(dst), .Ls_exc\@)
550 ADDC(t0, t1)
551 STORE(t1, UNIT(1)(dst), .Ls_exc\@)
552 ADDC(sum, t0)
553 STORE(t2, UNIT(2)(dst), .Ls_exc\@)
554 ADDC(t2, t3)
555 STORE(t3, UNIT(3)(dst), .Ls_exc\@)
556 ADDC(sum, t2)
557 .set reorder /* DADDI_WAR */
558 ADD dst, dst, 4*NBYTES
559 beqz len, .Ldone\@
560 .set noreorder
561 .Lless_than_4units\@:
562 /*
563 * rem = len % NBYTES
564 */
565 beq rem, len, .Lcopy_bytes\@
566 nop
567 1:
568 LOAD(t0, 0(src), .Ll_exc\@)
569 ADD src, src, NBYTES
570 SUB len, len, NBYTES
571 STORE(t0, 0(dst), .Ls_exc\@)
572 ADDC(sum, t0)
573 .set reorder /* DADDI_WAR */
574 ADD dst, dst, NBYTES
575 bne rem, len, 1b
576 .set noreorder
577
578 /*
579 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
580 * A loop would do only a byte at a time with possible branch
581 * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
582 * because can't assume read-access to dst. Instead, use
583 * STREST dst, which doesn't require read access to dst.
584 *
585 * This code should perform better than a simple loop on modern,
586 * wide-issue mips processors because the code has fewer branches and
587 * more instruction-level parallelism.
588 */
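	/*
	 * A worked example for the sequence below (assuming, purely for
	 * illustration, little-endian with NBYTES == 4 and len == 3):
	 * rem becomes 24 bits to keep, bits becomes 32 - 24 = 8 bits to
	 * discard, so SHIFT_DISCARD (sllv here) pushes the three live
	 * bytes to the top of the register, STREST (swl here) stores
	 * exactly those three bytes ending at dst + len - 1, and
	 * SHIFT_DISCARD_REVERT restores the bytes to their original
	 * lanes before they are folded into the checksum.
	 */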
#define bits t2
	beqz	len, .Ldone\@
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
	LOAD(t0, 0(src), .Ll_exc\@)
	SUB	bits, bits, rem # bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1), .Ls_exc\@)
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set	reorder
	ADDC(sum, t0)
	b	.Ldone\@
	.set	noreorder
.Ldst_unaligned\@:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
	SLL	t4, t1, 3	# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone\@
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned\@
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned\@:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned\@
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
	LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
	LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
	LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
	LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
	LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
	LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE(t0, UNIT(0)(dst), .Ls_exc\@)
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst), .Ls_exc\@)
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst), .Ls_exc\@)
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst), .Ls_exc\@)
	ADDC(sum, t2)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned\@:
	beqz	len, .Ldone\@
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes\@
	 nop
1:
	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst), .Ls_exc\@)
	ADDC(sum, t0)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen\@:
	beqz	len, .Ldone\@
	 nop
.Lcopy_bytes\@:
	/* 0 < len < NBYTES */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
	move	t2, zero	# partial word
	li	t3, SHIFT_START # shift
	/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)			\
	LOADBU(t0, N(src), .Ll_exc_copy\@);	\
	SUB	len, len, 1;		\
	STOREB(t0, N(dst), .Ls_exc\@);	\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, .Lcopy_bytes_done\@; \
	 or	t2, t0

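	/*
	 * The byte tail rebuilds a partial word from the copied bytes so
	 * they land in the same lanes a word load would have used.  A
	 * rough C equivalent, for illustration only (csum_addc as
	 * sketched above; names hypothetical):
	 *
	 *	u32 t2 = 0, shift = SHIFT_START;
	 *	while (len--) {
	 *		u8 b = *src++;
	 *		*dst++ = b;
	 *		t2 |= (u32)b << shift;
	 *		shift += SHIFT_INC;
	 *	}
	 *	sum = csum_addc(sum, t2);
	 */
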
	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	LOADBU(t0, NBYTES-2(src), .Ll_exc_copy\@)
	SUB	len, len, 1
	STOREB(t0, NBYTES-2(dst), .Ls_exc\@)
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done\@:
	ADDC(sum, t2)
.Ldone\@:
	/* fold checksum */
	.set	push
	.set	noat
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, odd
	.set	pop
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	pop
	.set	reorder
	ADDC32(sum, psum)
	jr	ra
	.set	noreorder

.Ll_exc_copy\@:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOADK	t0, TI_TASK($28)
	 li	t2, SHIFT_START
	LOADK	t0, THREAD_BUADDR(t0)
1:
	LOADBU(t1, 0(src), .Ll_exc\@)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	SLLV	t1, t1, t2
	addu	t2, SHIFT_INC
	ADDC(sum, t1)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc\@:
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder			/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone\@
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	.set	push
	.set	noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
#endif
	li	v1, -EFAULT
	b	.Ldone\@
	 sw	v1, (errptr)

.Ls_exc\@:
	li	v0, -1 /* invalid checksum */
	li	v1, -EFAULT
	jr	ra
	 sw	v1, (errptr)
	.set	pop
	.endm

LEAF(__csum_partial_copy_kernel)
EXPORT_SYMBOL(__csum_partial_copy_kernel)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP 1
END(__csum_partial_copy_kernel)

#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP 0
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP 0
END(__csum_partial_copy_from_user)
#endif