arch/mips/lib/memcpy.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  *
  13  * Mnemonic names for arguments to memcpy/__copy_user
  14  */
  15 #include <linux/config.h>
  16 #include <asm/asm.h>
  17 #include <asm/asm-offsets.h>
  18 #include <asm/regdef.h>
  19
  20 #define dst a0        /* destination address */
  21 #define src a1        /* source address */
  22 #define len a2        /* byte count; __copy_user leaves the uncopied count here */
  23
  24 /*
  25  * Spec
  26  *
  27  * memcpy copies len bytes from src to dst and sets v0 to dst.
  28  * It assumes that
  29  *   - src and dst don't overlap
  30  *   - src is readable
  31  *   - dst is writable
  32  * memcpy uses the standard calling convention
  33  *
  34  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  35  * the number of uncopied bytes due to an exception caused by a read or write.
  36  * __copy_user assumes that src and dst don't overlap, and that the call is
  37  * implementing one of the following:
  38  *   copy_to_user
  39  *     - src is readable  (no exceptions when reading src)
  40  *   copy_from_user
  41  *     - dst is writable  (no exceptions when writing dst)
  42  * __copy_user uses a non-standard calling convention; see
  43  * include/asm-mips/uaccess.h
  44  *
  45  * When an exception happens on a load, the handler must
  46  * ensure that all of the destination buffer is overwritten to prevent
  47  * leaking information to user mode programs.
  48  */
  49
  50 /*
  51  * Implementation
  52  */
  53
  54 /*
  55  * The exception handler for loads requires that:
  56  *  1- AT contain the address of the byte just past the end of the source
  57  *     of the copy,
  58  *  2- src_entry <= src < AT, and
  59  *  3- (dst - src) == (dst_entry - src_entry),
  60  * The _entry suffix denotes values when __copy_user was called.
  61  *
  62  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  63  * (2) is met by incrementing src by the number of bytes copied
  64  * (3) is met by not doing loads between a pair of increments of dst and src
  65  *
  66  * The exception handlers for stores adjust len (if necessary) and return.
  67  * These handlers do not need to overwrite any data.
  68  *
  69  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  70  * they're not protected.
  71  */
  72
     /*
      * EXC(inst_reg, addr, handler) emits the instruction "inst_reg, addr" at
      * local label 9, adds the pair (9b, handler) to the __ex_table section and
      * then switches back to the previous section.  All loads and stores
      * between LEAF(memcpy) and END(memcpy) are wrapped this way so that each
      * one is paired with a fixup label.
      * NOTE(review): PTR is not defined in this file; presumably it is the
      * pointer-sized data directive from <asm/asm.h> -- confirm.
      */
  73 #define EXC(inst_reg,addr,handler)              \
  74 9:      inst_reg, addr;                         \
  75         .section __ex_table,"a";                \
  76         PTR     9b, handler;                    \
  77         .previous
  78
  79 /*
  80  * Only on the 64-bit kernel can we make use of 64-bit registers.
  81  */
  82 #ifdef CONFIG_64BIT
  83 #define USE_DOUBLE
  84 #endif
  85
  86 #ifdef USE_DOUBLE        /* doubleword loads/stores and 64-bit arithmetic */
  87
  88 #define LOAD   ld
  89 #define LOADL  ldl
  90 #define LOADR  ldr
  91 #define STOREL sdl
  92 #define STORER sdr
  93 #define STORE  sd
  94 #define ADD    daddu
  95 #define SUB    dsubu
  96 #define SRL    dsrl
  97 #define SRA    dsra
  98 #define SLL    dsll
  99 #define SLLV   dsllv
 100 #define SRLV   dsrlv
 101 #define NBYTES 8        /* bytes moved by one LOAD/STORE */
 102 #define LOG_NBYTES 3    /* log2(NBYTES) */
 103
 104 /*
 105  * As we are sharing code base with the mips32 tree (which use the o32 ABI
 106  * register definitions). We need to redefine the register definitions from
 107  * the n64 ABI register naming to the o32 ABI register naming.
 108  */
 109 #undef t0
 110 #undef t1
 111 #undef t2
 112 #undef t3
 113 #define t0      $8
 114 #define t1      $9
 115 #define t2      $10
 116 #define t3      $11
 117 #define t4      $12
 118 #define t5      $13
 119 #define t6      $14
 120 #define t7      $15
 121
 122 #else        /* !USE_DOUBLE: word loads/stores and 32-bit arithmetic */
 123
 124 #define LOAD   lw
 125 #define LOADL  lwl
 126 #define LOADR  lwr
 127 #define STOREL swl
 128 #define STORER swr
 129 #define STORE  sw
 130 #define ADD    addu
 131 #define SUB    subu
 132 #define SRL    srl
 133 #define SLL    sll
 134 #define SRA    sra
 135 #define SLLV   sllv
 136 #define SRLV   srlv
 137 #define NBYTES 4        /* bytes moved by one LOAD/STORE */
 138 #define LOG_NBYTES 2    /* log2(NBYTES) */
 139
 140 #endif /* USE_DOUBLE */
 141
     /*
      * Endian-dependent names for the partial-unit instructions.
      *   LDFIRST/LDREST  are issued as a pair, at FIRST(n) and REST(n), to load
      *                   one unit from an unaligned source.
      *   STFIRST         is used on its own to store the leading bytes of a
      *                   unit when dst has to be brought up to alignment.
      *   STREST          is used on its own to store the final len (< NBYTES)
      *                   bytes of an aligned copy.
      *   SHIFT_DISCARD   is the variable shift applied before that STREST to
      *                   drop the bytes that must not be stored.
      */
 142 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 143 #define LDFIRST LOADR
 144 #define LDREST  LOADL
 145 #define STFIRST STORER
 146 #define STREST  STOREL
 147 #define SHIFT_DISCARD SLLV
 148 #else
 149 #define LDFIRST LOADL
 150 #define LDREST  LOADR
 151 #define STFIRST STOREL
 152 #define STREST  STORER
 153 #define SHIFT_DISCARD SRLV
 154 #endif
 155
 156 #define FIRST(unit) ((unit)*NBYTES)        /* offset of the first byte of unit */
 157 #define REST(unit)  (FIRST(unit)+NBYTES-1) /* offset of the last byte of unit */
 158 #define UNIT(unit)  FIRST(unit)            /* offset used for whole-unit LOAD/STORE */
 159
 160 #define ADDRMASK (NBYTES-1)                /* low address bits; nonzero means unaligned */
 161
 162         .text
 163         .set    noreorder        /* branch delay slots are written out by hand below (the instructions indented one extra space) */
 164         .set    noat             /* AT must not be used implicitly: the load fixup reads the end-of-source address from it */
 165
 166 /*
 167  * A combined memcpy/__copy_user
 168  * __copy_user sets len to 0 for success; else to an upper bound of
 169  * the number of uncopied bytes.
 170  * memcpy sets v0 to dst.
      *
      * Entry points: memcpy (sets v0 and falls into the shared body),
      * __memcpy (branched to by memmove, which sets v0 in its delay slot) and
      * __copy_user (exported with FEXPORT).
      *
      * Overall flow: a length below NBYTES goes straight to the byte copier.
      * Otherwise an unaligned dst is aligned first (dst_unaligned); the bulk
      * of the copy is then done by the aligned loops (both_aligned) or by the
      * LDFIRST/LDREST loops (src_unaligned_dst_aligned), followed by single
      * units and finally by the bytes that are left.
 171  */
 172         .align  5
 173 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 174         move    v0, dst                         /* return value */
 175 __memcpy:
 176 FEXPORT(__copy_user)
 177         /*
 178          * Note: dst & src may be unaligned, len may be 0
 179          * Temps
 180          */
 181 #define rem t8        /* number of bytes the current loop leaves for the next stage */
 182
 183         /*
 184          * The "issue break"s below are very approximate.
 185          * Issue delays for dcache fills will perturb the schedule, as will
 186          * load queue full replay traps, etc.
 187          *
 188          * If len < NBYTES use byte operations.
 189          */
 190         PREF(   0, 0(src) )
 191         PREF(   1, 0(dst) )
 192         sltu    t2, len, NBYTES                 /* t2 = (len < NBYTES) */
 193         and     t1, dst, ADDRMASK               /* t1 = dst misalignment */
 194         PREF(   0, 1*32(src) )
 195         PREF(   1, 1*32(dst) )
 196         bnez    t2, copy_bytes_checklen
 197          and    t0, src, ADDRMASK               /* delay slot: t0 = src misalignment */
 198         PREF(   0, 2*32(src) )
 199         PREF(   1, 2*32(dst) )
 200         bnez    t1, dst_unaligned
 201          nop
 202         bnez    t0, src_unaligned_dst_aligned
 203         /*
 204          * use delay slot for fall-through
 205          * src and dst are aligned; need to compute rem
 206          */
 207 both_aligned:
 208          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 209         beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 210          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 211         PREF(   0, 3*32(src) )
 212         PREF(   1, 3*32(dst) )
             /*
              * Main loop: 8 units per iteration.  len is reduced by 8*NBYTES
              * before the first store, so the handler named on each store
              * (s_exc_pNu) adds back the N units not yet stored in this
              * iteration.  src and dst are advanced after the last load; the
              * remaining stores therefore use negative offsets from dst.
              */
 213         .align  4
 214 1:
 215 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 216 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 217 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 218 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 219         SUB     len, len, 8*NBYTES
 220 EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 221 EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
 222 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
 223 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
 224 EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
 225 EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
 226         ADD     src, src, 8*NBYTES
 227         ADD     dst, dst, 8*NBYTES
 228 EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
 229 EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
 230 EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
 231 EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
 232 EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
 233 EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
 234         PREF(   0, 8*32(src) )
 235         PREF(   1, 8*32(dst) )
 236         bne     len, rem, 1b                    /* repeat until only rem bytes are left */
 237          nop
 238
 239         /*
 240          * len == rem == the number of bytes left to copy < 8*NBYTES
 241          */
 242 cleanup_both_aligned:
 243         beqz    len, done
 244          sltu   t0, len, 4*NBYTES               /* delay slot: t0 = (len < 4*NBYTES) */
 245         bnez    t0, less_than_4units
 246          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 247         /*
 248          * len >= 4*NBYTES
 249          */
 250 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 251 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 252 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 253 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 254         SUB     len, len, 4*NBYTES
 255         ADD     src, src, 4*NBYTES
 256 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 257 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 258 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 259 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 260         beqz    len, done
 261          ADD    dst, dst, 4*NBYTES              /* delay slot: dst catches up with src */
 262 less_than_4units:
 263         /*
 264          * rem = len % NBYTES
 265          */
 266         beq     rem, len, copy_bytes
 267          nop
 268 1:
 269 EXC(    LOAD    t0, 0(src),             l_exc)
 270         ADD     src, src, NBYTES
 271         SUB     len, len, NBYTES
 272 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 273         bne     rem, len, 1b
 274          ADD    dst, dst, NBYTES                /* delay slot: advance dst on every iteration */
 275
 276         /*
 277          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 278          * A loop would do only a byte at a time with possible branch
 279          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 280          * because can't assume read-access to dst.  Instead, use
 281          * STREST dst, which doesn't require read access to dst.
 282          *
 283          * This code should perform better than a simple loop on modern,
 284          * wide-issue mips processors because the code has fewer branches and
 285          * more instruction-level parallelism.
 286          */
 287 #define bits t2
 288         beqz    len, done
 289          ADD    t1, dst, len    # t1 is just past last byte of dst
 290         li      bits, 8*NBYTES
 291         SLL     rem, len, 3     # rem = number of bits to keep
 292 EXC(    LOAD    t0, 0(src),             l_exc)
 293         SUB     bits, bits, rem # bits = number of bits to discard
 294         SHIFT_DISCARD t0, t0, bits
 295 EXC(    STREST  t0, -1(t1),             s_exc)
 296         jr      ra
 297          move   len, zero                       /* delay slot: nothing left uncopied */
 298 dst_unaligned:
 299         /*
 300          * dst is unaligned
 301          * t0 = src & ADDRMASK
 302          * t1 = dst & ADDRMASK; T1 > 0
 303          * len >= NBYTES
 304          *
 305          * Copy enough bytes to align dst
 306          * Set match = (src and dst have same alignment)
 307          */
 308 #define match rem
 309 EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 310         ADD     t2, zero, NBYTES
 311 EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
 312         SUB     t2, t2, t1      # t2 = number of bytes copied
 313         xor     match, t0, t1                   /* match == 0 when src and dst share the same misalignment */
 314 EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
 315         beq     len, t2, done
 316          SUB    len, len, t2                    /* delay slot: len -= bytes just stored (0 when leaving via done) */
 317         ADD     dst, dst, t2
 318         beqz    match, both_aligned
 319          ADD    src, src, t2                    /* delay slot: advance src by the same amount as dst */
 320
 321 src_unaligned_dst_aligned:
 322         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 323         PREF(   0, 3*32(src) )
 324         beqz    t0, cleanup_src_unaligned
 325          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 326         PREF(   1, 3*32(dst) )
 327 1:
 328 /*
 329  * Avoid consecutive LD*'s to the same register since some mips
 330  * implementations can't issue them in the same cycle.
 331  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 332  * are to the same unit (unless src is aligned, but it's not).
 333  */
 334 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 335 EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 336         SUB     len, len, 4*NBYTES
 337 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 338 EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 339 EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 340 EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 341 EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 342 EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 343         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 344         ADD     src, src, 4*NBYTES
 345 #ifdef CONFIG_CPU_SB1
 346         nop                             # improves slotting
 347 #endif
 348 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 349 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 350 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 351 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 352         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 353         bne     len, rem, 1b
 354          ADD    dst, dst, 4*NBYTES
 355
 356 cleanup_src_unaligned:
 357         beqz    len, done
 358          and    rem, len, NBYTES-1  # rem = len % NBYTES
 359         beq     rem, len, copy_bytes
 360          nop
 361 1:
 362 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 363 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 364         ADD     src, src, NBYTES
 365         SUB     len, len, NBYTES
 366 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 367         bne     len, rem, 1b
 368          ADD    dst, dst, NBYTES
 369
 370 copy_bytes_checklen:
 371         beqz    len, done
 372          nop
 373 copy_bytes:
 374         /* 0 < len < NBYTES  */
 375 #define COPY_BYTE(N)                    \
 376 EXC(    lb      t0, N(src), l_exc);     \
 377         SUB     len, len, 1;            \
 378         beqz    len, done;              \
 379 EXC(     sb     t0, N(dst), s_exc_p1)
 380
             /*
              * Each COPY_BYTE puts its store in the delay slot of the beqz, so
              * the byte is written whether or not the branch to done is taken.
              * src and dst are not advanced here; byte N is addressed by its
              * offset.  The last possible byte (offset NBYTES-2) is open-coded
              * below with its store in the delay slot of the jr.
              */
 381         COPY_BYTE(0)
 382         COPY_BYTE(1)
 383 #ifdef USE_DOUBLE
 384         COPY_BYTE(2)
 385         COPY_BYTE(3)
 386         COPY_BYTE(4)
 387         COPY_BYTE(5)
 388 #endif
 389 EXC(    lb      t0, NBYTES-2(src), l_exc)
 390         SUB     len, len, 1
 391         jr      ra
 392 EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
 393 done:
 394         jr      ra
 395          nop
 396         END(memcpy)
 397
 398 l_exc_copy:
 399         /*
 400          * Copy bytes from src until faulting load address (or until a
 401          * lb faults)
 402          *
 403          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 404          * may be more than a byte beyond the last address.
 405          * Hence, the lb below may get an exception.
 406          *
 407          * Assumes src < THREAD_BUADDR($28)
              *
              * This is the fixup for every load of a group except the first
              * (the first one names l_exc directly): src and dst still point
              * at the start of the group, so the bytes before the fault address
              * are copied one at a time.  Execution then falls through into
              * l_exc.
 408          */
 409         LOAD    t0, TI_TASK($28)                /* t0 = pointer stored at offset TI_TASK from $28 */
 410          nop
 411         LOAD    t0, THREAD_BUADDR(t0)           /* t0 = value at offset THREAD_BUADDR from it; used as the stop address */
 412 1:
 413 EXC(    lb      t1, 0(src),     l_exc)
 414         ADD     src, src, 1
 415         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 416         bne     src, t0, 1b
 417          ADD    dst, dst, 1                     /* delay slot: keep dst in step with src */
 418 l_exc:
             /*
              * Load-fault fixup: set len to the number of bytes that were not
              * copied (AT - fault address) and zero that many bytes of the
              * destination, starting at the byte that corresponds to the fault
              * address.  dst and src are both overwritten here; len is the
              * value handed back to the caller.
              */
 419         LOAD    t0, TI_TASK($28)
 420          nop
 421         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 422          nop
 423         SUB     len, AT, t0             # len number of uncopied bytes
 424         /*
 425          * Here's where we rely on src and dst being incremented in tandem,
 426          *   See (3) above.
 427          * dst += (fault addr - src) to put dst at first byte to clear
 428          */
 429         ADD     dst, t0                 # compute start address in a0 (dst)
 430         SUB     dst, src
 431         /*
 432          * Clear len bytes starting at dst.  Can't call __bzero because it
 433          * might modify len.  An inefficient loop for these rare times...
 434          */
 435         beqz    len, done
 436          SUB    src, len, 1                     /* delay slot: src is reused as the loop counter, len - 1 */
 437 1:      sb      zero, 0(dst)
 438         ADD     dst, dst, 1
 439         bnez    src, 1b
 440          SUB    src, src, 1                     /* delay slot: count down */
 441         jr      ra
 442          nop
 443
 444
     /*
      * Store-fault fixups.  SEXC(n) defines s_exc_p<n>u, which returns with len
      * increased by n*NBYTES; s_exc_p1 returns with len increased by 1; s_exc
      * returns with len unchanged.  The adjustment is done in the delay slot of
      * the jr.  The copy loops subtract from len before issuing the matching
      * stores, so each fixup adds back the amount that had not been stored yet.
      */
 445 #define SEXC(n)                         \
 446 s_exc_p ## n ## u:                      \
 447         jr      ra;                     \
 448          ADD    len, len, n*NBYTES
 449
 450 SEXC(8)
 451 SEXC(7)
 452 SEXC(6)
 453 SEXC(5)
 454 SEXC(4)
 455 SEXC(3)
 456 SEXC(2)
 457 SEXC(1)
 458
 459 s_exc_p1:
 460         jr      ra
 461          ADD    len, len, 1
 462 s_exc:
 463         jr      ra
 464          nop
 465
     /*
      * memmove: a0=dst a1=src a2=len; v0 is set to dst.
      * When the two regions do not overlap the copy is handed to __memcpy.
      * Otherwise a zero length branches to r_out and any other length falls
      * through into __rmemcpy.
      */
 466         .align  5
 467 LEAF(memmove)
 468         ADD     t0, a0, a2                      /* t0 = dst + len */
 469         ADD     t1, a1, a2                      /* t1 = src + len */
 470         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
 471         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
 472         and     t0, t1                          /* nonzero only when the regions overlap */
 473         beqz    t0, __memcpy
 474          move   v0, a0                          /* return value */
 475         beqz    a2, r_out                       /* its delay slot is the first instruction of __rmemcpy */
 476         END(memmove)
 477
 478         /* fall through to __rmemcpy */
             /*
              * __rmemcpy copies one byte at a time and returns with a2 = 0.
              * When src < dst it starts at the end of both buffers and works
              * downwards (r_end_bytes); otherwise it works upwards
              * (r_end_bytes_up).  Both loops test a2 only after a byte has
              * been copied, so they expect a2 != 0 on entry; memmove branches
              * to r_out for a zero length.  None of the accesses has an
              * exception-table entry.
              */
 479 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 480          sltu   t0, a1, a0                      /* t0 = (src < dst); also the delay slot of memmove's last branch */
 481         beqz    t0, r_end_bytes_up              # src >= dst
 482          nop
 483         ADD     a0, a2                          # dst = dst + len
 484         ADD     a1, a2                          # src = src + len
 485
 486 r_end_bytes:
 487         lb      t0, -1(a1)
 488         SUB     a2, a2, 0x1
 489         sb      t0, -1(a0)
 490         SUB     a1, a1, 0x1
 491         bnez    a2, r_end_bytes
 492          SUB    a0, a0, 0x1                     /* delay slot: step dst down on every iteration */
 493
 494 r_out:
 495         jr      ra
 496          move   a2, zero                        /* delay slot: len = 0 on return */
 497
 498 r_end_bytes_up:
 499         lb      t0, (a1)
 500         SUB     a2, a2, 0x1
 501         sb      t0, (a0)
 502         ADD     a1, a1, 0x1
 503         bnez    a2, r_end_bytes_up
 504          ADD    a0, a0, 0x1                     /* delay slot: step dst up on every iteration */
 505
 506         jr      ra
 507          move   a2, zero                        /* delay slot: len = 0 on return */
 508         END(__rmemcpy)