2 // Copyright (c) 2012 - 2016, Linaro Limited
3 // All rights reserved.
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are met:
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright
10 // notice, this list of conditions and the following disclaimer in the
11 // documentation and/or other materials provided with the distribution.
12 // * Neither the name of the Linaro nor the
13 // names of its contributors may be used to endorse or promote products
14 // derived from this software without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 // Copyright (c) 2015 ARM Ltd
31 // All rights reserved.
33 // Redistribution and use in source and binary forms, with or without
34 // modification, are permitted provided that the following conditions are met:
36 // 1. Redistributions of source code must retain the above copyright
37 // notice, this list of conditions and the following disclaimer.
38 // 2. Redistributions in binary form must reproduce the above copyright
39 // notice, this list of conditions and the following disclaimer in the
40 // documentation and/or other materials provided with the distribution.
41 // 3. The name of the company may not be used to endorse or promote
42 // products derived from this software without specific prior written permission.
45 // THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
46 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
47 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
48 // IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
49 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
50 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
51 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
52 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
53 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
54 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59 // ARMv8-a, AArch64, unaligned accesses.
89 // Copies are split into 3 main cases: small copies of up to 16 bytes,
90 // medium copies of 17..96 bytes which are fully unrolled. Large copies
91 // of more than 96 bytes align the destination and use an unrolled loop
92 // processing 64 bytes per iteration.
93 // Small and medium copies read all data before writing, allowing any
94 // kind of overlap, and memmove tailcalls memcpy for these cases as
95 // well as non-overlapping copies.
// memcpy forward path (fragment): srcend/dstend become one-past-the-end
// pointers; the medium case copies from both ends, issuing all loads
// before the overlapping stores so any src/dst overlap is tolerated.
// NOTE(review): this is an excerpt — the dispatch branches for the
// small (<=16) and large (>96) cases, the computation of tmp1, the
// "ldp A_l, A_h, [src]" and the guard that skips the middle 32 bytes
// for short counts are elided from this view.
99 add srcend, src, count
100 add dstend, dstin, count
106 // Medium copies: 17..96 bytes.
// tmp1 presumably holds count-1 (set in an elided line — TODO confirm);
// bit 6 set => count >= 65, which needs the wider copy96 sequence.
109 tbnz tmp1, 6, L(copy96)
110 ldp D_l, D_h, [srcend, -16]
// Middle 32 bytes: 16 at [src,16] and 16 ending at srcend-16,
// loads first, then stores.
112 ldp B_l, B_h, [src, 16]
113 ldp C_l, C_h, [srcend, -32]
114 stp B_l, B_h, [dstin, 16]
115 stp C_l, C_h, [dstend, -32]
// First and last 16 bytes; A_l/A_h were loaded in an elided line.
117 stp A_l, A_h, [dstin]
118 stp D_l, D_h, [dstend, -16]
122 // Small copies: 0..16 bytes.
// 8..16 bytes: copy the first and last 8 bytes (the two accesses may
// overlap in the middle). Only the tail half is visible here; the
// load/store of the leading 8 bytes are elided from this fragment.
127 ldr A_h, [srcend, -8]
129 str A_h, [dstend, -8]
// 4..7 bytes: same first/last trick with 4-byte accesses (w views).
135 ldr A_hw, [srcend, -4]
137 str A_hw, [dstend, -4]
140 // Copy 0..3 bytes. Use a branchless sequence that copies the same
141 // byte 3 times if count==1, or the 2nd byte twice if count==2.
// tmp1 presumably = count >> 1 (set in an elided line — TODO confirm),
// so [src, tmp1] / [dstin, tmp1] address the middle byte, and
// srcend-1 / dstend-1 address the last byte.
146 ldrb A_hw, [srcend, -1]
147 ldrb B_lw, [src, tmp1]
149 strb B_lw, [dstin, tmp1]
150 strb A_hw, [dstend, -1]
154 // Copy 64..96 bytes. Copy 64 bytes from the start and
155 // 32 bytes from the end.
// All load pairs are issued before any store, so overlapping
// (memmove-style) copies are safe. A_l/A_h come from an elided
// "ldp A_l, A_h, [src]" earlier in the original file.
157 ldp B_l, B_h, [src, 16]
158 ldp C_l, C_h, [src, 32]
159 ldp D_l, D_h, [src, 48]
160 ldp E_l, E_h, [srcend, -32]
161 ldp F_l, F_h, [srcend, -16]
162 stp A_l, A_h, [dstin]
163 stp B_l, B_h, [dstin, 16]
164 stp C_l, C_h, [dstin, 32]
165 stp D_l, D_h, [dstin, 48]
166 stp E_l, E_h, [dstend, -32]
167 stp F_l, F_h, [dstend, -16]
170 // Align DST to 16 byte alignment so that we don't cross cache line
171 // boundaries on both loads and stores. There are at least 96 bytes
172 // to copy, so copy 16 bytes unaligned and then align. The loop
173 // copies 64 bytes per iteration and prefetches one iteration ahead.
// NOTE(review): the alignment computation (presumably tmp1 = dstin & 15
// and an aligned dst, plus the initial "ldp D_l, D_h, [src]" and the
// matching src adjustment), the loop label, and the b.ls/b.hi branches
// are elided from this fragment — confirm against the full file.
181 add count, count, tmp1 // Count is now 16 too large.
// Software-pipelined warm-up: store the first 16 bytes (D) unaligned
// at dstin, and preload the next 64 source bytes into A..D, with src
// advanced by 64 via the pre-indexed writeback form.
182 ldp A_l, A_h, [src, 16]
183 stp D_l, D_h, [dstin]
184 ldp B_l, B_h, [src, 32]
185 ldp C_l, C_h, [src, 48]
186 ldp D_l, D_h, [src, 64]!
187 subs count, count, 128 + 16 // Test and readjust count.
// Main loop body: store the 64 bytes loaded on the previous iteration
// while loading the next 64; dst and src each advance by 64 via the
// writeback forms on the [.., 64]! accesses.
190 stp A_l, A_h, [dst, 16]
191 ldp A_l, A_h, [src, 16]
192 stp B_l, B_h, [dst, 32]
193 ldp B_l, B_h, [src, 32]
194 stp C_l, C_h, [dst, 48]
195 ldp C_l, C_h, [src, 48]
196 stp D_l, D_h, [dst, 64]!
197 ldp D_l, D_h, [src, 64]!
198 subs count, count, 64
201 // Write the last full set of 64 bytes. The remainder is at most 64
202 // bytes, so it is safe to always copy 64 bytes from the end even if
203 // there is just 1 byte left.
// Drain the pipeline (A..D still hold 64 loaded bytes, stored at
// dst+16..dst+64), then copy the final 64 bytes addressed from
// srcend/dstend; these stores may overlap the ones just written,
// which is harmless since the same data is rewritten.
205 ldp E_l, E_h, [srcend, -64]
206 stp A_l, A_h, [dst, 16]
207 ldp A_l, A_h, [srcend, -48]
208 stp B_l, B_h, [dst, 32]
209 ldp B_l, B_h, [srcend, -32]
210 stp C_l, C_h, [dst, 48]
211 ldp C_l, C_h, [srcend, -16]
212 stp D_l, D_h, [dst, 64]
213 stp E_l, E_h, [dstend, -64]
214 stp A_l, A_h, [dstend, -48]
215 stp B_l, B_h, [dstend, -32]
216 stp C_l, C_h, [dstend, -16]
221 // All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
222 // Larger backwards copies are also handled by memcpy. The only remaining
223 // case is forward large copies. The destination is aligned, and an
224 // unrolled loop processes 64 bytes per iteration.
// InternalMemCopyMem — EDK2 CopyMem worker with memmove semantics
// (ASM_GLOBAL/ASM_PFX are the EDK2 symbol export/prefix macros).
// Presumed register roles (aliases are #defined in elided lines):
// dstin = destination, src = source, count = byte count — TODO confirm.
227 ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
228 ASM_PFX(InternalMemCopyMem):
// Overlap dispatch. tmp2 presumably = dstin - src and the flags come
// from an elided "cmp count, 96": if count > 96 (hi), compare tmp2
// against count, otherwise force the flags to 2 (carry set). The
// elided b.hs then tail-calls the memcpy path when dstin - src >=
// count (unsigned — covers dst below src as well) or count <= 96.
231 ccmp tmp2, count, 2, hi
// Remaining case must be copied backwards: compute one-past-the-end
// pointers for both buffers.
235 add dstend, dstin, count
236 add srcend, src, count
238 // Align dstend to 16 byte alignment so that we don't cross cache line
239 // boundaries on both loads and stores. There are at least 96 bytes
240 // to copy, so copy 16 bytes unaligned and then align. The loop
241 // copies 64 bytes per iteration and prefetches one iteration ahead.
// tmp2 presumably = dstend & 15 (computed in an elided "and" — TODO
// confirm). Store the last 16 bytes (D) unaligned, then pull srcend,
// dstend and count back by tmp2 so subsequent stores are 16-aligned.
244 ldp D_l, D_h, [srcend, -16]
245 sub srcend, srcend, tmp2
246 sub count, count, tmp2
// Warm-up for the software-pipelined backward loop: preload the last
// 64 aligned source bytes into A..D, srcend stepping down by 64 via
// the writeback form; the b.ls branch to the tail is elided.
247 ldp A_l, A_h, [srcend, -16]
248 stp D_l, D_h, [dstend, -16]
249 ldp B_l, B_h, [srcend, -32]
250 ldp C_l, C_h, [srcend, -48]
251 ldp D_l, D_h, [srcend, -64]!
252 sub dstend, dstend, tmp2
253 subs count, count, 128
// Backward main loop body: store the 64 bytes loaded on the previous
// iteration while loading the preceding 64; dstend and srcend each
// step down by 64 via the writeback forms on the [.., -64]! accesses.
// NOTE(review): the loop label and the b.hi back-branch are elided
// from this fragment.
257 stp A_l, A_h, [dstend, -16]
258 ldp A_l, A_h, [srcend, -16]
259 stp B_l, B_h, [dstend, -32]
260 ldp B_l, B_h, [srcend, -32]
261 stp C_l, C_h, [dstend, -48]
262 ldp C_l, C_h, [srcend, -48]
263 stp D_l, D_h, [dstend, -64]!
264 ldp D_l, D_h, [srcend, -64]!
265 subs count, count, 64
268 // Write the last full set of 64 bytes. The remainder is at most 64
269 // bytes, so it is safe to always copy 64 bytes from the start even if
270 // there is just 1 byte left.
// Drain the pipeline (A..D hold 64 bytes, stored at dstend-64..-16),
// then copy the first 64 source bytes to the start of the destination;
// overlap with the stores just made only rewrites identical data.
// NOTE(review): the "ldp C_l, C_h, [src]" that feeds the final store
// is elided from this fragment.
272 ldp E_l, E_h, [src, 48]
273 stp A_l, A_h, [dstend, -16]
274 ldp A_l, A_h, [src, 32]
275 stp B_l, B_h, [dstend, -32]
276 ldp B_l, B_h, [src, 16]
277 stp C_l, C_h, [dstend, -48]
279 stp D_l, D_h, [dstend, -64]
280 stp E_l, E_h, [dstin, 48]
281 stp A_l, A_h, [dstin, 32]
282 stp B_l, B_h, [dstin, 16]
283 stp C_l, C_h, [dstin]