//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin    x0
#define count    x1
#define val      x2
#define valw     w2
#define dst      x3
#define dstend   x4
#define tmp1     x5
#define tmp1w    w5
#define tmp2     x6
#define tmp2w    w6
#define zva_len  x7
#define zva_lenw w7

#define L(l) .L ## l

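//
// Each entry point below differs only in how the fill value is broadcast
// into v0 (16-bit, 32-bit or 64-bit lanes, all zeroes, or every byte).
// All of them branch to the shared body at label 0:, which computes
// dstend = dstin + count and then dispatches on the length.
//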
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
        dup     v0.8H, valw
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
        dup     v0.4S, valw
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
        dup     v0.2D, val
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
        movi    v0.16B, #0
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
        dup     v0.16B, valw
0:      add     dstend, dstin, count
        mov     val, v0.D[0]

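        // val now holds the low 64 bits of the fill pattern, for the
        // scalar stores on the short path.  Dispatch on the length:
        // more than 96 bytes takes the bulk path, 16..96 bytes the
        // medium path, and anything shorter is handled with a few
        // overlapping stores.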
        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)

        // Set 0..15 bytes.
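        // Test individual bits of count: if bit 3 is set (8..15 bytes),
        // an 8-byte store at the start plus one ending at dstend cover
        // the whole range, with any overlap being harmless.  The same
        // trick handles the 4..7 and 1..3 byte cases below.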
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend, -2]
3:      ret

        // Set 16..96 bytes.
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend, -32]
1:      ret

        .p2align 4
        // Set 64..96 bytes.  Write 64 bytes from the start and
        // 32 bytes from the end.
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
        nop
L(set_long):
        bic     dst, dstin, 15
        str     q0, [dstin]
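        // Use DC ZVA only for zero fills of at least 256 bytes: the
        // ccmp compares val with 0 only when count >= 256 (carry set
        // by the cmp), and otherwise forces the Z flag clear so the
        // b.eq falls through to the plain store loop.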
        cmp     count, 256
        ccmp    val, 0, 0, cs
        b.eq    L(try_zva)
L(no_zva):
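        // Fill 64 bytes per iteration with two stp of q0.  The count
        // bias below makes the final 64 bytes always be written
        // relative to dstend, so a tail shorter than 64 bytes is
        // covered by overlapping stores rather than a byte loop.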
        sub     count, dstend, dst      // Count is 16 too large.
        add     dst, dst, 16
        sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:      stp     q0, q0, [dst], 64
        stp     q0, q0, [dst, -32]
L(tail64):
        subs    count, count, 64
        b.hi    1b
2:      stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(try_zva):
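        // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits
        // [3:0] hold log2 of the block size in 4-byte words, so 4
        // means 64-byte blocks and 5 means 128-byte blocks.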
        mrs     tmp1, dczid_el0
        tbnz    tmp1w, 4, L(no_zva)
        and     tmp1w, tmp1w, 15
        cmp     tmp1w, 4                // ZVA size is 64 bytes.
        b.ne    L(zva_128)

        // Write the first and last 64-byte aligned blocks using stp
        // rather than using DC ZVA.  This is faster on some cores.
L(zva_64):
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        sub     count, dstend, dst      // Count is now 128 too large.
        sub     count, count, 128+64+64 // Adjust count and bias for loop.
        add     dst, dst, 128
        nop
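        // Zero whole 64-byte blocks with DC ZVA.  The bias applied to
        // count above makes the loop exit with 64..128 bytes left,
        // which the four stp stores after the loop cover; the last
        // pair is addressed from dstend so a partial block is written
        // twice rather than skipped.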
1:      dc      zva, dst
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
        stp     q0, q0, [dst, 0]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(zva_128):
        cmp     tmp1w, 5                // ZVA size is 128 bytes.
        b.ne    L(zva_other)
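
        // Fill up to the first 128-byte boundary with stp stores, zero
        // whole 128-byte blocks with DC ZVA, and always write the last
        // 128 bytes with stp stores addressed from dstend.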
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst      // Count is now 128 too large.
        sub     count, count, 128+128   // Adjust count and bias for loop.
        add     dst, dst, 128
1:      dc      zva, dst
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
        stp     q0, q0, [dstend, -128]
        stp     q0, q0, [dstend, -96]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

L(zva_other):
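        // Unknown block size: compute it in bytes from DCZID_EL0
        // (4 bytes shifted left by the BS field) and fall back to the
        // plain store loop when the buffer is too small to be worth
        // aligning.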
        mov     tmp2w, 4
        lsl     zva_lenw, tmp2w, tmp1w
        add     tmp1, zva_len, 64       // Max alignment bytes written.
        cmp     count, tmp1
        b.lo    L(no_zva)

        sub     tmp2, zva_len, 1
        add     tmp1, dst, zva_len
        add     dst, dst, 16
        subs    count, tmp1, dst        // Actual alignment bytes to write.
        bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
        b.eq    2f
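        // Store 64 bytes per iteration until dst reaches tmp1, the
        // first zva_len-aligned address; overrunning the boundary a
        // little is harmless because the DC ZVA loop restarts at the
        // aligned address with the same (zero) value.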
1:      stp     q0, q0, [dst], 64
        stp     q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
2:      mov     dst, tmp1
        sub     count, dstend, tmp1     // Remaining bytes to write.
        subs    count, count, zva_len
        b.lo    4f
3:      dc      zva, dst
        add     dst, dst, zva_len
        subs    count, count, zva_len
        b.hs    3b
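        // Fewer than zva_len bytes remain: restore count and let the
        // common 64-byte store tail finish the buffer, with the final
        // stores addressed from dstend.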
4:      add     count, count, zva_len
        b       L(tail64)