MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

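//
// AArch64 implementations of the BaseMemoryLib internal fill helpers:
// InternalMemSetMem, InternalMemSetMem16/32/64 and InternalMemZeroMem.
// Arguments arrive per AAPCS64 as x0 = destination, x1 = length (in
// elements for the sized variants), x2/w2 = fill value.  Each sized entry
// point broadcasts the value into v0 and scales the element count to a
// byte count before reaching the shared path at label 0.
//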
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count
    mov     val, v0.D[0]

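    // Dispatch on the total byte count: more than 96 bytes takes the bulk
    // path at set_long, 16..96 bytes go to set_medium, and anything
    // smaller falls through to the scalar stores below.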
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
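    // Branch on individual bits of count: bit 3 means at least 8 bytes
    // remain, so two (possibly overlapping) 8-byte stores cover the range;
    // bit 2 likewise selects two 4-byte stores; for 1..3 bytes a byte store
    // plus an optional halfword store at the end finish the job.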
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
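    // Use DC ZVA only when the fill value is zero and at least 256 bytes
    // are being set: the ccmp forces a not-equal result when count < 256,
    // so the b.eq falls through to the plain store loop otherwise.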
    cmp     count, 256
    ccmp    val, 0, 0, cs
    b.eq    L(try_zva)
L(no_zva):
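    // Bulk fill without DC ZVA: store 64 bytes per iteration from the
    // 16-byte aligned base, then write the final (possibly overlapping)
    // 64 bytes back from dstend.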
    sub     count, dstend, dst        // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16     // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(try_zva):
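    // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0
    // give log2 of the ZVA block size in 4-byte words, so 4 = 64 bytes and
    // 5 = 128 bytes.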
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+64+64   // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                  // ZVA size is 128 bytes.
    b.ne    L(zva_other)

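    // Same head/body/tail scheme with a 128-byte ZVA block: write the first
    // 128 bytes with stp, align dst down to 128 bytes, zero whole blocks
    // with DC ZVA, and finish the last 128 bytes with four stp stores.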
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+128     // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

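    // Handle any other ZVA block size: compute the block size in bytes as
    // 4 << DCZID_EL0.BS, fall back to the plain store loop when fewer than
    // zva_len + 64 bytes remain, otherwise align dst up to a block boundary
    // with stp stores, zero whole blocks with DC ZVA, and let tail64 write
    // the remainder.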
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64         // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst          // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2          // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1       // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)