//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in
// which things occur in the original string, counting trailing zeros
// identifies exactly which byte has matched.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop
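
    //
    // Illustration with a hypothetical address: for srcin = 0x1007, the
    // bic gives src = 0x1000 and soff = 7, so the first load below starts
    // 7 bytes before the buffer. This is safe: the access stays inside a
    // single aligned 32-byte block and so cannot cross a page boundary.
    //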

    //
    // The input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
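    // (Continuing the srcin = 0x1007 illustration: soff = 7 clears bits
    // 0-13, discarding any match in the 7 bytes read before the buffer.)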
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data, we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, so loop if we haven't found the character
    cbz     synd, .Lloop
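    //
    // Note on the loop above: the in-loop check deliberately skips the
    // vrepmask AND and the narrowing addp reductions; a single orr plus
    // one 2d addp is enough to answer "any match at all?", and the
    // precise syndrome is computed only once, at .Lend below.
    //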

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp
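    // Illustration: with soff = 7 and cntrem = 5 the scan ends at offset
    // (7 + 5) & 31 = 12 of the block, so the top (32 - 12) * 2 = 40 bits
    // are cleared, keeping syndrome bits for bytes 0-11 only. (An end
    // offset of 0 yields a shift of 64, which lsl/lsr take mod 64,
    // clearing nothing: the whole block is then valid.)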
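//
// The tail turns the syndrome into a pointer: rbit followed by clz is a
// count of trailing zeros, and with two syndrome bits per byte,
// ctz(synd) >> 1 is the byte offset within the block. The cmp runs before
// the clz because clz of an all-zero syndrome returns 64, which would
// otherwise look like a valid offset; the csel uses the saved flags to
// return NULL in that case.
//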
.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate for the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret