//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (this is faster than using a
// 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
// in which things occur in the original string, counting trailing zeros
// allows us to identify exactly which byte has matched.
//

ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
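    //
    // Illustrative C-style sketch (comment only, not assembled) of what the
    // prefix handling below does; syndrome32() is a hypothetical helper that
    // returns the 2-bits-per-byte match syndrome of one aligned 32-byte block:
    //
    //   uint64_t syn = syndrome32 (src);          // src = srcin & ~31
    //   cntin += soff - 32;                       // bytes left after this block
    //   syn = (syn >> (soff * 2)) << (soff * 2);  // drop matches before srcin
    //   if (cntin <= 0) goto masklast;            // first block is also the last
    //   if (syn != 0)   goto tail;                // match found in first block
    //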
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret
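
//
// For reference only (never assembled): a rough C equivalent of the routine
// above. The hand-written code obtains the same answer without a byte loop,
// by computing a NEON match syndrome for 32 bytes at a time.
//
//   void *
//   InternalMemScanMem8 (const void *srcin, size_t cntin, uint8_t chrin)
//   {
//     const uint8_t *p = (const uint8_t *)srcin;
//     for (size_t i = 0; i < cntin; i++) {
//       if (p[i] == chrin) {
//         return (void *)(p + i);      // address of the first matching byte
//       }
//     }
//     return NULL;                     // no match within cntin bytes
//   }
//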