//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//
// Arguments and results (AAPCS64).
#define srcin     x0      // in:  pointer to the buffer to scan
#define cntin     x1      // in:  number of bytes to scan
#define chrin     w2      // in:  byte value to search for

#define result    x0      // out: pointer to first match, or NULL

// Scratch registers (all caller-saved under AAPCS64; no stack use).
#define src       x3      // current 32-byte-aligned read pointer
#define tmp       x4      // general scratch
#define wtmp2     w5      // holds the 0x40100401 syndrome constant
#define synd      x6      // 64-bit syndrome value (2 bits per input byte)
#define soff      x9      // srcin's byte offset within its 32-byte block
#define cntrem    x10     // cntin % 32 (bytes in the final partial block)

// NEON registers.
#define vrepchr   v0      // chrin replicated into all 16 byte lanes
#define vdata1    v1      // first 16 bytes of the current 32-byte chunk
#define vdata2    v2      // second 16 bytes of the current 32-byte chunk
#define vhas_chr1 v3      // per-byte match mask (0xFF/0x00) for vdata1
#define vhas_chr2 v4      // per-byte match mask (0xFF/0x00) for vdata2
#define vrepmask  v5      // 0x40100401 replicated per 32-bit lane
#define vend      v6      // pairwise-reduction scratch
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting the trailing zeros allows us
// to identify exactly which byte has matched.
//
//
// CONST VOID *
// InternalMemScanMem8 (
//   IN CONST VOID  *Buffer,   // srcin / x0
//   IN UINTN       Length,    // cntin / x1
//   IN UINT8       Value      // chrin / w2
//   )
//
// Returns a pointer to the first occurrence of Value within the first Length
// bytes of Buffer, or NULL if it is not found. Length == 0 returns NULL
// without touching Buffer. Leaf function: no stack usage, only caller-saved
// registers are clobbered.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if there are no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks.
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31            // soff = misalignment; Z set if aligned
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // Input buffer is not 32-byte aligned. We calculate the syndrome value
    // for the whole aligned 32-byte block containing the first bytes and
    // then mask off the irrelevant leading part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp           // cntin -= (32 - soff); C/Z drive b.ls below
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b   // 256->128
    addp    vend.16b, vend.16b, vend.16b             // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits (bytes that precede srcin).
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last.
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we are out of data we finish regardless of the result.
    b.ls    .Lend
    // Use a fast check for the termination condition.
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We are not out of data: loop if we have not found the character.
    cbz     synd, .Lloop

.Lend:
    // Termination condition found: compute the full syndrome value.
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b   // 256->128
    addp    vend.16b, vend.16b, vend.16b             // 128->64
    mov     synd, vend.d[0]
    // Only do the upper-bit clear for the last possible block.
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits (bytes beyond
    // the end of the scan region). Note: a shift amount of 64 acts as 0 on
    // AArch64, which is correct when the region end is 32-byte aligned.
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversal.
    rbit    synd, synd
    // Compensate for the last post-increment of src.
    sub     src, src, #32
    // Check that we have actually found a character.
    cmp     synd, #0
    // Count the leading zeros (= trailing zeros of the original syndrome).
    clz     synd, synd
    // Compute the potential result: two syndrome bits per byte, hence lsr #1.
    add     result, src, synd, lsr #1
    // Select the result or NULL.
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret