]>
Commit | Line | Data |
---|---|---|
c86cd1e1 AB |
1 | //\r |
2 | // Copyright (c) 2014, ARM Limited\r | |
3 | // All rights Reserved.\r | |
aa1b377e | 4 | // SPDX-License-Identifier: BSD-2-Clause-Patent\r |
c86cd1e1 AB |
5 | //\r |
6 | \r | |
7 | // Assumptions:\r | |
8 | //\r | |
9 | // ARMv8-a, AArch64\r | |
10 | // Neon Available.\r | |
11 | //\r | |
12 | \r | |
13 | // Arguments and results.\r | |
14 | #define srcin x0\r | |
15 | #define cntin x1\r | |
16 | #define chrin w2\r | |
17 | \r | |
18 | #define result x0\r | |
19 | \r | |
20 | #define src x3\r | |
9095d37b | 21 | #define tmp x4\r |
c86cd1e1 AB |
22 | #define wtmp2 w5\r |
23 | #define synd x6\r | |
24 | #define soff x9\r | |
25 | #define cntrem x10\r | |
26 | \r | |
27 | #define vrepchr v0\r | |
28 | #define vdata1 v1\r | |
29 | #define vdata2 v2\r | |
30 | #define vhas_chr1 v3\r | |
31 | #define vhas_chr2 v4\r | |
32 | #define vrepmask v5\r | |
33 | #define vend v6\r | |
34 | \r | |
35 | //\r | |
36 | // Core algorithm:\r | |
37 | //\r | |
38 | // For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits\r | |
39 | // per byte. For each tuple, bit 0 is set if the relevant byte matched the\r | |
40 | // requested character and bit 1 is not used (faster than using a 32bit\r | |
41 | // syndrome). Since the bits in the syndrome reflect exactly the order in which\r | |
42 | // things occur in the original string, counting trailing zeros allows to\r | |
43 | // identify exactly which byte has matched.\r | |
44 | //\r | |
45 | \r | |
46 | ASM_GLOBAL ASM_PFX(InternalMemScanMem8)\r | |
47 | ASM_PFX(InternalMemScanMem8):\r | |
48 | // Do not dereference srcin if no bytes to compare.\r | |
9095d37b | 49 | cbz cntin, .Lzero_length\r |
c86cd1e1 AB |
50 | //\r |
51 | // Magic constant 0x40100401 allows us to identify which lane matches\r | |
52 | // the requested byte.\r | |
53 | //\r | |
54 | mov wtmp2, #0x0401\r | |
55 | movk wtmp2, #0x4010, lsl #16\r | |
56 | dup vrepchr.16b, chrin\r | |
57 | // Work with aligned 32-byte chunks\r | |
58 | bic src, srcin, #31\r | |
59 | dup vrepmask.4s, wtmp2\r | |
60 | ands soff, srcin, #31\r | |
61 | and cntrem, cntin, #31\r | |
62 | b.eq .Lloop\r | |
63 | \r | |
64 | //\r | |
65 | // Input string is not 32-byte aligned. We calculate the syndrome\r | |
66 | // value for the aligned 32 bytes block containing the first bytes\r | |
67 | // and mask the irrelevant part.\r | |
68 | //\r | |
69 | \r | |
70 | ld1 {vdata1.16b, vdata2.16b}, [src], #32\r | |
71 | sub tmp, soff, #32\r | |
72 | adds cntin, cntin, tmp\r | |
73 | cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r | |
74 | cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r | |
75 | and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r | |
76 | and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r | |
77 | addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r | |
78 | addp vend.16b, vend.16b, vend.16b // 128->64\r | |
79 | mov synd, vend.d[0]\r | |
80 | // Clear the soff*2 lower bits\r | |
81 | lsl tmp, soff, #1\r | |
82 | lsr synd, synd, tmp\r | |
83 | lsl synd, synd, tmp\r | |
84 | // The first block can also be the last\r | |
85 | b.ls .Lmasklast\r | |
86 | // Have we found something already?\r | |
87 | cbnz synd, .Ltail\r | |
88 | \r | |
89 | .Lloop:\r | |
90 | ld1 {vdata1.16b, vdata2.16b}, [src], #32\r | |
91 | subs cntin, cntin, #32\r | |
92 | cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r | |
93 | cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r | |
94 | // If we're out of data we finish regardless of the result\r | |
95 | b.ls .Lend\r | |
96 | // Use a fast check for the termination condition\r | |
97 | orr vend.16b, vhas_chr1.16b, vhas_chr2.16b\r | |
98 | addp vend.2d, vend.2d, vend.2d\r | |
99 | mov synd, vend.d[0]\r | |
100 | // We're not out of data, loop if we haven't found the character\r | |
101 | cbz synd, .Lloop\r | |
102 | \r | |
103 | .Lend:\r | |
104 | // Termination condition found, let's calculate the syndrome value\r | |
105 | and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r | |
106 | and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r | |
107 | addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r | |
108 | addp vend.16b, vend.16b, vend.16b // 128->64\r | |
109 | mov synd, vend.d[0]\r | |
110 | // Only do the clear for the last possible block\r | |
111 | b.hi .Ltail\r | |
112 | \r | |
113 | .Lmasklast:\r | |
114 | // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits\r | |
115 | add tmp, cntrem, soff\r | |
116 | and tmp, tmp, #31\r | |
117 | sub tmp, tmp, #32\r | |
118 | neg tmp, tmp, lsl #1\r | |
119 | lsl synd, synd, tmp\r | |
120 | lsr synd, synd, tmp\r | |
121 | \r | |
122 | .Ltail:\r | |
123 | // Count the trailing zeros using bit reversing\r | |
124 | rbit synd, synd\r | |
125 | // Compensate the last post-increment\r | |
126 | sub src, src, #32\r | |
127 | // Check that we have found a character\r | |
128 | cmp synd, #0\r | |
129 | // And count the leading zeros\r | |
130 | clz synd, synd\r | |
131 | // Compute the potential result\r | |
132 | add result, src, synd, lsr #1\r | |
133 | // Select result or NULL\r | |
134 | csel result, xzr, result, eq\r | |
135 | ret\r | |
136 | \r | |
137 | .Lzero_length:\r | |
138 | mov result, #0\r | |
139 | ret\r |