//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (this is faster than using a
// 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
// in which things occur in the original string, counting trailing zeros
// identifies exactly which byte has matched.
//

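//
// For reference, a rough C model of the per-chunk syndrome computation
// (editorial annotation, not part of the original source; the names
// BuildSyndrome, Chunk and Chr are illustrative):
//
//   #include <stdint.h>
//
//   // Build the syndrome for one 32-byte chunk: two bits per input
//   // byte, with bit 2*i set when Chunk[i] matches Chr.
//   uint64_t
//   BuildSyndrome (const uint8_t Chunk[32], uint8_t Chr)
//   {
//     uint64_t Synd = 0;
//     for (int Index = 0; Index < 32; Index++) {
//       if (Chunk[Index] == Chr) {
//         Synd |= 1ULL << (2 * Index);
//       }
//     }
//     return Synd;
//   }
//
// The index of the first match within the chunk is then ctz (Synd) / 2,
// which the tail code below computes with an rbit/clz pair.
//
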
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
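    // (Editorial annotation) 0x40100401 is the byte sequence 0x01, 0x04,
    // 0x10, 0x40 repeated through each 32-bit lane, i.e. a distinct bit
    // (bit 0, 2, 4 or 6) in each successive byte. After cmeq yields
    // 0xFF/0x00 per byte, ANDing with this mask leaves one unique bit per
    // matching byte, so the addp reductions below can fold all 32 match
    // flags into a single 64-bit word without any carries, placing the
    // flag for input byte i at syndrome bit 2*i.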
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // The input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes and
    // mask off the irrelevant part.
    //
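    // (Editorial annotation) Worked example: for srcin = 0x1005, soff is 5
    // and the aligned block starts at src = 0x1000. The block holds
    // 32 - 5 = 27 buffer bytes, so cntin is reduced by 27 below, and the
    // low 5 * 2 = 10 syndrome bits, which describe bytes before srcin,
    // are shifted out and back in as zeros.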

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
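    // (Editorial annotation) Inside the loop we only need to know whether
    // any byte matched, not which one, so a plain orr plus a single addp
    // is enough; the precise masked syndrome is recomputed at .Lend.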
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, so loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
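    // (Editorial annotation) tmp below becomes (32 - ((cntrem + soff) % 32)) * 2.
    // The lsl/lsr pair shifts the stale upper syndrome bits out and back in
    // as zeros. AArch64 register shifts use the amount modulo 64, so when
    // the buffer ends exactly on a 32-byte boundary (tmp == 64) the syndrome
    // is left untouched, which is the correct behaviour.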
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
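    // (Editorial annotation) There is no count-trailing-zeros instruction
    // available here, so rbit + clz is the usual idiom: the match index is
    // ctz (synd) / 2, added to the chunk base address to form the result.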
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate for the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret
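
//
// (Editorial annotation) For reference, the C-level declaration this routine
// backs, as declared in the library's internal header; shown here as an
// annotation, not part of this file:
//
//   CONST VOID *
//   EFIAPI
//   InternalMemScanMem8 (
//     IN CONST VOID  *Buffer,
//     IN UINTN       Length,
//     IN UINT8       Value
//     );
//
// A NULL return indicates the byte was not found within Length bytes.
//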