//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in
// which things occur in the original string, counting trailing zeros
// identifies exactly which byte has matched.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop
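
    //
    // Illustration with a hypothetical address: for srcin = 0x1007, the
    // bic gives src = 0x1000 and soff = 7, so the first load below starts
    // 7 bytes before the buffer. This is safe: the access stays inside a
    // single aligned 32-byte block and so cannot cross a page boundary.
    //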

    //
    // The input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
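    // (Continuing the srcin = 0x1007 illustration: soff = 7 clears bits
    // 0-13, discarding any match in the 7 bytes read before the buffer.)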
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data, we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, so loop if we haven't found the character
    cbz     synd, .Lloop
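    //
    // Note on the loop above: the in-loop check deliberately skips the
    // vrepmask AND and the narrowing addp reductions; a single orr plus
    // one 2d addp is enough to answer "any match at all?", and the
    // precise syndrome is computed only once, at .Lend below.
    //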

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp
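    // Illustration: with soff = 7 and cntrem = 5 the scan ends at offset
    // (7 + 5) & 31 = 12 of the block, so the top (32 - 12) * 2 = 40 bits
    // are cleared, keeping syndrome bits for bytes 0-11 only. (An end
    // offset of 0 yields a shift of 64, which lsl/lsr take mod 64,
    // clearing nothing: the whole block is then valid.)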
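//
// The tail turns the syndrome into a pointer: rbit followed by clz is a
// count of trailing zeros, and with two syndrome bits per byte,
// ctz(synd) >> 1 is the byte offset within the block. The cmp runs before
// the clz because clz of an all-zero syndrome returns 64, which would
// otherwise look like a valid offset; the csel uses the saved flags to
// return NULL in that case.
//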
.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate for the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret