//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (this is faster than using a
// 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
// in which things occur in the original string, counting trailing zeros
// allows us to identify exactly which byte matched.
//

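//
// A reference-only scalar sketch in C (not part of this file, inferred from
// the behaviour implemented below): it assumes the x0/x1/w2 arguments are the
// buffer pointer, the byte count, and the byte value to find, and that the
// routine returns a pointer to the first match or NULL. The name and types
// are illustrative, not the EDK II prototype.
//
//   static const unsigned char *
//   ScanMem8Ref (const unsigned char *Src, unsigned long Cnt, unsigned char Chr)
//   {
//     unsigned long Idx;
//
//     for (Idx = 0; Idx < Cnt; Idx++) {
//       if (Src[Idx] == Chr) {
//         return &Src[Idx];   // first matching byte
//       }
//     }
//     return 0;               // no match within Cnt bytes
//   }
//
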
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz   cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
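    // (Illustrative note, inferred from the code below: vrepmask repeats the
    // bytes 0x01, 0x04, 0x10, 0x40. After the CMEQ each matching byte is
    // 0xFF, so the AND leaves exactly one of those bit patterns per lane.
    // Two ADDP passes then fold the 32 lanes into one 64-bit word in which
    // bit 2*i is set when byte i of the block matched, giving the
    // two-bits-per-byte syndrome described above.)
    //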
    mov   wtmp2, #0x0401
    movk  wtmp2, #0x4010, lsl #16
    dup   vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic   src, srcin, #31
    dup   vrepmask.4s, wtmp2
    ands  soff, srcin, #31
    and   cntrem, cntin, #31
    b.eq  .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
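    // (Worked example, inferred from the code: if srcin is 5 bytes past a
    // 32-byte boundary, soff = 5 and the first load covers 5 bytes that lie
    // before srcin. The "adds cntin, cntin, tmp" below, with tmp = soff - 32,
    // charges the 32 - 5 = 27 bytes consumed from this block against the
    // count, and shifting the syndrome right then left by soff * 2 = 10
    // clears the match bits of the 5 bytes that precede srcin.)
    //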

    ld1   {vdata1.16b, vdata2.16b}, [src], #32
    sub   tmp, soff, #32
    adds  cntin, cntin, tmp
    cmeq  vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq  vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and   vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and   vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp  vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp  vend.16b, vend.16b, vend.16b                // 128->64
    mov   synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl   tmp, soff, #1
    lsr   synd, synd, tmp
    lsl   synd, synd, tmp
    // The first block can also be the last
    b.ls  .Lmasklast
    // Have we found something already?
    cbnz  synd, .Ltail

.Lloop:
    ld1   {vdata1.16b, vdata2.16b}, [src], #32
    subs  cntin, cntin, #32
    cmeq  vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq  vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls  .Lend
    // Use a fast check for the termination condition
    orr   vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp  vend.2d, vend.2d, vend.2d
    mov   synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz   synd, .Lloop

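    //
    // The fast check in the loop above only ORs the comparison results, which
    // says whether some byte in the block matched but not reliably where.
    // Once the loop exits, the positional syndrome is rebuilt with vrepmask
    // so the tail code can locate the exact byte.
    //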
.Lend:
    // Termination condition found, let's calculate the syndrome value
    and   vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and   vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp  vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp  vend.16b, vend.16b, vend.16b                // 128->64
    mov   synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi  .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
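    // (Worked example, inferred from the code: with soff = 3 and cntin = 17,
    // cntrem = 17 and (cntrem + soff) % 32 = 20, so only block offsets 3..19
    // lie inside the buffer. "neg tmp, tmp, lsl #1" turns (20 - 32) into
    // (32 - 20) * 2 = 24, and shifting synd left then right by 24 clears the
    // match bits of the 12 bytes past the end. Variable shift amounts are
    // taken modulo 64, so a block that ends exactly on a 32-byte boundary is
    // left untouched.)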
    add   tmp, cntrem, soff
    and   tmp, tmp, #31
    sub   tmp, tmp, #32
    neg   tmp, tmp, lsl #1
    lsl   synd, synd, tmp
    lsr   synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit  synd, synd
    // Compensate the last post-increment
    sub   src, src, #32
    // Check that we have found a character
    cmp   synd, #0
    // And count the leading zeros
    clz   synd, synd
    // Compute the potential result
    add   result, src, synd, lsr #1
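    // (Illustrative note, inferred from the code: the syndrome holds two bits
    // per byte, so the trailing-zero count is twice the byte offset. A match
    // at block offset 7 sets bit 14, rbit + clz yields 14, and 14 >> 1 = 7 is
    // added to the block base restored above.)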
    // Select result or NULL
    csel  result, xzr, result, eq
    ret

.Lzero_length:
    mov   result, #0
    ret