//
// MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
// Accelerated AARCH64 SetMem routines for MdePkg/BaseMemoryLibOptDxe.
//
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

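// Register aliases: x0 and x1 hold the incoming Buffer pointer and Length
// arguments, x2/w2 the fill value, and the remaining registers are
// temporaries used by the fill code below.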
#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

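// Each entry point below broadcasts its fill pattern across all 16 bytes of
// v0 and then joins the common code at label 0:, which uses v0 (and its low
// 64 bits in val) for every store.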
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1        // Length is given in 16-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2        // Length is given in 32-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3        // Length is given in 64-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count
    mov     val, v0.D[0]

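    // Dispatch on the byte count: more than 96 bytes takes the bulk path
    // (which may use DC ZVA when storing zeroes), 16..96 bytes uses
    // overlapping 16-byte stores, and smaller sizes fall through to the
    // small-size code below.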
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
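    // Test the individual bits of count and use potentially overlapping
    // stores from both ends of the buffer instead of a byte loop.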
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
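    // Set more than 96 bytes.  The first 16 bytes are stored unaligned from
    // dstin and dst is rounded down to a 16-byte boundary; runs of at least
    // 256 bytes with a zero fill value try the DC ZVA cache-zeroing path.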
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs
    b.eq    L(try_zva)
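    // Plain store loop: write 64 bytes per iteration with two STP Q pairs,
    // then finish the last (possibly partial) 64 bytes with overlapping
    // stores relative to dstend.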
L(no_zva):
    sub     count, dstend, dst      // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
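    // DCZID_EL0 bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0 give
    // log2 of the block size in words, so 4 means 64 bytes and 5 means
    // 128 bytes.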
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+64+64 // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
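    // 128-byte DC ZVA blocks: fill up to the first 128-byte boundary with
    // STP stores, zero whole blocks with DC ZVA, then finish with
    // overlapping stores relative to dstend.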
L(zva_128):
    cmp     tmp1w, 5                // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+128   // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

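    // Any other DC ZVA block size: compute the size in bytes (4 << BS),
    // fall back to the plain store loop if the buffer is too small to be
    // worth aligning, otherwise store up to the first block boundary, zero
    // whole blocks with DC ZVA, and let L(tail64) handle the remainder.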
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64       // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst        // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1     // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)