//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//

#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

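//
// Each typed entry point below broadcasts the fill value into SIMD
// register v0 and converts its element count into a byte count, then
// branches to the shared body at local label 0. The shared body splits
// the work into three size classes: 0..15 bytes, 16..96 bytes, and
// everything larger.
//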
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw             // Replicate the 16-bit value across v0.
    lsl     count, count, #1        // Scale element count to bytes.
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw             // Replicate the 32-bit value across v0.
    lsl     count, count, #2        // Scale element count to bytes.
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val              // Replicate the 64-bit value across v0.
    lsl     count, count, #3        // Scale element count to bytes.
    b       0f

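// Zeroing fills v0 with zero, which also makes the DC ZVA fast path in
// set_long applicable, since DC ZVA can only write zeroes.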
ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

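// InternalMemSetMem already receives its count in bytes, so it only
// needs to broadcast the byte value before reaching the shared body.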
ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw            // Replicate the byte value across v0.
0:  add     dstend, dstin, count    // Shared body: compute one-past-the-end.
    mov     val, v0.D[0]            // Keep a 64-bit copy for scalar stores.

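// Dispatch on size: more than 96 bytes takes the bulk path, 16..96
// bytes the medium path, and anything smaller falls through.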
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

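// The short path tests individual bits of count and stores from both
// ends of the buffer; the stores may overlap, which avoids a loop.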
    // Set 0..15 bytes.
    tbz     count, 3, 1f            // No 8-byte chunk? Try 4 bytes.
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f            // No 4-byte chunk? Try 1 and 2 bytes.
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

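// The medium path stores 16 bytes at the start unconditionally, then
// uses bits 6 and 5 of count to decide how much more to store, again
// letting stores from the start and the end overlap.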
    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)      // 64..96 bytes need the bigger stores.
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes. Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

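// Bulk path: align dst down to 16 bytes and store the first 16 bytes
// unaligned. The ccmp makes the following b.eq taken only when both
// count >= 256 and the fill value is zero, i.e. when DC ZVA is worth
// probing; otherwise fall into a plain 64-bytes-per-iteration stp loop.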
    .p2align 3
    nop
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs           // If count >= 256, compare val with 0.
    b.eq    L(try_zva)              // Taken only for large zeroing sets.
L(no_zva):
    sub     count, dstend, dst      // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]   // Final (possibly overlapping) 64 bytes.
    stp     q0, q0, [dstend, -32]
    ret

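// DC ZVA zeroes an entire cache-line-sized block per instruction.
// DCZID_EL0 bit 4 (DZP) set means ZVA is prohibited; the low four bits
// give log2 of the block size in words, so 4 means 64 bytes and 5 means
// 128 bytes. Those two common sizes get dedicated loops below.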
    .p2align 3
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA. This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+64+64 // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

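// Same idea with a 128-byte ZVA block: pre-fill up to the first aligned
// block with stp, zero whole blocks with DC ZVA, then finish the last
// 128 bytes from dstend with overlapping stores.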
    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+128   // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

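// Generic ZVA sizes: compute zva_len = 4 << BS from DCZID_EL0. If the
// buffer is smaller than one block plus the worst-case alignment
// overhead, ZVA cannot help and the plain stp loop is used instead.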
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64       // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst        // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1     // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)