//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l

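// Register naming: X_l/X_h hold the low and high 64-bit halves of a
// 16-byte chunk, and the *_lw names are 32-bit views of the same
// registers. Note that F_l/F_h alias srcend/dst and tmp1/tmp2 alias
// B_h/dst, so they are only used where the aliased values are no
// longer live.
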
// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
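// In this file, "memcpy" refers to the local __memcpy helper below and
// "memmove" to the exported InternalMemCopyMem entry point.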

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
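    // tmp1 = count - 1, so bit 6 being set means count > 64 (take the
    // copy96 path) and bit 5 being set means count > 32 (copy an
    // additional 32 bytes below).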
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
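    // tmp1 = count / 2 indexes the middle byte, so the three strb
    // instructions below cover every count from 1 to 3.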
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
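    // Main loop: store the 64 bytes loaded on the previous iteration
    // while loading the next 64, so loads stay one iteration ahead of
    // the stores.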
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
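    // Tail-call __memcpy when count <= 96 (it reads everything before
    // writing, so any overlap is safe) or when dstin - src >= count as
    // an unsigned value (a forward copy cannot overwrite unread source
    // bytes). The ccmp forces the hs condition to hold whenever
    // count <= 96. Otherwise fall through and copy backwards from the
    // end of the buffers.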
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

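    // Source and destination are identical: nothing to copy.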
    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
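    // Backward main loop: mirror of the loop in __memcpy, storing the
    // 64 bytes loaded on the previous iteration while walking srcend
    // and dstend downwards.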
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret