//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3
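
// Note: F_l/F_h and tmp1/tmp2 alias registers that already have other names
// above (srcend, dst, x9, x3). This is deliberate: each alias is only used
// at points where the value held under the other name is no longer needed.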

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as for non-overlapping copies.

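// Roughly, the size dispatch below behaves like the following C-style
// sketch (illustration only, not part of the implementation):
//
//   if (count <= 16)       goto copy16;     // 0..16 bytes
//   else if (count <= 96)  /* medium */ ;   // 17..96 bytes, fully unrolled
//   else                   goto copy_long;  // align dst, 64 bytes/iteration
//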
__memcpy:
    prfm PLDL1KEEP, [src]
    add srcend, src, count
    add dstend, dstin, count
    cmp count, 16
    b.ls L(copy16)
    cmp count, 96
    b.hi L(copy_long)

// Medium copies: 17..96 bytes.
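// The branches below test bits of tmp1 = count - 1: bit 6 is set only
// when count is 65..96, which is handed off to L(copy96); for the
// remaining 17..64 byte copies, bit 5 distinguishes 33..64 (B and C are
// copied as well) from 17..32 (A and D alone cover the buffer).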
    sub tmp1, count, 1
    ldp A_l, A_h, [src]
    tbnz tmp1, 6, L(copy96)
    ldp D_l, D_h, [srcend, -16]
    tbz tmp1, 5, 1f
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [srcend, -32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstend, -32]
1:
    stp A_l, A_h, [dstin]
    stp D_l, D_h, [dstend, -16]
    ret

    .p2align 4
// Small copies: 0..16 bytes.
L(copy16):
    cmp count, 8
    b.lo 1f
    ldr A_l, [src]
    ldr A_h, [srcend, -8]
    str A_l, [dstin]
    str A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz count, 2, 1f
    ldr A_lw, [src]
    ldr A_hw, [srcend, -4]
    str A_lw, [dstin]
    str A_hw, [dstend, -4]
    ret

// Copy 0..3 bytes. Use a branchless sequence that copies the same
// byte 3 times if count==1, or the 2nd byte twice if count==2.
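// For example, with count==2: tmp1 = 1, so the loads pick up byte 0, the
// last byte (byte 1) and byte[tmp1] (also byte 1); the three stores then
// write byte 0 once and byte 1 twice. With count==1 all loads and stores
// refer to the same single byte.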
1:
    cbz count, 2f
    lsr tmp1, count, 1
    ldrb A_lw, [src]
    ldrb A_hw, [srcend, -1]
    ldrb B_lw, [src, tmp1]
    strb A_lw, [dstin]
    strb B_lw, [dstin, tmp1]
    strb A_hw, [dstend, -1]
2:  ret

    .p2align 4
// Copy 65..96 bytes. Copy 64 bytes from the start and
// 32 bytes from the end.
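// The two regions overlap in the middle whenever count < 96; this is
// harmless because every load below is issued before any store.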
L(copy96):
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [src, 32]
    ldp D_l, D_h, [src, 48]
    ldp E_l, E_h, [srcend, -32]
    ldp F_l, F_h, [srcend, -16]
    stp A_l, A_h, [dstin]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin, 32]
    stp D_l, D_h, [dstin, 48]
    stp E_l, E_h, [dstend, -32]
    stp F_l, F_h, [dstend, -16]
    ret

// Align dst to a 16-byte boundary so that we don't cross cache line
// boundaries on both loads and stores. There are at least 96 bytes
// to copy, so copy 16 bytes unaligned and then align. The loop
// copies 64 bytes per iteration, with the loads running one iteration
// ahead of the stores.
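//
// Concretely: tmp1 = dstin & 15; dst is dstin rounded down to 16 bytes,
// src is rewound by the same tmp1 bytes and count is grown by tmp1, so
// that offsets from dst and src stay in step. The first 16 (possibly
// unaligned) destination bytes are written separately from D.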

    .p2align 4
L(copy_long):
    and tmp1, dstin, 15
    bic dst, dstin, 15
    ldp D_l, D_h, [src]
    sub src, src, tmp1
    add count, count, tmp1      // Count is now 16 too large.
    ldp A_l, A_h, [src, 16]
    stp D_l, D_h, [dstin]
    ldp B_l, B_h, [src, 32]
    ldp C_l, C_h, [src, 48]
    ldp D_l, D_h, [src, 64]!
    subs count, count, 128 + 16 // Test and readjust count.
    b.ls 2f
1:
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [src, 16]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [src, 32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [src, 48]
    stp D_l, D_h, [dst, 64]!
    ldp D_l, D_h, [src, 64]!
    subs count, count, 64
    b.hi 1b

// Write the last full set of 64 bytes. The remainder is at most 64
// bytes, so it is safe to always copy 64 bytes from the end even if
// there is just 1 byte left.
2:
    ldp E_l, E_h, [srcend, -64]
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [srcend, -48]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [srcend, -16]
    stp D_l, D_h, [dst, 64]
    stp E_l, E_h, [dstend, -64]
    stp A_l, A_h, [dstend, -48]
    stp B_l, B_h, [dstend, -32]
    stp C_l, C_h, [dstend, -16]
    ret


//
// All memmoves of up to 96 bytes are done by memcpy, as it supports
// overlaps at those sizes. Larger copies whose destination lies below
// the source (or does not overlap it) are also handled by memcpy. The
// only remaining case is a large copy to an overlapping, higher
// destination, which is done backwards from the end: the destination
// end is aligned, and an unrolled loop processes 64 bytes per iteration.
//

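// Dispatch (a sketch of the cmp/ccmp/b.hs sequence below): when
// count > 96 the ccmp compares dstin - src against count; otherwise it
// sets NZCV to #2, forcing the C flag. As a result b.hs branches to
// __memcpy whenever count <= 96 or dstin - src >= count (unsigned),
// i.e. whenever an ascending copy is safe. Only the overlapping,
// higher-destination case falls through.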
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub tmp2, dstin, src
    cmp count, 96
    ccmp tmp2, count, 2, hi
    b.hs __memcpy

    cbz tmp2, 3f
    add dstend, dstin, count
    add srcend, src, count

// Align dstend to a 16-byte boundary so that we don't cross cache line
// boundaries on both loads and stores. There are at least 96 bytes
// to copy, so copy 16 bytes unaligned and then align. The loop
// copies 64 bytes per iteration, with the loads running one iteration
// ahead of the stores.
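//
// Mirroring the forward case: tmp2 = dstend & 15; srcend, dstend and
// count are all reduced by tmp2, so the loop below works back from a
// 16-byte aligned dstend, and the last 16 (possibly unaligned) bytes
// are written separately from D.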

    and tmp2, dstend, 15
    ldp D_l, D_h, [srcend, -16]
    sub srcend, srcend, tmp2
    sub count, count, tmp2
    ldp A_l, A_h, [srcend, -16]
    stp D_l, D_h, [dstend, -16]
    ldp B_l, B_h, [srcend, -32]
    ldp C_l, C_h, [srcend, -48]
    ldp D_l, D_h, [srcend, -64]!
    sub dstend, dstend, tmp2
    subs count, count, 128
    b.ls 2f
    nop
1:
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [srcend, -16]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [srcend, -48]
    stp D_l, D_h, [dstend, -64]!
    ldp D_l, D_h, [srcend, -64]!
    subs count, count, 64
    b.hi 1b

// Write the last full set of 64 bytes. The remainder is at most 64
// bytes, so it is safe to always copy 64 bytes from the start even if
// there is just 1 byte left.
2:
    ldp E_l, E_h, [src, 48]
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [src, 32]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [src, 16]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [src]
    stp D_l, D_h, [dstend, -64]
    stp E_l, E_h, [dstin, 48]
    stp A_l, A_h, [dstin, 32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin]
3:  ret