2 // Copyright (c) 2012 - 2016, Linaro Limited
3 // All rights reserved.
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are met:
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright
10 // notice, this list of conditions and the following disclaimer in the
11 // documentation and/or other materials provided with the distribution.
12 // * Neither the name of the Linaro nor the
13 // names of its contributors may be used to endorse or promote products
14 // derived from this software without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 // Copyright (c) 2015 ARM Ltd
31 // All rights reserved.
33 // Redistribution and use in source and binary forms, with or without
34 // modification, are permitted provided that the following conditions are met:
36 // 1. Redistributions of source code must retain the above copyright
37 // notice, this list of conditions and the following disclaimer.
38 // 2. Redistributions in binary form must reproduce the above copyright
39 // notice, this list of conditions and the following disclaimer in the
40 // documentation and/or other materials provided with the distribution.
41 // 3. The name of the company may not be used to endorse or promote
42 // products derived from this software without specific prior written permission.
45 // THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
46 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
47 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
48 // IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
49 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
50 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
51 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
52 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
53 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
54 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59 // ARMv8-a, AArch64, unaligned accesses.
89 // Copies are split into 3 main cases: small copies of up to 16 bytes,
90 // medium copies of 17..96 bytes which are fully unrolled. Large copies
91 // of more than 96 bytes align the destination and use an unrolled loop
92 // processing 64 bytes per iteration.
93 // Small and medium copies read all data before writing, allowing any
94 // kind of overlap, and memmove tailcalls memcpy for these cases as
95 // well as non-overlapping copies.
// memcpy forward path (fragment): srcend/dstend become one-past-the-end
// pointers; the medium case copies from both ends, issuing all loads
// before the overlapping stores so any src/dst overlap is tolerated.
// NOTE(review): this is an excerpt — the dispatch branches for the
// small (<=16) and large (>96) cases, the computation of tmp1, the
// "ldp A_l, A_h, [src]" and the guard that skips the middle 32 bytes
// for short counts are elided from this view.
99 add srcend, src, count
100 add dstend, dstin, count
106 // Medium copies: 17..96 bytes.
// tmp1 presumably holds count-1 (set in an elided line — TODO confirm);
// bit 6 set => count >= 65, which needs the wider copy96 sequence.
109 tbnz tmp1, 6, L(copy96)
110 ldp D_l, D_h, [srcend, -16]
// Middle 32 bytes: 16 at [src,16] and 16 ending at srcend-16,
// loads first, then stores.
112 ldp B_l, B_h, [src, 16]
113 ldp C_l, C_h, [srcend, -32]
114 stp B_l, B_h, [dstin, 16]
115 stp C_l, C_h, [dstend, -32]
// First and last 16 bytes; A_l/A_h were loaded in an elided line.
117 stp A_l, A_h, [dstin]
118 stp D_l, D_h, [dstend, -16]
122 // Small copies: 0..16 bytes.
// 8..16 bytes: copy the first and last 8 bytes (the two accesses may
// overlap in the middle). Only the tail half is visible here; the
// load/store of the leading 8 bytes are elided from this fragment.
127 ldr A_h, [srcend, -8]
129 str A_h, [dstend, -8]
// 4..7 bytes: same first/last trick with 4-byte accesses (w views).
135 ldr A_hw, [srcend, -4]
137 str A_hw, [dstend, -4]
140 // Copy 0..3 bytes. Use a branchless sequence that copies the same
141 // byte 3 times if count==1, or the 2nd byte twice if count==2.
// tmp1 presumably = count >> 1 (set in an elided line — TODO confirm),
// so [src, tmp1] / [dstin, tmp1] address the middle byte, and
// srcend-1 / dstend-1 address the last byte.
146 ldrb A_hw, [srcend, -1]
147 ldrb B_lw, [src, tmp1]
149 strb B_lw, [dstin, tmp1]
150 strb A_hw, [dstend, -1]
154 // Copy 64..96 bytes. Copy 64 bytes from the start and
155 // 32 bytes from the end.
// All load pairs are issued before any store, so overlapping
// (memmove-style) copies are safe. A_l/A_h come from an elided
// "ldp A_l, A_h, [src]" earlier in the original file.
157 ldp B_l, B_h, [src, 16]
158 ldp C_l, C_h, [src, 32]
159 ldp D_l, D_h, [src, 48]
160 ldp E_l, E_h, [srcend, -32]
161 ldp F_l, F_h, [srcend, -16]
162 stp A_l, A_h, [dstin]
163 stp B_l, B_h, [dstin, 16]
164 stp C_l, C_h, [dstin, 32]
165 stp D_l, D_h, [dstin, 48]
166 stp E_l, E_h, [dstend, -32]
167 stp F_l, F_h, [dstend, -16]
170 // Align DST to 16 byte alignment so that we don't cross cache line
171 // boundaries on both loads and stores. There are at least 96 bytes
172 // to copy, so copy 16 bytes unaligned and then align. The loop
173 // copies 64 bytes per iteration and prefetches one iteration ahead.
// NOTE(review): the alignment computation (presumably tmp1 = dstin & 15
// and an aligned dst, plus the initial "ldp D_l, D_h, [src]" and the
// matching src adjustment), the loop label, and the b.ls/b.hi branches
// are elided from this fragment — confirm against the full file.
181 add count, count, tmp1 // Count is now 16 too large.
// Software-pipelined warm-up: store the first 16 bytes (D) unaligned
// at dstin, and preload the next 64 source bytes into A..D, with src
// advanced by 64 via the pre-indexed writeback form.
182 ldp A_l, A_h, [src, 16]
183 stp D_l, D_h, [dstin]
184 ldp B_l, B_h, [src, 32]
185 ldp C_l, C_h, [src, 48]
186 ldp D_l, D_h, [src, 64]!
187 subs count, count, 128 + 16 // Test and readjust count.
// Main loop body: store the 64 bytes loaded on the previous iteration
// while loading the next 64; dst and src each advance by 64 via the
// writeback forms on the [.., 64]! accesses.
190 stp A_l, A_h, [dst, 16]
191 ldp A_l, A_h, [src, 16]
192 stp B_l, B_h, [dst, 32]
193 ldp B_l, B_h, [src, 32]
194 stp C_l, C_h, [dst, 48]
195 ldp C_l, C_h, [src, 48]
196 stp D_l, D_h, [dst, 64]!
197 ldp D_l, D_h, [src, 64]!
198 subs count, count, 64
201 // Write the last full set of 64 bytes. The remainder is at most 64
202 // bytes, so it is safe to always copy 64 bytes from the end even if
203 // there is just 1 byte left.
// Drain the pipeline (A..D still hold 64 loaded bytes, stored at
// dst+16..dst+64), then copy the final 64 bytes addressed from
// srcend/dstend; these stores may overlap the ones just written,
// which is harmless since the same data is rewritten.
205 ldp E_l, E_h, [srcend, -64]
206 stp A_l, A_h, [dst, 16]
207 ldp A_l, A_h, [srcend, -48]
208 stp B_l, B_h, [dst, 32]
209 ldp B_l, B_h, [srcend, -32]
210 stp C_l, C_h, [dst, 48]
211 ldp C_l, C_h, [srcend, -16]
212 stp D_l, D_h, [dst, 64]
213 stp E_l, E_h, [dstend, -64]
214 stp A_l, A_h, [dstend, -48]
215 stp B_l, B_h, [dstend, -32]
216 stp C_l, C_h, [dstend, -16]
221 // All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
222 // Larger backwards copies are also handled by memcpy. The only remaining
223 // case is forward large copies. The destination is aligned, and an
224 // unrolled loop processes 64 bytes per iteration.
// InternalMemCopyMem — EDK2 CopyMem worker with memmove semantics
// (ASM_GLOBAL/ASM_PFX are the EDK2 symbol export/prefix macros).
// Presumed register roles (aliases are #defined in elided lines):
// dstin = destination, src = source, count = byte count — TODO confirm.
227 ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
228 ASM_PFX(InternalMemCopyMem):
// Overlap dispatch. tmp2 presumably = dstin - src and the flags come
// from an elided "cmp count, 96": if count > 96 (hi), compare tmp2
// against count, otherwise force the flags to 2 (carry set). The
// elided b.hs then tail-calls the memcpy path when dstin - src >=
// count (unsigned — covers dst below src as well) or count <= 96.
231 ccmp tmp2, count, 2, hi
// Remaining case must be copied backwards: compute one-past-the-end
// pointers for both buffers.
235 add dstend, dstin, count
236 add srcend, src, count
238 // Align dstend to 16 byte alignment so that we don't cross cache line
239 // boundaries on both loads and stores. There are at least 96 bytes
240 // to copy, so copy 16 bytes unaligned and then align. The loop
241 // copies 64 bytes per iteration and prefetches one iteration ahead.
// tmp2 presumably = dstend & 15 (computed in an elided "and" — TODO
// confirm). Store the last 16 bytes (D) unaligned, then pull srcend,
// dstend and count back by tmp2 so subsequent stores are 16-aligned.
244 ldp D_l, D_h, [srcend, -16]
245 sub srcend, srcend, tmp2
246 sub count, count, tmp2
// Warm-up for the software-pipelined backward loop: preload the last
// 64 aligned source bytes into A..D, srcend stepping down by 64 via
// the writeback form; the b.ls branch to the tail is elided.
247 ldp A_l, A_h, [srcend, -16]
248 stp D_l, D_h, [dstend, -16]
249 ldp B_l, B_h, [srcend, -32]
250 ldp C_l, C_h, [srcend, -48]
251 ldp D_l, D_h, [srcend, -64]!
252 sub dstend, dstend, tmp2
253 subs count, count, 128
// Backward main loop body: store the 64 bytes loaded on the previous
// iteration while loading the preceding 64; dstend and srcend each
// step down by 64 via the writeback forms on the [.., -64]! accesses.
// NOTE(review): the loop label and the b.hi back-branch are elided
// from this fragment.
257 stp A_l, A_h, [dstend, -16]
258 ldp A_l, A_h, [srcend, -16]
259 stp B_l, B_h, [dstend, -32]
260 ldp B_l, B_h, [srcend, -32]
261 stp C_l, C_h, [dstend, -48]
262 ldp C_l, C_h, [srcend, -48]
263 stp D_l, D_h, [dstend, -64]!
264 ldp D_l, D_h, [srcend, -64]!
265 subs count, count, 64
268 // Write the last full set of 64 bytes. The remainder is at most 64
269 // bytes, so it is safe to always copy 64 bytes from the start even if
270 // there is just 1 byte left.
// Drain the pipeline (A..D hold 64 bytes, stored at dstend-64..-16),
// then copy the first 64 source bytes to the start of the destination;
// overlap with the stores just made only rewrites identical data.
// NOTE(review): the "ldp C_l, C_h, [src]" that feeds the final store
// is elided from this fragment.
272 ldp E_l, E_h, [src, 48]
273 stp A_l, A_h, [dstend, -16]
274 ldp A_l, A_h, [src, 32]
275 stp B_l, B_h, [dstend, -32]
276 ldp B_l, B_h, [src, 16]
277 stp C_l, C_h, [dstend, -48]
279 stp D_l, D_h, [dstend, -64]
280 stp E_l, E_h, [dstin, 48]
281 stp A_l, A_h, [dstin, 32]
282 stp B_l, B_h, [dstin, 16]
283 stp C_l, C_h, [dstin]