//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
//
// Register aliases (AAPCS64: x0-x2 carry the incoming arguments;
// x3-x15 are caller-saved scratch, so no stack frame is needed).
//
#define dstin   x0          // destination argument; never modified, so it
                            // doubles as the return value
#define src     x1          // source argument; advanced during large copies
#define count   x2          // byte count argument
#define dst     x3          // aligned destination cursor (large copies)
#define srcend  x4          // src + count
#define dstend  x5          // dstin + count
#define A_l     x6          // A..F: data pairs held across load/store bursts
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend      // F_* reuse srcend/dst, which are dead by the
#define F_h     dst         // time F is loaded (copy96 path only)
#define tmp1    x9          // aliases B_h; the two are never live together
#define tmp2    x3          // aliases dst; the two are never live together

// Expand L(name) to the non-exported GNU-as local label .Lname.
#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
//
// In:      dstin (x0) = destination, src (x1) = source, count (x2) = bytes
// Out:     x0 unchanged (destination pointer)
// Clobbers: x3-x15, flags. Leaf function; no stack use.

__memcpy:
    prfm    PLDL1KEEP, [src]            // hint: the source is about to be read
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes. All loads issue before any store,
    // so arbitrary overlap is safe.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)          // bit 6 of count-1 set => 65..96 bytes
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f                 // bit 5 clear => count <= 32, skip middle
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes. Head/tail accesses may overlap when
    // count is below the access size; that is harmless.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]                  // 8..16 bytes: two possibly-overlapping
    ldr     A_h, [srcend, -8]           // 8-byte moves cover the range
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]                 // 4..7 bytes: two overlapping 4-byte moves
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1              // tmp1 = middle-byte offset (0, 1, or 1)
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]         // A was already loaded by the caller path
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]     // F_* overwrite srcend/dst, both dead now
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration with loads issued one store-group
    // ahead of the matching stores.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15             // tmp1 = misalignment of destination
    bic     dst, dstin, 15              // dst  = destination rounded down to 16
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1              // bias src by the same misalignment
    add     count, count, tmp1          // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]           // unaligned 16-byte head
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16      // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// InternalMemCopyMem — the memmove entry point.
//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies (dst > src, regions overlapping). The
// destination end is aligned, and an unrolled loop processes 64 bytes per
// iteration, walking backwards so later source bytes are read before
// earlier stores can clobber them.
//
// In:      dstin (x0) = destination, src (x1) = source, count (x2) = bytes
// Out:     x0 unchanged (destination pointer)
// Clobbers: x3-x15, flags. Leaf function; no stack use.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src            // tmp2 = dst - src (mod 2^64)
    cmp     count, 96
    ccmp    tmp2, count, 2, hi          // if count > 96: compare dst-src vs count;
                                        // else force C set so the branch is taken
    b.hs    __memcpy                    // small copy, or dst-src >= count
                                        // (no forward overlap): memcpy handles it

    cbz     tmp2, 3f                    // dst == src: nothing to copy
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration, working backwards from the end.

    and     tmp2, dstend, 15            // tmp2 = misalignment of destination end
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]     // unaligned 16-byte tail
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop                                 // pad so the loop head lands well-aligned
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret