//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l
// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as for non-overlapping copies.
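//
// A rough C-level sketch of the dispatch below (illustrative only, not
// part of the original source):
//
//   if (count <= 16)       { /* copy16: two overlapping accesses    */ }
//   else if (count <= 96)  { /* medium: fully unrolled ldp/stp pairs */ }
//   else                   { /* copy_long: 64 bytes per iteration   */ }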

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)      // Bit 6 of count-1 set: count is 65..96.
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f             // Bit 5 of count-1 clear: count is 17..32.
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
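    // For example, with count==3: tmp1 = 1, so the stores below write
    // src[0] to dst[0], src[1] to dst[1] and src[2] to dst[2]; with
    // count==1, tmp1 = 0 and all three stores write the same byte.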
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
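    // For count < 96 the two regions overlap in the middle; the
    // overlapping bytes are simply written twice with identical data.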
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

// Align DST to a 16-byte boundary so that we don't cross cache line
// boundaries on both loads and stores. There are at least 96 bytes
// to copy, so copy 16 bytes unaligned and then align. The loop
// copies 64 bytes per iteration and prefetches one iteration ahead.
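// For example (hypothetical values): with dstin == 0x1003, tmp1 = 3 and
// dst = 0x1000; src is moved back by the same 3 bytes and count grows by
// 3, the first 16 bytes are stored unaligned at dstin, and every store
// from [dst, 16] onward is 16-byte aligned.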

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
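    // Bytes already stored by the loop are rewritten here with the same
    // values, since each destination byte always comes from the same
    // source offset, so the overlapping stores are harmless.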
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves of up to 96 bytes are done by memcpy, as it supports
// overlaps. Larger backward copies (dst below src) are also handled by
// memcpy. The only remaining case is large forward copies that overlap.
// The destination is aligned, and an unrolled loop processes 64 bytes
// per iteration.
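// The dispatch relies on unsigned arithmetic: dstin - src wraps to a huge
// value when dst is below src, so the single unsigned test
// dstin - src >= count accepts both backward copies and non-overlapping
// forward copies, and the ccmp folds the count <= 96 check into the same
// b.hs branch.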
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

    cbz     tmp2, 3f                // dst == src: nothing to copy.
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to a 16-byte boundary so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
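    // Unlike copy_long, this loop runs backward: the end of the
    // destination (dstend) is what gets aligned, and srcend and count are
    // reduced by the same tmp2 bytes after the unaligned store of the
    // final 16 bytes.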

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop                             // Alignment padding for the loop entry.
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret