// MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S

//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3
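
// Note: F_l/F_h and tmp1/tmp2 alias registers (srcend, dst and B_h) that
// are dead at every point where those names are used, so the routine runs
// entirely in x0-x15 without touching the stack.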

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
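//
// In C terms, the size dispatch performed by __memcpy below is roughly:
//
//     if (count <= 16)       goto L(copy16);       // small
//     else if (count <= 96)  { /* fully unrolled */ }
//     else                   goto L(copy_long);    // 64 bytes/iteration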

__memcpy:
    prfm PLDL1KEEP, [src]
    add srcend, src, count
    add dstend, dstin, count
    cmp count, 16
    b.ls L(copy16)
    cmp count, 96
    b.hi L(copy_long)

    // Medium copies: 17..96 bytes.
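    // Bits of count-1 select how much of the middle must be copied:
    //   bit 6 set -> count is 65..96, handled by L(copy96);
    //   bit 5 set -> count is 33..64, copy 32 bytes from each end;
    //   otherwise -> count is 17..32, the first and last 16 bytes cover it.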
    sub tmp1, count, 1
    ldp A_l, A_h, [src]
    tbnz tmp1, 6, L(copy96)
    ldp D_l, D_h, [srcend, -16]
    tbz tmp1, 5, 1f
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [srcend, -32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstend, -32]
1:
    stp A_l, A_h, [dstin]
    stp D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp count, 8
    b.lo 1f
    ldr A_l, [src]
    ldr A_h, [srcend, -8]
    str A_l, [dstin]
    str A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz count, 2, 1f
    ldr A_lw, [src]
    ldr A_hw, [srcend, -4]
    str A_lw, [dstin]
    str A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
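    // In effect: dst[0] = src[0], dst[count/2] = src[count/2] and
    // dst[count-1] = src[count-1]; for count of 1 or 2 some of these
    // indices coincide, so no further branching is required.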
1:
    cbz count, 2f
    lsr tmp1, count, 1
    ldrb A_lw, [src]
    ldrb A_hw, [srcend, -1]
    ldrb B_lw, [src, tmp1]
    strb A_lw, [dstin]
    strb B_lw, [dstin, tmp1]
    strb A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
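    // For counts below 96 the two regions overlap, but all loads (including
    // A, loaded before the branch here) are issued before any store, so
    // overlapping src/dst buffers are still handled correctly.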
L(copy96):
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [src, 32]
    ldp D_l, D_h, [src, 48]
    ldp E_l, E_h, [srcend, -32]
    ldp F_l, F_h, [srcend, -16]
    stp A_l, A_h, [dstin]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin, 32]
    stp D_l, D_h, [dstin, 48]
    stp E_l, E_h, [dstend, -32]
    stp F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
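    // dst is rounded down to a 16-byte boundary and src/count are biased
    // by the same amount (dstin & 15); the first 16 bytes are stored
    // unaligned at dstin, after which every store in the loop is aligned.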

    .p2align 4
L(copy_long):
    and tmp1, dstin, 15
    bic dst, dstin, 15
    ldp D_l, D_h, [src]
    sub src, src, tmp1
    add count, count, tmp1   // Count is now 16 too large.
    ldp A_l, A_h, [src, 16]
    stp D_l, D_h, [dstin]
    ldp B_l, B_h, [src, 32]
    ldp C_l, C_h, [src, 48]
    ldp D_l, D_h, [src, 64]!
    subs count, count, 128 + 16   // Test and readjust count.
    b.ls 2f
1:
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [src, 16]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [src, 32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [src, 48]
    stp D_l, D_h, [dst, 64]!
    ldp D_l, D_h, [src, 64]!
    subs count, count, 64
    b.hi 1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
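    // On entry here A_l..D_h still hold 64 bytes that have been loaded but
    // not yet stored; flush them to [dst, 16]..[dst, 64], then copy the
    // final 64 bytes of the buffer relative to dstend.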
2:
    ldp E_l, E_h, [srcend, -64]
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [srcend, -48]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [srcend, -16]
    stp D_l, D_h, [dst, 64]
    stp E_l, E_h, [dstend, -64]
    stp A_l, A_h, [dstend, -48]
    stp B_l, B_h, [dstend, -32]
    stp C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy, as it supports overlaps.
// Larger backwards copies (dst below src) are also handled by memcpy. The
// only remaining case is large forward-overlapping copies (dst above src),
// which are copied backwards, starting from the end of the buffer. The
// destination end is aligned, and an unrolled loop processes 64 bytes per
// iteration.
//
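//
// The cmp/ccmp/b.hs sequence below tail-calls __memcpy whenever
//
//     count <= 96  ||  (UINT64)(dstin - src) >= count
//
// i.e. whenever a plain forward copy cannot overwrite source bytes before
// they have been read. Equal src and dst return immediately via the cbz;
// the remaining case, 0 < dstin - src < count, falls through to the
// backward copy.
//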

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub tmp2, dstin, src
    cmp count, 96
    ccmp tmp2, count, 2, hi
    b.hs __memcpy

    cbz tmp2, 3f
    add dstend, dstin, count
    add srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
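    // This mirrors L(copy_long): the last 16 bytes are stored unaligned at
    // dstend - 16, then srcend/dstend are rounded down by (dstend & 15) so
    // that the loop below walks both pointers downwards with aligned stores.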

    and tmp2, dstend, 15
    ldp D_l, D_h, [srcend, -16]
    sub srcend, srcend, tmp2
    sub count, count, tmp2
    ldp A_l, A_h, [srcend, -16]
    stp D_l, D_h, [dstend, -16]
    ldp B_l, B_h, [srcend, -32]
    ldp C_l, C_h, [srcend, -48]
    ldp D_l, D_h, [srcend, -64]!
    sub dstend, dstend, tmp2
    subs count, count, 128
    b.ls 2f
    nop
1:
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [srcend, -16]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [srcend, -48]
    stp D_l, D_h, [dstend, -64]!
    ldp D_l, D_h, [srcend, -64]!
    subs count, count, 64
    b.hi 1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
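    // As in the forward path, A_l..D_h hold 64 loaded but unstored bytes;
    // flush them just below the current dstend, then copy the first 64
    // bytes of the buffer to dstin.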
2:
    ldp E_l, E_h, [src, 48]
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [src, 32]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [src, 16]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [src]
    stp D_l, D_h, [dstend, -64]
    stp E_l, E_h, [dstin, 48]
    stp A_l, A_h, [dstin, 32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin]
3: ret