MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

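//
// AArch64 implementations of the BaseMemoryLib internal fill helpers:
// InternalMemSetMem, InternalMemSetMem16/32/64 and InternalMemZeroMem.
// Arguments arrive per AAPCS64 as x0 = destination, x1 = length (in
// elements for the sized variants), x2/w2 = fill value.  Each sized entry
// point broadcasts the value into v0 and scales the element count to a
// byte count before reaching the shared path at label 0.
//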
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count
    mov     val, v0.D[0]

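    // Dispatch on the total byte count: more than 96 bytes takes the bulk
    // path at set_long, 16..96 bytes go to set_medium, and anything
    // smaller falls through to the scalar stores below.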
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
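    // Branch on individual bits of count: bit 3 means at least 8 bytes
    // remain, so two (possibly overlapping) 8-byte stores cover the range;
    // bit 2 likewise selects two 4-byte stores; for 1..3 bytes a byte store
    // plus an optional halfword store at the end finish the job.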
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
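    // Use DC ZVA only when the fill value is zero and at least 256 bytes
    // are being set: the ccmp forces a not-equal result when count < 256,
    // so the b.eq falls through to the plain store loop otherwise.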
    cmp     count, 256
    ccmp    val, 0, 0, cs
    b.eq    L(try_zva)
L(no_zva):
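    // Bulk fill without DC ZVA: store 64 bytes per iteration from the
    // 16-byte aligned base, then write the final (possibly overlapping)
    // 64 bytes back from dstend.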
    sub     count, dstend, dst        // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16     // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(try_zva):
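    // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0
    // give log2 of the ZVA block size in 4-byte words, so 4 = 64 bytes and
    // 5 = 128 bytes.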
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+64+64   // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                  // ZVA size is 128 bytes.
    b.ne    L(zva_other)

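    // Same head/body/tail scheme with a 128-byte ZVA block: write the first
    // 128 bytes with stp, align dst down to 128 bytes, zero whole blocks
    // with DC ZVA, and finish the last 128 bytes with four stp stores.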
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+128     // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

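    // Handle any other ZVA block size: compute the block size in bytes as
    // 4 << DCZID_EL0.BS, fall back to the plain store loop when fewer than
    // zva_len + 64 bytes remain, otherwise align dst up to a block boundary
    // with stp stores, zero whole blocks with DC ZVA, and let tail64 write
    // the remainder.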
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64         // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst          // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2          // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1       // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)