//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin    x0
#define count    x1
#define val      x2
#define valw     w2
#define dst      x3
#define dstend   x4
#define tmp1     x5
#define tmp1w    w5
#define tmp2     x6
#define tmp2w    w6
#define zva_len  x7
#define zva_lenw w7

#define L(l) .L ## l

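//
// Each entry point below differs only in how the fill value is broadcast
// into v0 (16-bit, 32-bit or 64-bit lanes, all zeroes, or every byte).
// All of them branch to the shared body at label 0:, which computes
// dstend = dstin + count and then dispatches on the length.
//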
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
        dup     v0.8H, valw
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
        dup     v0.4S, valw
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
        dup     v0.2D, val
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
        movi    v0.16B, #0
        b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
        dup     v0.16B, valw
0:      add     dstend, dstin, count
        mov     val, v0.D[0]

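        // val now holds the low 64 bits of the fill pattern, for the
        // scalar stores on the short path.  Dispatch on the length:
        // more than 96 bytes takes the bulk path, 16..96 bytes the
        // medium path, and anything shorter is handled with a few
        // overlapping stores.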
        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)

        // Set 0..15 bytes.
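        // Test individual bits of count: if bit 3 is set (8..15 bytes),
        // an 8-byte store at the start plus one ending at dstend cover
        // the whole range, with any overlap being harmless.  The same
        // trick handles the 4..7 and 1..3 byte cases below.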
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend, -2]
3:      ret

        // Set 16..96 bytes.
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend, -32]
1:      ret

        .p2align 4
        // Set 64..96 bytes.  Write 64 bytes from the start and
        // 32 bytes from the end.
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
        nop
L(set_long):
        bic     dst, dstin, 15
        str     q0, [dstin]
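        // Use DC ZVA only for zero fills of at least 256 bytes: the
        // ccmp compares val with 0 only when count >= 256 (carry set
        // by the cmp), and otherwise forces the Z flag clear so the
        // b.eq falls through to the plain store loop.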
        cmp     count, 256
        ccmp    val, 0, 0, cs
        b.eq    L(try_zva)
L(no_zva):
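        // Fill 64 bytes per iteration with two stp of q0.  The count
        // bias below makes the final 64 bytes always be written
        // relative to dstend, so a tail shorter than 64 bytes is
        // covered by overlapping stores rather than a byte loop.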
        sub     count, dstend, dst      // Count is 16 too large.
        add     dst, dst, 16
        sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:      stp     q0, q0, [dst], 64
        stp     q0, q0, [dst, -32]
L(tail64):
        subs    count, count, 64
        b.hi    1b
2:      stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(try_zva):
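        // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits
        // [3:0] hold log2 of the block size in 4-byte words, so 4
        // means 64-byte blocks and 5 means 128-byte blocks.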
        mrs     tmp1, dczid_el0
        tbnz    tmp1w, 4, L(no_zva)
        and     tmp1w, tmp1w, 15
        cmp     tmp1w, 4                // ZVA size is 64 bytes.
        b.ne    L(zva_128)

        // Write the first and last 64-byte aligned blocks using stp
        // rather than using DC ZVA.  This is faster on some cores.
L(zva_64):
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        sub     count, dstend, dst      // Count is now 128 too large.
        sub     count, count, 128+64+64 // Adjust count and bias for loop.
        add     dst, dst, 128
        nop
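        // Zero whole 64-byte blocks with DC ZVA.  The bias applied to
        // count above makes the loop exit with 64..128 bytes left,
        // which the four stp stores after the loop cover; the last
        // pair is addressed from dstend so a partial block is written
        // twice rather than skipped.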
1:      dc      zva, dst
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
        stp     q0, q0, [dst, 0]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(zva_128):
        cmp     tmp1w, 5                // ZVA size is 128 bytes.
        b.ne    L(zva_other)
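
        // Fill up to the first 128-byte boundary with stp stores, zero
        // whole 128-byte blocks with DC ZVA, and always write the last
        // 128 bytes with stp stores addressed from dstend.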
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst      // Count is now 128 too large.
        sub     count, count, 128+128   // Adjust count and bias for loop.
        add     dst, dst, 128
1:      dc      zva, dst
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
        stp     q0, q0, [dstend, -128]
        stp     q0, q0, [dstend, -96]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

L(zva_other):
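        // Unknown block size: compute it in bytes from DCZID_EL0
        // (4 bytes shifted left by the BS field) and fall back to the
        // plain store loop when the buffer is too small to be worth
        // aligning.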
        mov     tmp2w, 4
        lsl     zva_lenw, tmp2w, tmp1w
        add     tmp1, zva_len, 64       // Max alignment bytes written.
        cmp     count, tmp1
        b.lo    L(no_zva)

        sub     tmp2, zva_len, 1
        add     tmp1, dst, zva_len
        add     dst, dst, 16
        subs    count, tmp1, dst        // Actual alignment bytes to write.
        bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
        b.eq    2f
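        // Store 64 bytes per iteration until dst reaches tmp1, the
        // first zva_len-aligned address; overrunning the boundary a
        // little is harmless because the DC ZVA loop restarts at the
        // aligned address with the same (zero) value.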
1:      stp     q0, q0, [dst], 64
        stp     q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
2:      mov     dst, tmp1
        sub     count, dstend, tmp1     // Remaining bytes to write.
        subs    count, count, zva_len
        b.lo    4f
3:      dc      zva, dst
        add     dst, dst, zva_len
        subs    count, count, zva_len
        b.hs    3b
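        // Fewer than zva_len bytes remain: restore count and let the
        // common 64-byte store tail finish the buffer, with the final
        // stores addressed from dstend.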
4:      add     count, count, zva_len
        b       L(tail64)