2 // Copyright (c) 2012 - 2016, Linaro Limited
3 // All rights reserved.
4 // Copyright (c) 2015 ARM Ltd
5 // All rights reserved.
6 // SPDX-License-Identifier: BSD-2-Clause-Patent
11 // ARMv8-a, AArch64, unaligned accesses
// Entry point InternalMemSetMem16. The symbol is declared through the
// ASM_GLOBAL / ASM_PFX macros, whose definitions are not in the visible lines.
// NOTE(review): no instructions appear between this label and the next entry
// point in this listing - presumably the body prepares a 16-bit fill pattern
// and joins the shared code at local label 0; confirm against the full source.
30 ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
31 ASM_PFX(InternalMemSetMem16):
// Entry point InternalMemSetMem32. The symbol is declared through the
// ASM_GLOBAL / ASM_PFX macros, whose definitions are not in the visible lines.
// NOTE(review): no instructions appear between this label and the next entry
// point in this listing - presumably the body prepares a 32-bit fill pattern
// and joins the shared code at local label 0; confirm against the full source.
36 ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
37 ASM_PFX(InternalMemSetMem32):
// Entry point InternalMemSetMem64. The symbol is declared through the
// ASM_GLOBAL / ASM_PFX macros, whose definitions are not in the visible lines.
// NOTE(review): no instructions appear between this label and the next entry
// point in this listing - presumably the body prepares a 64-bit fill pattern
// and joins the shared code at local label 0; confirm against the full source.
42 ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
43 ASM_PFX(InternalMemSetMem64):
// Entry point InternalMemZeroMem. The symbol is declared through the
// ASM_GLOBAL / ASM_PFX macros, whose definitions are not in the visible lines.
// NOTE(review): no instructions appear between this label and the next entry
// point in this listing - presumably the body selects a zero fill value and
// joins the shared code at local label 0; confirm against the full source.
48 ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
49 ASM_PFX(InternalMemZeroMem):
// Entry point InternalMemSetMem, followed by the fill code reached through
// local label 0.
//
// Register aliases used below (dstin, dst, dstend, count, valw, tmp1, tmp1w,
// tmp2, tmp2w, zva_len, zva_lenw) and the L() label macro are not defined in
// the visible lines.
//
// NOTE(review): this listing appears to omit instructions and labels. For
// example, L(set96) and L(no_zva) are branch targets below but are never
// defined here, and several "subs" have no visible consumer of the flags.
// The comments describe only what each visible line does; confirm the
// overall flow against the full source.
53 ASM_GLOBAL ASM_PFX(InternalMemSetMem)
54 ASM_PFX(InternalMemSetMem):
// dstend = dstin + count. Presumably count is a byte count, making dstend the
// address one past the last byte to fill - confirm.
56 0: add dstend, dstin, count
// Store valw so that the write ends exactly at dstend (offset -4).
// Presumably valw is a 32-bit W register alias, i.e. a 4-byte store - confirm.
72 str valw, [dstend, -4]
// Store the low halfword of valw into the 2 bytes ending at dstend.
77 strh valw, [dstend, -2]
// Branch to L(set96) when bit 6 of count is set (count & 64 != 0).
83 tbnz count, 6, L(set96)
91 // Set 64..96 bytes. Write 64 bytes from the start and
92 // 32 bytes from the end.
// Two q0 registers (32 bytes) at dstin+32 .. dstin+63.
95 stp q0, q0, [dstin, 32]
// Two q0 registers (32 bytes) at dstend-32 .. dstend-1.
96 stp q0, q0, [dstend, -32]
108 sub count, dstend, dst // Count is 16 too large.
110 sub count, count, 64 + 16 // Adjust count and bias for loop.
// Local label 1: 64-byte store step. The post-indexed stp writes 32 bytes at
// dst and then advances dst by 64; the second stp writes the following
// 32 bytes at (new dst) - 32.
111 1: stp q0, q0, [dst], 64
112 stp q0, q0, [dst, -32]
// Subtract the 64 bytes just written and set the condition flags.
114 subs count, count, 64
// Local label 2: write the 64 bytes ending at dstend. The addresses are
// derived from dstend, not dst, so they do not depend on the value of dst.
116 2: stp q0, q0, [dstend, -64]
117 stp q0, q0, [dstend, -32]
// Branch to L(no_zva) when bit 4 of tmp1w is set.
// NOTE(review): presumably tmp1 holds DCZID_EL0 at this point (bit 4 = DZP,
// "DC ZVA prohibited"); the instruction that loads it is not visible - confirm.
123 tbnz tmp1w, 4, L(no_zva)
125 cmp tmp1w, 4 // ZVA size is 64 bytes.
128 // Write the first and last 64 byte aligned block using stp rather
129 // than using DC ZVA. This is faster on some cores.
// 32 bytes at dst+32.
132 stp q0, q0, [dst, 32]
// 64 bytes at dst+64 .. dst+127.
134 stp q0, q0, [dst, 64]
135 stp q0, q0, [dst, 96]
136 sub count, dstend, dst // Count is now 128 too large.
137 sub count, count, 128+64+64 // Adjust count and bias for loop.
// Subtract 64 from the remaining count and set the condition flags.
142 subs count, count, 64
// 32 bytes at dst+32.
145 stp q0, q0, [dst, 32]
// Final 64 bytes ending at dstend.
146 stp q0, q0, [dstend, -64]
147 stp q0, q0, [dstend, -32]
152 cmp tmp1w, 5 // ZVA size is 128 bytes.
// 96 bytes at dst+32 .. dst+127.
156 stp q0, q0, [dst, 32]
157 stp q0, q0, [dst, 64]
158 stp q0, q0, [dst, 96]
160 sub count, dstend, dst // Count is now 128 too large.
161 sub count, count, 128+128 // Adjust count and bias for loop.
// Subtract 128 from the remaining count and set the condition flags.
165 subs count, count, 128
// Final 128 bytes ending at dstend, as four 32-byte stores.
167 stp q0, q0, [dstend, -128]
168 stp q0, q0, [dstend, -96]
169 stp q0, q0, [dstend, -64]
170 stp q0, q0, [dstend, -32]
// zva_lenw = tmp2w << tmp1w.
// NOTE(review): the value of tmp2w is set outside the visible lines;
// presumably this converts the DCZID_EL0 block-size field into a byte
// length - confirm.
175 lsl zva_lenw, tmp2w, tmp1w
176 add tmp1, zva_len, 64 // Max alignment bytes written.
// tmp1 = dst + zva_len.
181 add tmp1, dst, zva_len
183 subs count, tmp1, dst // Actual alignment bytes to write.
// Clear in tmp1 every bit that is set in tmp2.
// NOTE(review): presumably tmp2 = zva_len - 1, so this rounds tmp1 down to a
// zva_len boundary; the setup of tmp2 is not visible - confirm.
184 bic tmp1, tmp1, tmp2 // Aligned dc zva start address.
// Local label 1: 64-byte store step. The post-indexed stp writes 32 bytes at
// dst and then advances dst by 64; the second stp writes the following
// 32 bytes at (new dst) - 32.
186 1: stp q0, q0, [dst], 64
187 stp q0, q0, [dst, -32]
// Subtract the 64 bytes just written and set the condition flags.
188 subs count, count, 64
191 sub count, dstend, tmp1 // Remaining bytes to write.
// Subtract one zva_len from count and set the condition flags.
192 subs count, count, zva_len
// Advance dst by one zva_len block.
195 add dst, dst, zva_len
// Subtract that block from count and set the condition flags.
196 subs count, count, zva_len
// Local label 4: add zva_len back onto count, offsetting one of the
// subtractions above.
198 4: add count, count, zva_len