//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

// Register aliases. AAPCS64: x0..x2 carry the arguments; x3..x7 are
// caller-saved scratch, so nothing needs to be preserved here.
#define dstin   x0          // in: destination buffer
#define count   x1          // in: element count, scaled to a byte count below
#define val     x2          // 64-bit replicated fill value (taken from v0.D[0])
#define valw    w2          // 32-bit view of val (also the raw Value argument)
#define dst     x3          // running destination pointer
#define dstend  x4          // first byte past the end of the buffer
#define tmp1    x5
#define tmp1w   w5
#define tmp2    x6
#define tmp2w   w6
#define zva_len x7          // DC ZVA block size in bytes (decoded from DCZID_EL0)
#define zva_lenw w7

#define L(l) .L ## l

// Each entry point broadcasts its element into all lanes of v0 and
// converts the element count into a byte count, then falls into the
// common fill code at local label "0:".

// InternalMemSetMem16 (Buffer, Length, Value): fill Length 16-bit elements.
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
        dup     v0.8H, valw
        lsl     count, count, #1    // halfwords -> bytes
        b       0f

// InternalMemSetMem32 (Buffer, Length, Value): fill Length 32-bit elements.
ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
        dup     v0.4S, valw
        lsl     count, count, #2    // words -> bytes
        b       0f

// InternalMemSetMem64 (Buffer, Length, Value): fill Length 64-bit elements.
ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
        dup     v0.2D, val
        lsl     count, count, #3    // doublewords -> bytes
        b       0f

// InternalMemZeroMem (Buffer, Length): zero Length bytes.
ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
        movi    v0.16B, #0
        b       0f

// InternalMemSetMem (Buffer, Length, Value): fill Length bytes.
ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
        dup     v0.16B, valw
0:      add     dstend, dstin, count
        mov     val, v0.D[0]        // scalar copy of the replicated pattern

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)

        // Set 0..15 bytes. Sizes are handled with overlapping stores
        // from both ends, selected by the bits of count.
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend, -2]
3:      ret

        // Set 17..96 bytes.
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend, -32]
1:      ret

        .p2align 4
        // Set 64..96 bytes.  Write 64 bytes from the start and
        // 32 bytes from the end.
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
        nop
L(set_long):
        bic     dst, dstin, 15      // 16-byte align the working pointer
        str     q0, [dstin]
        cmp     count, 256
        ccmp    val, 0, 0, cs       // try DC ZVA only if count >= 256 AND val == 0
        b.eq    L(try_zva)
L(no_zva):
        sub     count, dstend, dst  // Count is 16 too large.
        add     dst, dst, 16
        sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:      stp     q0, q0, [dst], 64
        stp     q0, q0, [dst, -32]
L(tail64):
        subs    count, count, 64
        b.hi    1b
2:      stp     q0, q0, [dstend, -64]   // final (possibly overlapping) 64 bytes
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(try_zva):
        mrs     tmp1, dczid_el0
        tbnz    tmp1w, 4, L(no_zva) // DZP bit set: DC ZVA is prohibited
        and     tmp1w, tmp1w, 15    // BS field: log2(words per ZVA block)
        cmp     tmp1w, 4            // ZVA size is 64 bytes.
        b.ne    L(zva_128)

        // Write the first and last 64 byte aligned block using stp rather
        // than using DC ZVA.  This is faster on some cores.
L(zva_64):
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        sub     count, dstend, dst  // Count is now 128 too large.
        sub     count, count, 128+64+64 // Adjust count and bias for loop.
        add     dst, dst, 128
        nop
1:      dc      zva, dst
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
        stp     q0, q0, [dst, 0]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

        .p2align 3
L(zva_128):
        cmp     tmp1w, 5            // ZVA size is 128 bytes.
        b.ne    L(zva_other)

        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]
        stp     q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst  // Count is now 128 too large.
        sub     count, count, 128+128   // Adjust count and bias for loop.
        add     dst, dst, 128
1:      dc      zva, dst
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
        stp     q0, q0, [dstend, -128]
        stp     q0, q0, [dstend, -96]
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret

L(zva_other):
        // Generic ZVA size: zva_len = 4 << BS bytes.
        mov     tmp2w, 4
        lsl     zva_lenw, tmp2w, tmp1w
        add     tmp1, zva_len, 64   // Max alignment bytes written.
        cmp     count, tmp1
        blo     L(no_zva)           // buffer too small to benefit from DC ZVA

        sub     tmp2, zva_len, 1
        add     tmp1, dst, zva_len
        add     dst, dst, 16
        subs    count, tmp1, dst    // Actual alignment bytes to write.
        bic     tmp1, tmp1, tmp2    // Aligned dc zva start address.
        beq     2f
1:      stp     q0, q0, [dst], 64   // fill up to the first aligned ZVA block
        stp     q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
2:      mov     dst, tmp1
        sub     count, dstend, tmp1 // Remaining bytes to write.
        subs    count, count, zva_len
        b.lo    4f
3:      dc      zva, dst
        add     dst, dst, zva_len
        subs    count, count, zva_len
        b.hs    3b
4:      add     count, count, zva_len   // undo final bias, finish with stp loop
        b       L(tail64)