]>
Commit | Line | Data |
---|---|---|
c86cd1e1 AB |
1 | //\r |
2 | // Copyright (c) 2012 - 2016, Linaro Limited\r | |
3 | // All rights reserved.\r | |
4 | //\r | |
5 | // Redistribution and use in source and binary forms, with or without\r | |
6 | // modification, are permitted provided that the following conditions are met:\r | |
7 | // * Redistributions of source code must retain the above copyright\r | |
8 | // notice, this list of conditions and the following disclaimer.\r | |
9 | // * Redistributions in binary form must reproduce the above copyright\r | |
10 | // notice, this list of conditions and the following disclaimer in the\r | |
11 | // documentation and/or other materials provided with the distribution.\r | |
12 | // * Neither the name of the Linaro nor the\r | |
13 | // names of its contributors may be used to endorse or promote products\r | |
14 | // derived from this software without specific prior written permission.\r | |
15 | //\r | |
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r | |
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r | |
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r | |
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r | |
20 | // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r | |
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r | |
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r | |
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r | |
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r | |
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r | |
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r | |
27 | //\r | |
28 | \r | |
29 | //\r | |
30 | // Copyright (c) 2015 ARM Ltd\r | |
31 | // All rights reserved.\r | |
32 | //\r | |
33 | // Redistribution and use in source and binary forms, with or without\r | |
34 | // modification, are permitted provided that the following conditions\r | |
35 | // are met:\r | |
36 | // 1. Redistributions of source code must retain the above copyright\r | |
37 | // notice, this list of conditions and the following disclaimer.\r | |
38 | // 2. Redistributions in binary form must reproduce the above copyright\r | |
39 | // notice, this list of conditions and the following disclaimer in the\r | |
40 | // documentation and/or other materials provided with the distribution.\r | |
41 | // 3. The name of the company may not be used to endorse or promote\r | |
42 | // products derived from this software without specific prior written\r | |
43 | // permission.\r | |
44 | //\r | |
45 | // THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED\r | |
46 | // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\r | |
47 | // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\r | |
48 | // IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r | |
49 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\r | |
50 | // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\r | |
51 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\r | |
52 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\r | |
53 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\r | |
54 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r | |
55 | //\r | |
56 | \r | |
57 | // Assumptions:\r | |
58 | //\r | |
59 | // ARMv8-a, AArch64, unaligned accesses\r | |
60 | //\r | |
61 | //\r | |
62 | \r | |
// Register aliases for the fill routines below.
// x0, x1 and x2/w2 are read as incoming values (destination, length, fill
// value); x3-x7 are only ever written before being read, i.e. scratch.
63 | #define dstin x0\r |
64 | #define count x1\r | |
65 | #define val x2\r | |
66 | #define valw w2\r | |
67 | #define dst x3\r | |
68 | #define dstend x4\r | |
69 | #define tmp1 x5\r | |
70 | #define tmp1w w5\r | |
71 | #define tmp2 x6\r | |
72 | #define tmp2w w6\r | |
73 | #define zva_len x7\r | |
74 | #define zva_lenw w7\r | |
75 | \r | |
// L(name) pastes a ".L" prefix onto the label name, so the named labels in
// this file are assembler-local rather than exported symbols.
76 | #define L(l) .L ## l\r | |
77 | \r | |
//
// InternalMemSetMem16 (Buffer, Length, Value)
//
//   x0 (dstin) = Buffer  - start of the region to fill
//   x1 (count) = Length  - number of 16-bit values to store (NOT bytes)
//   w2 (valw)  = Value   - 16-bit fill value (low halfword of w2)
//
// The shared fill code at local label 0 (inside InternalMemSetMem) treats
// count as a byte length, while the BaseMemoryLib C wrappers pass Length as
// an element count. Scale it here, otherwise only half of the requested
// region is written. x0 is never modified by the fill code, so Buffer is
// still in x0 on return.
//
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw             // v0 = Value replicated in all eight 16-bit lanes
    lsl     count, count, #1        // elements -> bytes (2 bytes per element)
    b       0f                      // join the shared fill code
82 | \r | |
//
// InternalMemSetMem32 (Buffer, Length, Value)
//
//   x0 (dstin) = Buffer  - start of the region to fill
//   x1 (count) = Length  - number of 32-bit values to store (NOT bytes)
//   w2 (valw)  = Value   - 32-bit fill value
//
// The shared fill code at local label 0 (inside InternalMemSetMem) treats
// count as a byte length, while the BaseMemoryLib C wrappers pass Length as
// an element count. Scale it here, otherwise only a quarter of the requested
// region is written. x0 is never modified by the fill code, so Buffer is
// still in x0 on return.
//
ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw             // v0 = Value replicated in all four 32-bit lanes
    lsl     count, count, #2        // elements -> bytes (4 bytes per element)
    b       0f                      // join the shared fill code
87 | \r | |
//
// InternalMemSetMem64 (Buffer, Length, Value)
//
//   x0 (dstin) = Buffer  - start of the region to fill
//   x1 (count) = Length  - number of 64-bit values to store (NOT bytes)
//   x2 (val)   = Value   - 64-bit fill value
//
// The shared fill code at local label 0 (inside InternalMemSetMem) treats
// count as a byte length, while the BaseMemoryLib C wrappers pass Length as
// an element count. Scale it here, otherwise only an eighth of the requested
// region is written. x0 is never modified by the fill code, so Buffer is
// still in x0 on return.
//
ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val              // v0 = Value replicated in both 64-bit lanes
    lsl     count, count, #3        // elements -> bytes (8 bytes per element)
    b       0f                      // join the shared fill code
92 | \r | |
// InternalMemZeroMem: zero fill. Loads an all-zero 16-byte pattern into v0 and
// branches forward to local label 0, the fill code shared by all entry points.
// Only dstin (x0) and count (x1) are consumed as inputs on this path; val is
// overwritten from v0 at label 0 before it is read.
93 | ASM_GLOBAL ASM_PFX(InternalMemZeroMem)\r | |
94 | ASM_PFX(InternalMemZeroMem):\r | |
95 | movi v0.16B, #0\r | |
96 | b 0f\r | |
97 | \r | |
// InternalMemSetMem: byte fill. Replicates the low byte of valw into all 16
// bytes of v0, then falls through into the fill code shared by every entry
// point in this file.
98 | ASM_GLOBAL ASM_PFX(InternalMemSetMem)\r | |
99 | ASM_PFX(InternalMemSetMem):\r | |
100 | dup v0.16B, valw\r | |
// Shared entry (local label 0). On arrival v0 holds the 16-byte fill pattern
// and count is used as a length in bytes.
//   dstend = dstin + count (one past the last byte to write)
//   val    = low 64 bits of the pattern, used by the sub-16-byte stores below
//            and by the zero test in set_long
101 | 0: add dstend, dstin, count\r | |
102 | mov val, v0.D[0]\r | |
103 | \r | |
// Dispatch on length: more than 96 bytes -> set_long, 16..96 -> set_medium,
// otherwise fall through to the 0..15 byte case.
104 | cmp count, 96\r | |
105 | b.hi L(set_long)\r | |
106 | cmp count, 16\r | |
107 | b.hs L(set_medium)\r | |
108 | \r | |
109 | // Set 0..15 bytes.\r | |
// Bit 3 of count set means 8..15 bytes: one 8-byte store anchored at each end
// of the region; the two stores overlap when count < 16.
110 | tbz count, 3, 1f\r | |
111 | str val, [dstin]\r | |
112 | str val, [dstend, -8]\r | |
113 | ret\r | |
// NOTE(review): this nop is unreachable after the ret above; presumably it is
// padding so the following label lands on a preferred boundary - confirm.
114 | nop\r | |
// Bit 2 of count set means 4..7 bytes: one 4-byte store anchored at each end.
115 | 1: tbz count, 2, 2f\r | |
116 | str valw, [dstin]\r | |
117 | str valw, [dstend, -4]\r | |
118 | ret\r | |
// 0..3 bytes: nothing to do for 0. Otherwise store the first byte, and when
// bit 1 is set (2 or 3 bytes) also store a halfword ending at dstend.
119 | 2: cbz count, 3f\r | |
120 | strb valw, [dstin]\r | |
121 | tbz count, 1, 3f\r | |
122 | strh valw, [dstend, -2]\r | |
123 | 3: ret\r | |
124 | \r | |
125 | // Set 16..96 bytes.\r | |
// Reached with 16 <= count <= 96 (count == 16 also lands here because the
// dispatch uses b.hs). Stores are anchored at both ends of the region and may
// overlap in the middle, so no per-length loop is needed.
126 | L(set_medium):\r | |
127 | str q0, [dstin]\r | |
// Bit 6 set within this range means count is 64..96.
128 | tbnz count, 6, L(set96)\r | |
129 | str q0, [dstend, -16]\r | |
// Bit 5 clear means 16..31 bytes: the two 16-byte stores above already cover
// the region. Otherwise (32..63) add a second 16-byte store at each end.
130 | tbz count, 5, 1f\r | |
131 | str q0, [dstin, 16]\r | |
132 | str q0, [dstend, -32]\r | |
133 | 1: ret\r | |
134 | \r | |
135 | .p2align 4\r | |
136 | // Set 64..96 bytes. Write 64 bytes from the start and\r | |
137 | // 32 bytes from the end.\r | |
// The first 16 of those 64 bytes were already stored at L(set_medium).
138 | L(set96):\r | |
139 | str q0, [dstin, 16]\r | |
140 | stp q0, q0, [dstin, 32]\r | |
141 | stp q0, q0, [dstend, -32]\r | |
142 | ret\r | |
143 | \r | |
// Set more than 96 bytes.
// NOTE(review): the nop after .p2align 3 presumably positions L(set_long) /
// the store loop relative to an alignment boundary - confirm before touching.
144 | .p2align 3\r | |
145 | nop\r | |
146 | L(set_long):\r | |
// dst = dstin rounded down to a 16-byte boundary. The first 16 bytes are
// written with one unaligned store so all later stores can use aligned dst.
147 | bic dst, dstin, 15\r | |
148 | str q0, [dstin]\r | |
// Take the DC ZVA path only when count >= 256 AND the pattern is zero:
// if count >= 256 (cs) the ccmp compares val with 0, otherwise it forces the
// flags to 0 so that b.eq is not taken.
149 | cmp count, 256\r | |
150 | ccmp val, 0, 0, cs\r | |
151 | b.eq L(try_zva)\r | |
// Plain store loop: 64 bytes per iteration through 16-byte-aligned dst.
// Also the fallback target when DC ZVA is prohibited or not worthwhile.
152 | L(no_zva):\r | |
153 | sub count, dstend, dst // Count is 16 too large.\r | |
154 | add dst, dst, 16\r | |
155 | sub count, count, 64 + 16 // Adjust count and bias for loop.\r | |
156 | 1: stp q0, q0, [dst], 64\r | |
157 | stp q0, q0, [dst, -32]\r | |
// L(tail64) is also entered from L(zva_other) with dst at the next address to
// write and count holding the number of bytes left up to dstend; its "b.hi 1b"
// resolves to the store loop directly above.
158 | L(tail64):\r | |
159 | subs count, count, 64\r | |
160 | b.hi 1b\r | |
// Final 64 bytes are written relative to dstend; they may overlap bytes the
// loop has already stored.
161 | 2: stp q0, q0, [dstend, -64]\r | |
162 | stp q0, q0, [dstend, -32]\r | |
163 | ret\r | |
164 | \r | |
// Zero fill of at least 256 bytes: try the DC ZVA (zero by VA) instruction.
// Entered from L(set_long) with dst = dstin rounded down to 16 bytes and the
// first 16 bytes at dstin already written.
165 | .p2align 3\r | |
166 | L(try_zva):\r | |
// DCZID_EL0 bit 4 (DZP) set means DC ZVA is prohibited -> use plain stores.
// The low four bits (BS) encode the block size as log2 of the size in 4-byte
// words, so 4 means 64 bytes and 5 means 128 bytes.
167 | mrs tmp1, dczid_el0\r | |
168 | tbnz tmp1w, 4, L(no_zva)\r | |
169 | and tmp1w, tmp1w, 15\r | |
170 | cmp tmp1w, 4 // ZVA size is 64 bytes.\r | |
171 | b.ne L(zva_128)\r | |
172 | \r | |
173 | // Write the first and last 64 byte aligned block using stp rather\r | |
174 | // than using DC ZVA. This is faster on some cores.\r | |
175 | L(zva_64):\r | |
// Ordinary stores cover everything below (dst & ~63) + 128; the two stp
// groups may overlap each other depending on the alignment of dst.
176 | str q0, [dst, 16]\r | |
177 | stp q0, q0, [dst, 32]\r | |
178 | bic dst, dst, 63\r | |
179 | stp q0, q0, [dst, 64]\r | |
180 | stp q0, q0, [dst, 96]\r | |
181 | sub count, dstend, dst // Count is now 128 too large.\r | |
182 | sub count, count, 128+64+64 // Adjust count and bias for loop.\r | |
183 | add dst, dst, 128\r | |
// NOTE(review): nop presumably pads the DC ZVA loop onto an aligned address -
// confirm.
184 | nop\r | |
// Zero one 64-byte block per iteration at 64-byte-aligned dst.
185 | 1: dc zva, dst\r | |
186 | add dst, dst, 64\r | |
187 | subs count, count, 64\r | |
188 | b.hi 1b\r | |
// Finish with plain stores: one 64-byte block at dst and the last 64 bytes
// relative to dstend (these may overlap).
189 | stp q0, q0, [dst, 0]\r | |
190 | stp q0, q0, [dst, 32]\r | |
191 | stp q0, q0, [dstend, -64]\r | |
192 | stp q0, q0, [dstend, -32]\r | |
193 | ret\r | |
194 | \r | |
// Zero fill with a 128-byte DC ZVA block size. Entered from L(try_zva) with
// tmp1w = DCZID_EL0 low four bits (block-size field), dst = dstin rounded down
// to 16 bytes, and the first 16 bytes at dstin already written.
195 | .p2align 3\r | |
196 | L(zva_128):\r | |
197 | cmp tmp1w, 5 // ZVA size is 128 bytes.\r | |
198 | b.ne L(zva_other)\r | |
199 | \r | |
// Plain stores cover [dst + 16, dst + 128), i.e. everything below the first
// address DC ZVA will be applied to ((dst & ~127) + 128).
200 | str q0, [dst, 16]\r | |
201 | stp q0, q0, [dst, 32]\r | |
202 | stp q0, q0, [dst, 64]\r | |
203 | stp q0, q0, [dst, 96]\r | |
204 | bic dst, dst, 127\r | |
205 | sub count, dstend, dst // Count is now 128 too large.\r | |
206 | sub count, count, 128+128 // Adjust count and bias for loop.\r | |
207 | add dst, dst, 128\r | |
// Zero one 128-byte block per iteration at 128-byte-aligned dst.
208 | 1: dc zva, dst\r | |
209 | add dst, dst, 128\r | |
210 | subs count, count, 128\r | |
211 | b.hi 1b\r | |
// Last 128 bytes are written relative to dstend and may overlap bytes that
// DC ZVA has already zeroed.
212 | stp q0, q0, [dstend, -128]\r | |
213 | stp q0, q0, [dstend, -96]\r | |
214 | stp q0, q0, [dstend, -64]\r | |
215 | stp q0, q0, [dstend, -32]\r | |
216 | ret\r | |
217 | \r | |
// Zero fill for any other DC ZVA block size. Entered with tmp1w = DCZID_EL0
// low four bits (block-size field), count still the original byte length,
// dst = dstin rounded down to 16 bytes, and the first 16 bytes at dstin
// already written.
218 | L(zva_other):\r | |
// zva_len = 4 << tmp1w = block size in bytes.
219 | mov tmp2w, 4\r | |
220 | lsl zva_lenw, tmp2w, tmp1w\r | |
// Not worth using DC ZVA unless the region is at least zva_len + 64 bytes.
221 | add tmp1, zva_len, 64 // Max alignment bytes written.\r | |
222 | cmp count, tmp1\r | |
223 | blo L(no_zva)\r | |
224 | \r | |
// tmp2 = zva_len - 1 (alignment mask); tmp1 = first zva_len-aligned address at
// or below dst + zva_len. The beq uses the flags from the subs (bic does not
// set flags) and skips the store loop when there are no alignment bytes.
225 | sub tmp2, zva_len, 1\r | |
226 | add tmp1, dst, zva_len\r | |
227 | add dst, dst, 16\r | |
228 | subs count, tmp1, dst // Actual alignment bytes to write.\r | |
229 | bic tmp1, tmp1, tmp2 // Aligned dc zva start address.\r | |
230 | beq 2f\r | |
// Plain 64-byte stores up to (and possibly past) the aligned start address.
231 | 1: stp q0, q0, [dst], 64\r | |
232 | stp q0, q0, [dst, -32]\r | |
233 | subs count, count, 64\r | |
234 | b.hi 1b\r | |
// Switch to the aligned address and zero whole blocks while at least zva_len
// bytes remain before dstend.
235 | 2: mov dst, tmp1\r | |
236 | sub count, dstend, tmp1 // Remaining bytes to write.\r | |
237 | subs count, count, zva_len\r | |
238 | b.lo 4f\r | |
239 | 3: dc zva, dst\r | |
240 | add dst, dst, zva_len\r | |
241 | subs count, count, zva_len\r | |
242 | b.hs 3b\r | |
// Undo the loop bias so count is the number of bytes left, then let the
// plain-store tail in L(tail64) write the remainder up to dstend.
243 | 4: add count, count, zva_len\r | |
244 | b L(tail64)\r |