]>
Commit | Line | Data |
---|---|---|
c150290d | 1 | /* |
e1858b2a | 2 | * Copyright (c) 2011, The Linux Foundation. All rights reserved. |
c150290d RK | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License version 2 and | |
6 | * only version 2 as published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, | |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 | * GNU General Public License for more details. | |
12 | * | |
13 | * You should have received a copy of the GNU General Public License | |
14 | * along with this program; if not, write to the Free Software | |
15 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
16 | * 02110-1301, USA. | |
17 | */ | |
18 | ||
19 | ||
20 | /* HEXAGON assembly optimized memset */ | |
21 | /* Replaces the standard library function memset */ | |
22 | ||
23 | ||
24 | .macro HEXAGON_OPT_FUNC_BEGIN name /* open the definition of a global function called "name" */ | |
25 | .global \name /* export the symbol */ | |
26 | .type \name, @function /* mark the symbol as a function */ | |
27 | .text /* emit into the code section */ | |
28 | .balign 16 /* start the entry point on a 16-byte boundary */ | |
29 | \name: /* the entry label itself */ | |
30 | .endm | |
31 | ||
32 | .macro HEXAGON_OPT_FUNC_FINISH name /* close a function opened with HEXAGON_OPT_FUNC_BEGIN */ | |
33 | .size \name, .-\name /* symbol size = current location minus the entry label */ | |
34 | .endm | |
35 | ||
36 | /* FUNCTION: memset (v2 version). In: r0 = destination, r1 = fill value (only the low byte is used), r2 = byte count. Out: r0 unchanged. Writes r2-r10, p0, p1 and the loop0 hardware-loop state. Predicates are computed one packet ahead of the branch that tests them. */ | |
37 | #if __HEXAGON_ARCH__ < 3 | |
38 | HEXAGON_OPT_FUNC_BEGIN memset | |
39 | { | |
40 | r6 = #8 /* doubleword size, used to compute the alignment distance */ | |
41 | r7 = extractu(r0, #3 , #0) /* r7 = low three bits of the destination address */ | |
42 | p0 = cmp.eq(r2, #0) /* p0 = (count == 0) */ | |
43 | p1 = cmp.gtu(r2, #7) /* p1 = (count >= 8): enough bytes to justify aligning */ | |
44 | } | |
45 | { | |
46 | r4 = vsplatb(r1) /* r4 = fill byte replicated into all four bytes */ | |
47 | r8 = r0 /* r8 is the running store pointer; leave r0 intact for return val */ | |
48 | r9 = sub(r6, r7) /* bytes until double alignment (8, i.e. bits 0-2 clear, when already aligned) */ | |
49 | if p0 jumpr r31 /* count == 0, so return */ | |
50 | } | |
51 | { | |
52 | r3 = #0 /* high word of the 64-bit remaining count r3:2 */ | |
53 | r7 = #0 /* high word of the 64-bit decrement r7:6 */ | |
54 | p0 = tstbit(r9, #0) /* leading single byte needed? tested in the packet at 2: */ | |
55 | if p1 jump 2f /* skip byte loop when count >= 8 */ | |
56 | } | |
57 | ||
58 | /* less than 8 bytes to set, so just set a byte at a time and return */ | |
59 | ||
60 | loop0(1f, r2) /* byte loop: r2 iterations (1..7 on this path) */ | |
61 | .falign | |
62 | 1: /* byte loop */ | |
63 | { | |
64 | memb(r8++#1) = r4 /* store one fill byte and advance the pointer */ | |
65 | }:endloop0 | |
66 | jumpr r31 /* short fill done */ | |
67 | .falign | |
68 | 2: /* skip byte loop: count >= 8, bring r8 up to doubleword alignment first */ | |
69 | { | |
70 | r6 = #1 /* r7:6 = 1, the decrement for the byte store below */ | |
71 | p0 = tstbit(r9, #1) /* leading halfword needed? tested in the packet at 3: */ | |
72 | p1 = cmp.eq(r2, #1) /* would the byte store use up the whole count? NOTE(review): count >= 8 here, so this looks always false - confirm */ | |
73 | if !p0 jump 3f /* skip initial byte store (p0 here is bit 0 of r9, set in the previous packet) */ | |
74 | } | |
75 | { | |
76 | memb(r8++#1) = r4 /* leading byte */ | |
77 | r3:2 = sub(r3:2, r7:6) /* count -= 1 */ | |
78 | if p1 jumpr r31 /* return if that byte was the last one */ | |
79 | } | |
80 | .falign | |
81 | 3: /* skip initial byte store */ | |
82 | { | |
83 | r6 = #2 /* r7:6 = 2, the decrement for the halfword store below */ | |
84 | p0 = tstbit(r9, #2) /* leading word needed? tested in the packet at 4: */ | |
85 | p1 = cmp.eq(r2, #2) /* would the halfword store use up the whole count? */ | |
86 | if !p0 jump 4f /* skip initial half store (p0 here is bit 1 of r9) */ | |
87 | } | |
88 | { | |
89 | memh(r8++#2) = r4 /* leading halfword */ | |
90 | r3:2 = sub(r3:2, r7:6) /* count -= 2 */ | |
91 | if p1 jumpr r31 /* return if nothing is left */ | |
92 | } | |
93 | .falign | |
94 | 4: /* skip initial half store */ | |
95 | { | |
96 | r6 = #4 /* r7:6 = 4, the decrement for the word store below */ | |
97 | p0 = cmp.gtu(r2, #7) /* at least one doubleword left if the word store is skipped? tested at 5: */ | |
98 | p1 = cmp.eq(r2, #4) /* would the word store use up the whole count? */ | |
99 | if !p0 jump 5f /* skip initial word store (p0 here is bit 2 of r9) */ | |
100 | } | |
101 | { | |
102 | memw(r8++#4) = r4 /* leading word; r8 is doubleword aligned afterwards */ | |
103 | r3:2 = sub(r3:2, r7:6) /* count -= 4 */ | |
104 | p0 = cmp.gtu(r2, #11) /* same doubleword test; r2 is read before the decrement in this packet, hence 7 + 4 */ | |
105 | if p1 jumpr r31 /* return if nothing is left */ | |
106 | } | |
107 | .falign | |
108 | 5: /* skip initial word store */ | |
109 | { | |
110 | r10 = lsr(r2, #3) /* r10 = number of whole doublewords remaining */ | |
111 | p1 = cmp.eq(r3, #1) /* NOTE(review): r3 is the high count word (zeroed above), so this appears to just clear p1 so the return at 7: is not taken when the loop is skipped - confirm */ | |
112 | if !p0 jump 7f /* skip double loop when fewer than 8 bytes remain */ | |
113 | } | |
114 | { | |
115 | r5 = r4 /* r5:4 = eight copies of the fill byte */ | |
116 | r6 = #8 /* r7:6 = 8, the per-iteration decrement */ | |
117 | loop0(6f, r10) /* double loop, r10 iterations */ | |
118 | } | |
119 | ||
120 | /* set bytes a double word at a time */ | |
121 | ||
122 | .falign | |
123 | 6: /* double loop */ | |
124 | { | |
125 | memd(r8++#8) = r5:4 /* store eight fill bytes */ | |
126 | r3:2 = sub(r3:2, r7:6) /* count -= 8 */ | |
127 | p1 = cmp.eq(r2, #8) /* r2 read before the decrement: true on the iteration that brings the count to 0 */ | |
128 | }:endloop0 | |
129 | .falign | |
130 | 7: /* skip double loop: 0..7 bytes remain */ | |
131 | { | |
132 | p0 = tstbit(r2, #2) /* trailing word needed? tested in the next packet */ | |
133 | if p1 jumpr r31 /* return if the double loop consumed everything */ | |
134 | } | |
135 | { | |
136 | r6 = #4 /* r7:6 = 4, the decrement for the word store below */ | |
137 | p0 = tstbit(r2, #1) /* trailing halfword needed? tested in the packet at 8: */ | |
138 | p1 = cmp.eq(r2, #4) /* is the word the last thing to store? */ | |
139 | if !p0 jump 8f /* skip final word store (p0 here is bit 2 of r2) */ | |
140 | } | |
141 | { | |
142 | memw(r8++#4) = r4 /* trailing word */ | |
143 | r3:2 = sub(r3:2, r7:6) /* count -= 4 */ | |
144 | if p1 jumpr r31 /* return if nothing is left */ | |
145 | } | |
146 | .falign | |
147 | 8: /* skip final word store: 1..3 bytes remain */ | |
148 | { | |
149 | p1 = cmp.eq(r2, #2) /* is the halfword the last thing to store? */ | |
150 | if !p0 jump 9f /* skip final half store (p0 here is bit 1 of r2) */ | |
151 | } | |
152 | { | |
153 | memh(r8++#2) = r4 /* trailing halfword */ | |
154 | if p1 jumpr r31 /* return unless one odd byte is still pending */ | |
155 | } | |
156 | .falign | |
157 | 9: /* skip final half store */ | |
158 | { | |
159 | memb(r8++#1) = r4 /* final odd byte */ | |
160 | jumpr r31 /* done */ | |
161 | } | |
162 | HEXAGON_OPT_FUNC_FINISH memset | |
163 | #endif | |
164 | ||
165 | ||
166 | /* FUNCTION: memset (v3 and higher version). In: r0 = destination, r1 = fill value (only the low byte is stored), r2 = byte count. Out: r0 unchanged. Writes r2-r8, p0, p1 and the loop0 hardware-loop state. */ | |
167 | #if __HEXAGON_ARCH__ >= 3 | |
168 | HEXAGON_OPT_FUNC_BEGIN memset | |
169 | { | |
170 | r7=vsplatb(r1) /* r7 = fill byte replicated into all four bytes */ | |
171 | r6 = r0 /* r6 is the running store pointer; r0 is never written */ | |
172 | if (r2==#0) jump:nt .L1 /* count == 0: go straight to the return */ | |
173 | } | |
174 | { | |
175 | r5:4=combine(r7,r7) /* r5:4 = eight copies of the fill byte for memd stores */ | |
176 | p0 = cmp.gtu(r2,#8) /* more than 8 bytes? */ | |
177 | if (p0.new) jump:nt .L3 /* yes: take the aligning path */ | |
178 | } | |
179 | { | |
180 | r3 = r0 /* separate pointer for the short byte loop */ | |
181 | loop0(.L47,r2) /* 1..8 iterations on this path */ | |
182 | } | |
183 | .falign | |
184 | .L47: | |
185 | { | |
186 | memb(r3++#1) = r1 /* store the low byte of r1 and advance */ | |
187 | }:endloop0 /* start=.L47 */ | |
188 | jumpr r31 /* short fill done */ | |
189 | .L3: | |
190 | { | |
191 | p0 = tstbit(r0,#0) /* odd destination address? */ | |
192 | if (!p0.new) jump:nt .L8 /* even: no leading byte needed */ | |
193 | p1 = cmp.eq(r2, #1) /* would the leading byte use up the whole count? */ | |
194 | } | |
195 | { | |
196 | r6 = add(r0, #1) /* pointer moves past the leading byte */ | |
197 | r2 = add(r2,#-1) /* count -= 1 */ | |
198 | memb(r0) = r1 /* leading byte */ | |
199 | if (p1) jump .L1 /* NOTE(review): count > 8 on this path, so this exit looks unreachable - confirm */ | |
200 | } | |
201 | .L8: | |
202 | { | |
203 | p0 = tstbit(r6,#1) /* pointer not yet word aligned? */ | |
204 | if (!p0.new) jump:nt .L10 /* aligned: no leading halfword needed */ | |
205 | } | |
206 | { | |
207 | r2 = add(r2,#-2) /* count -= 2 */ | |
208 | memh(r6++#2) = r7 /* leading halfword */ | |
209 | p0 = cmp.eq(r2, #2) /* r2 is read before the decrement in this packet: was the halfword everything? */ | |
210 | if (p0.new) jump:nt .L1 /* return if nothing is left */ | |
211 | } | |
212 | .L10: | |
213 | { | |
214 | p0 = tstbit(r6,#2) /* pointer not yet doubleword aligned? */ | |
215 | if (!p0.new) jump:nt .L12 /* aligned: no leading word needed */ | |
216 | } | |
217 | { | |
218 | r2 = add(r2,#-4) /* count -= 4 */ | |
219 | memw(r6++#4) = r7 /* leading word; r6 is doubleword aligned afterwards */ | |
220 | p0 = cmp.eq(r2, #4) /* r2 is read before the decrement in this packet: was the word everything? */ | |
221 | if (p0.new) jump:nt .L1 /* return if nothing is left */ | |
222 | } | |
223 | .L12: | |
224 | { | |
225 | p0 = cmp.gtu(r2,#127) /* large fill (128 bytes or more)? */ | |
226 | if (!p0.new) jump:nt .L14 /* no: go to the plain doubleword loop */ | |
227 | } | |
228 | r3 = and(r6,#31) /* offset of r6 within a 32-byte block */ | |
229 | if (r3==#0) jump:nt .L17 /* already 32-byte aligned */ | |
230 | { | |
231 | memd(r6++#8) = r5:4 /* first 8-byte step toward 32-byte alignment */ | |
232 | r2 = add(r2,#-8) /* count -= 8 */ | |
233 | } | |
234 | r3 = and(r6,#31) /* re-check the 32-byte offset */ | |
235 | if (r3==#0) jump:nt .L17 /* aligned now */ | |
236 | { | |
237 | memd(r6++#8) = r5:4 /* second 8-byte step */ | |
238 | r2 = add(r2,#-8) /* count -= 8 */ | |
239 | } | |
240 | r3 = and(r6,#31) /* re-check the 32-byte offset */ | |
241 | if (r3==#0) jump:nt .L17 /* aligned now */ | |
242 | { | |
243 | memd(r6++#8) = r5:4 /* third and last step; r6 is 32-byte aligned afterwards */ | |
244 | r2 = add(r2,#-8) /* count -= 8 */ | |
245 | } | |
246 | .L17: | |
247 | { | |
248 | r3 = lsr(r2,#5) /* r3 = number of whole 32-byte blocks remaining */ | |
249 | if (r1!=#0) jump:nt .L18 /* fill value register non-zero: use the store loop at .L18 */ | |
250 | } | |
251 | { | |
252 | r8 = r3 /* NOTE(review): r8 is overwritten at .L14 before any read - looks like a leftover, confirm */ | |
253 | r3 = r6 /* NOTE(review): the new r3 is not read again on this path - looks like a leftover, confirm */ | |
254 | loop0(.L46,r3) /* loop count is the r3 value from before this packet, i.e. the block count */ | |
255 | } | |
256 | .falign | |
257 | .L46: | |
258 | { | |
259 | dczeroa(r6) /* zero fill: clear the 32-byte block at r6 (r6 was aligned to 32 bytes above) */ | |
260 | r6 = add(r6,#32) /* advance one block */ | |
261 | r2 = add(r2,#-32) /* count -= 32 */ | |
262 | }:endloop0 /* start=.L46 */ | |
263 | .L14: | |
264 | { | |
265 | p0 = cmp.gtu(r2,#7) /* at least one doubleword left? */ | |
266 | if (!p0.new) jump:nt .L28 /* no: only the tail remains */ | |
267 | r8 = lsr(r2,#3) /* r8 = number of whole doublewords remaining */ | |
268 | } | |
269 | loop0(.L44,r8) /* doubleword loop, r8 iterations */ | |
270 | .falign | |
271 | .L44: | |
272 | { | |
273 | memd(r6++#8) = r5:4 /* store eight fill bytes */ | |
274 | r2 = add(r2,#-8) /* count -= 8 */ | |
275 | }:endloop0 /* start=.L44 */ | |
276 | .L28: | |
277 | { | |
278 | p0 = tstbit(r2,#2) /* trailing word needed? */ | |
279 | if (!p0.new) jump:nt .L33 /* no */ | |
280 | } | |
281 | { | |
282 | r2 = add(r2,#-4) /* count -= 4 */ | |
283 | memw(r6++#4) = r7 /* trailing word */ | |
284 | } | |
285 | .L33: | |
286 | { | |
287 | p0 = tstbit(r2,#1) /* trailing halfword needed? */ | |
288 | if (!p0.new) jump:nt .L35 /* no */ | |
289 | } | |
290 | { | |
291 | r2 = add(r2,#-2) /* count -= 2 */ | |
292 | memh(r6++#2) = r7 /* trailing halfword */ | |
293 | } | |
294 | .L35: | |
295 | p0 = cmp.eq(r2,#1) /* one odd byte left? */ | |
296 | if (p0) memb(r6) = r1 /* store it (low byte of r1) */ | |
297 | .L1: | |
298 | jumpr r31 /* common return; r0 still holds the original destination */ | |
299 | .L18: | |
300 | loop0(.L45,r3) /* non-zero fill: r3 = 32-byte block count computed at .L17 */ | |
301 | .falign | |
302 | .L45: | |
303 | dczeroa(r6) /* NOTE(review): presumably issued so the block is allocated in cache without being fetched, since the four stores below overwrite all 32 bytes - confirm */ | |
304 | { | |
305 | memd(r6++#8) = r5:4 /* bytes 0-7 of the block */ | |
306 | r2 = add(r2,#-32) /* count -= 32, once per block */ | |
307 | } | |
308 | memd(r6++#8) = r5:4 /* bytes 8-15 */ | |
309 | memd(r6++#8) = r5:4 /* bytes 16-23 */ | |
310 | { | |
311 | memd(r6++#8) = r5:4 /* bytes 24-31 */ | |
312 | }:endloop0 /* start=.L45 */ | |
313 | jump .L14 /* finish the remaining bytes with the doubleword loop and tail */ | |
314 | HEXAGON_OPT_FUNC_FINISH memset | |
315 | #endif |