########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.text

.global xor_gen_neon
.type   xor_gen_neon, %function

/* int xor_gen_neon(int vects, int len, void **src) */

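/*
 * Reference semantics, as a hedged C sketch (the helper name below is
 * illustrative only): src holds 'vects' buffer pointers; the first vects-1
 * buffers are XORed together and the result is written to the last one,
 * src[vects-1]. The routine returns 0.
 *
 *      int xor_gen_sketch(int vects, int len, void **src)
 *      {
 *              unsigned char **s = (unsigned char **)src;
 *              unsigned char *dst = s[vects - 1];
 *              int i, j;
 *
 *              for (i = 0; i < len; i++) {
 *                      unsigned char x = s[0][i];
 *
 *                      for (j = 1; j < vects - 1; j++)
 *                              x ^= s[j][i];
 *                      dst[i] = x;
 *              }
 *              return 0;
 *      }
 */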
/* arguments */
w_vects         .req    w0 /* must be >= 2 */
x_vects         .req    x0
w_len           .req    w1
x_len           .req    x1
x_src           .req    x2

/* returns */
w_ret           .req    w0

/* local variables */
w_in            .req    w1 /* share w_len */
x_src0          .req    x3
x_src0_end      .req    x4
w_len256        .req    w5 /* share w_len16, w_xor */
x_len256        .req    x5
w_len16         .req    w5
x_len16         .req    x5
w_xor           .req    w5
w_col           .req    w6
x_col           .req    x6
x_src_ptr       .req    x7
x_srcn          .req    x9
x_dst           .req    x10
x_dst_ptr       .req    x11
/* v0 ~ v15: temporary results */
/* v16 ~ v31: next 256 bytes */

/*
 *             +----------+              +------------------+
 * src ------> |  src[0]  | -- src0 ---> |      buffer      | src0_end
 *    ---------+----------+              +------------------+
 *    .        |  ......  |
 *    .        +----------+              +------------------+
 * src_ptr ~~> |  src[n]  | -- srcn ~~~> |      buffer      |
 *    .        +----------+              +------------------+
 *    .        |  ......  |
 *    .        +----------+
 *    .        | src[v-2] |
 *    ---------+----------+              +------------------+
 * dst_ptr --> | src[v-1] | -- dst ----> |      buffer      |
 *             +----------+              +------------------+
 */

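/*
 * The data is consumed in three passes of decreasing granularity: 256-byte
 * NEON blocks (.Lloop256), 16-byte blocks (.Lloop16) and single bytes
 * (.Lloop1). src[0] is walked directly through x_src0; every other buffer
 * is addressed as base + x_col, the shared byte offset.
 */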
xor_gen_neon:
        add     x_dst_ptr, x_src, x_vects, lsl #3
        ldr     x_dst, [x_dst_ptr, #-8]!
        ldr     x_src0, [x_src]
        add     x_src0_end, x_src0, x_len

        sub     w_vects, w_vects, #2
        mov     w_col, #0
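        /*
         * w_vects now counts only the sources between src[0] and the parity
         * buffer (i.e. src[1] .. src[vects-2]); w_col tracks the byte offset
         * processed so far, shared by all buffers.
         */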

.Lloop256_init:
        /* len256 = len - len%256; len %= 256 */
        mov     w_len256, w_len
        and     w_len, w_len, #0xFF
        sub     w_len256, w_len256, w_len
        /* less than 256 bytes? */
        cbz     w_len256, .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]
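        /*
         * v8 ~ v15 are callee-saved under AAPCS64 (only their low 64 bits,
         * hence the d-register stores); the wide loop below uses all of
         * v0 ~ v31, so they must be preserved here.
         */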

        sub     x_src0_end, x_src0_end, #256
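        /*
         * Pull the end pointer back by 256 so the cmp/bls at the bottom of
         * .Lloop256 keeps looping only while another full 256-byte chunk
         * still fits before this adjusted end.
         */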

/* batch process (vects-1)*256 bytes */
.Lloop256:
        ldr     q0, [x_src0, #16*0]
        ldr     q1, [x_src0, #16*1]
        ldr     q2, [x_src0, #16*2]
        ldr     q3, [x_src0, #16*3]
        ldr     q4, [x_src0, #16*4]
        ldr     q5, [x_src0, #16*5]
        ldr     q6, [x_src0, #16*6]
        ldr     q7, [x_src0, #16*7]
        ldr     q8, [x_src0, #16*8]
        ldr     q9, [x_src0, #16*9]
        ldr     q10, [x_src0, #16*10]
        ldr     q11, [x_src0, #16*11]
        ldr     q12, [x_src0, #16*12]
        ldr     q13, [x_src0, #16*13]
        ldr     q14, [x_src0, #16*14]
        ldr     q15, [x_src0, #16*15]
        add     x_src0, x_src0, #256

        cbz     w_vects, .Lloop256_vects_end
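        /*
         * Walk the remaining sources src[1] .. src[vects-2]; x_src_ptr is
         * compared against x_dst_ptr (&src[vects-1]) to terminate the loop.
         */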

        add     x_src_ptr, x_src, #8
.Lloop256_vects:
        ldr     x_srcn, [x_src_ptr], #8
        add     x_srcn, x_srcn, x_col
        cmp     x_src_ptr, x_dst_ptr

        ldr     q16, [x_srcn, #16*0]
        ldr     q17, [x_srcn, #16*1]
        ldr     q18, [x_srcn, #16*2]
        ldr     q19, [x_srcn, #16*3]
        ldr     q20, [x_srcn, #16*4]
        ldr     q21, [x_srcn, #16*5]
        ldr     q22, [x_srcn, #16*6]
        ldr     q23, [x_srcn, #16*7]
        ldr     q24, [x_srcn, #16*8]
        ldr     q25, [x_srcn, #16*9]
        ldr     q26, [x_srcn, #16*10]
        ldr     q27, [x_srcn, #16*11]
        ldr     q28, [x_srcn, #16*12]
        ldr     q29, [x_srcn, #16*13]
        ldr     q30, [x_srcn, #16*14]
        ldr     q31, [x_srcn, #16*15]

        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v2.16b, v2.16b, v18.16b
        eor     v3.16b, v3.16b, v19.16b
        eor     v4.16b, v4.16b, v20.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v6.16b, v6.16b, v22.16b
        eor     v7.16b, v7.16b, v23.16b
        eor     v8.16b, v8.16b, v24.16b
        eor     v9.16b, v9.16b, v25.16b
        eor     v10.16b, v10.16b, v26.16b
        eor     v11.16b, v11.16b, v27.16b
        eor     v12.16b, v12.16b, v28.16b
        eor     v13.16b, v13.16b, v29.16b
        eor     v14.16b, v14.16b, v30.16b
        eor     v15.16b, v15.16b, v31.16b

        bne     .Lloop256_vects

.Lloop256_vects_end:
        str     q0, [x_dst, #16*0]
        str     q1, [x_dst, #16*1]
        str     q2, [x_dst, #16*2]
        str     q3, [x_dst, #16*3]
        str     q4, [x_dst, #16*4]
        str     q5, [x_dst, #16*5]
        str     q6, [x_dst, #16*6]
        str     q7, [x_dst, #16*7]
        str     q8, [x_dst, #16*8]
        str     q9, [x_dst, #16*9]
        str     q10, [x_dst, #16*10]
        str     q11, [x_dst, #16*11]
        str     q12, [x_dst, #16*12]
        str     q13, [x_dst, #16*13]
        str     q14, [x_dst, #16*14]
        str     q15, [x_dst, #16*15]

        cmp     x_src0, x_src0_end
        add     x_dst, x_dst, #256
        add     w_col, w_col, #256
        bls     .Lloop256

.Lloop256_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64

        add     x_src0_end, x_src0_end, #256
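        /* x_src0_end again points one byte past the end of src[0]'s data */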

.Lloop16_init:
        /* len16 = len - len%16; len %= 16 */
        mov     w_len16, w_len
        and     w_len, w_len, #0xF
        sub     w_len16, w_len16, w_len

        /* less than 16 bytes? */
        cbz     w_len16, .Lloop1_init

        sub     x_src0_end, x_src0_end, #16
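        /* same end-pointer adjustment as the 256-byte loop, now with 16-byte steps */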

/* batch process (vects-1)*16 bytes */
.Lloop16:
        ldr     q0, [x_src0], #16
        cbz     w_vects, .Lloop16_vects_end

        add     x_src_ptr, x_src, #8
.Lloop16_vects:
        ldr     x_srcn, [x_src_ptr], #8
        cmp     x_src_ptr, x_dst_ptr
        ldr     q1, [x_srcn, x_col]
        eor     v0.16b, v0.16b, v1.16b
        bne     .Lloop16_vects

.Lloop16_vects_end:
        cmp     x_src0, x_src0_end
        str     q0, [x_dst], #16
        add     w_col, w_col, #16
        bls     .Lloop16

.Lloop16_end:
        add     x_src0_end, x_src0_end, #16

.Lloop1_init:
        cbnz    w_len, .Lloop1
        mov     w_ret, #0
        ret

/* batch process (vects-1)*1 bytes */
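/*
 * The byte-wide tail advances x_src0 one byte per iteration and ends exactly
 * when x_src0 == x_src0_end, hence the bne (rather than bls) at the bottom.
 */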
.Lloop1:
        ldrb    w_xor, [x_src0], #1
        cbz     w_vects, .Lloop1_vects_end

        add     x_src_ptr, x_src, #8
.Lloop1_vects:
        ldr     x_srcn, [x_src_ptr], #8
        cmp     x_src_ptr, x_dst_ptr
        ldrb    w_in, [x_srcn, x_col]
        eor     w_xor, w_xor, w_in
        bne     .Lloop1_vects

.Lloop1_vects_end:
        cmp     x_src0, x_src0_end
        strb    w_xor, [x_dst], #1
        add     w_col, w_col, #1
        bne     .Lloop1

.Lloop1_end:
        mov     w_ret, #0
        ret