########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.text

.global xor_gen_neon
.type   xor_gen_neon, %function

/* int xor_gen_neon(int vects, int len, void **src) */

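/*
 * Reference semantics, as a hedged C sketch (the helper name below is
 * illustrative only): src holds 'vects' buffer pointers; the first vects-1
 * buffers are XORed together and the result is written to the last one,
 * src[vects-1]. The routine returns 0.
 *
 *      int xor_gen_sketch(int vects, int len, void **src)
 *      {
 *              unsigned char **s = (unsigned char **)src;
 *              unsigned char *dst = s[vects - 1];
 *              int i, j;
 *
 *              for (i = 0; i < len; i++) {
 *                      unsigned char x = s[0][i];
 *
 *                      for (j = 1; j < vects - 1; j++)
 *                              x ^= s[j][i];
 *                      dst[i] = x;
 *              }
 *              return 0;
 *      }
 */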
/* arguments */
w_vects         .req    w0 /* must be >= 2 */
x_vects         .req    x0
w_len           .req    w1
x_len           .req    x1
x_src           .req    x2

/* returns */
w_ret           .req    w0

/* local variables */
w_in            .req    w1 /* share w_len */
x_src0          .req    x3
x_src0_end      .req    x4
w_len256        .req    w5 /* share w_len16, w_xor */
x_len256        .req    x5
w_len16         .req    w5
x_len16         .req    x5
w_xor           .req    w5
w_col           .req    w6
x_col           .req    x6
x_src_ptr       .req    x7
x_srcn          .req    x9
x_dst           .req    x10
x_dst_ptr       .req    x11
/* v0 ~ v15: temporary results */
/* v16 ~ v31: next 256 bytes */

/*
 *             +----------+              +------------------+
 * src ------> |  src[0]  | -- src0 ---> |      buffer      | src0_end
 *    ---------+----------+              +------------------+
 *    .        |  ......  |
 *    .        +----------+              +------------------+
 * src_ptr ~~> |  src[n]  | -- srcn ~~~> |      buffer      |
 *    .        +----------+              +------------------+
 *    .        |  ......  |
 *    .        +----------+
 *    .        | src[v-2] |
 *    ---------+----------+              +------------------+
 * dst_ptr --> | src[v-1] | -- dst ----> |      buffer      |
 *             +----------+              +------------------+
 */

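/*
 * The data is consumed in three passes of decreasing granularity: 256-byte
 * NEON blocks (.Lloop256), 16-byte blocks (.Lloop16) and single bytes
 * (.Lloop1). src[0] is walked directly through x_src0; every other buffer
 * is addressed as base + x_col, the shared byte offset.
 */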
xor_gen_neon:
        add     x_dst_ptr, x_src, x_vects, lsl #3
        ldr     x_dst, [x_dst_ptr, #-8]!
        ldr     x_src0, [x_src]
        add     x_src0_end, x_src0, x_len

        sub     w_vects, w_vects, #2
        mov     w_col, #0
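        /*
         * w_vects now counts only the sources between src[0] and the parity
         * buffer (i.e. src[1] .. src[vects-2]); w_col tracks the byte offset
         * processed so far, shared by all buffers.
         */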

.Lloop256_init:
        /* len256 = len - len%256; len %= 256 */
        mov     w_len256, w_len
        and     w_len, w_len, #0xFF
        sub     w_len256, w_len256, w_len
        /* less than 256 bytes? */
        cbz     w_len256, .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]
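        /*
         * v8 ~ v15 are callee-saved under AAPCS64 (only their low 64 bits,
         * hence the d-register stores); the wide loop below uses all of
         * v0 ~ v31, so they must be preserved here.
         */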

        sub     x_src0_end, x_src0_end, #256
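        /*
         * Pull the end pointer back by 256 so the cmp/bls at the bottom of
         * .Lloop256 keeps looping only while another full 256-byte chunk
         * still fits before this adjusted end.
         */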

/* batch process (vects-1)*256 bytes */
.Lloop256:
        ldr     q0, [x_src0, #16*0]
        ldr     q1, [x_src0, #16*1]
        ldr     q2, [x_src0, #16*2]
        ldr     q3, [x_src0, #16*3]
        ldr     q4, [x_src0, #16*4]
        ldr     q5, [x_src0, #16*5]
        ldr     q6, [x_src0, #16*6]
        ldr     q7, [x_src0, #16*7]
        ldr     q8, [x_src0, #16*8]
        ldr     q9, [x_src0, #16*9]
        ldr     q10, [x_src0, #16*10]
        ldr     q11, [x_src0, #16*11]
        ldr     q12, [x_src0, #16*12]
        ldr     q13, [x_src0, #16*13]
        ldr     q14, [x_src0, #16*14]
        ldr     q15, [x_src0, #16*15]
        add     x_src0, x_src0, #256

        cbz     w_vects, .Lloop256_vects_end
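        /*
         * Walk the remaining sources src[1] .. src[vects-2]; x_src_ptr is
         * compared against x_dst_ptr (&src[vects-1]) to terminate the loop.
         */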

        add     x_src_ptr, x_src, #8
.Lloop256_vects:
        ldr     x_srcn, [x_src_ptr], #8
        add     x_srcn, x_srcn, x_col
        cmp     x_src_ptr, x_dst_ptr

        ldr     q16, [x_srcn, #16*0]
        ldr     q17, [x_srcn, #16*1]
        ldr     q18, [x_srcn, #16*2]
        ldr     q19, [x_srcn, #16*3]
        ldr     q20, [x_srcn, #16*4]
        ldr     q21, [x_srcn, #16*5]
        ldr     q22, [x_srcn, #16*6]
        ldr     q23, [x_srcn, #16*7]
        ldr     q24, [x_srcn, #16*8]
        ldr     q25, [x_srcn, #16*9]
        ldr     q26, [x_srcn, #16*10]
        ldr     q27, [x_srcn, #16*11]
        ldr     q28, [x_srcn, #16*12]
        ldr     q29, [x_srcn, #16*13]
        ldr     q30, [x_srcn, #16*14]
        ldr     q31, [x_srcn, #16*15]

        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v2.16b, v2.16b, v18.16b
        eor     v3.16b, v3.16b, v19.16b
        eor     v4.16b, v4.16b, v20.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v6.16b, v6.16b, v22.16b
        eor     v7.16b, v7.16b, v23.16b
        eor     v8.16b, v8.16b, v24.16b
        eor     v9.16b, v9.16b, v25.16b
        eor     v10.16b, v10.16b, v26.16b
        eor     v11.16b, v11.16b, v27.16b
        eor     v12.16b, v12.16b, v28.16b
        eor     v13.16b, v13.16b, v29.16b
        eor     v14.16b, v14.16b, v30.16b
        eor     v15.16b, v15.16b, v31.16b

        bne     .Lloop256_vects

.Lloop256_vects_end:
        str     q0, [x_dst, #16*0]
        str     q1, [x_dst, #16*1]
        str     q2, [x_dst, #16*2]
        str     q3, [x_dst, #16*3]
        str     q4, [x_dst, #16*4]
        str     q5, [x_dst, #16*5]
        str     q6, [x_dst, #16*6]
        str     q7, [x_dst, #16*7]
        str     q8, [x_dst, #16*8]
        str     q9, [x_dst, #16*9]
        str     q10, [x_dst, #16*10]
        str     q11, [x_dst, #16*11]
        str     q12, [x_dst, #16*12]
        str     q13, [x_dst, #16*13]
        str     q14, [x_dst, #16*14]
        str     q15, [x_dst, #16*15]

        cmp     x_src0, x_src0_end
        add     x_dst, x_dst, #256
        add     w_col, w_col, #256
        bls     .Lloop256

.Lloop256_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64

        add     x_src0_end, x_src0_end, #256
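        /* x_src0_end again points one byte past the end of src[0]'s data */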

.Lloop16_init:
        /* len16 = len - len%16; len %= 16 */
        mov     w_len16, w_len
        and     w_len, w_len, #0xF
        sub     w_len16, w_len16, w_len

        /* less than 16 bytes? */
        cbz     w_len16, .Lloop1_init

        sub     x_src0_end, x_src0_end, #16
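        /* same end-pointer adjustment as the 256-byte loop, now with 16-byte steps */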

/* batch process (vects-1)*16 bytes */
.Lloop16:
        ldr     q0, [x_src0], #16
        cbz     w_vects, .Lloop16_vects_end

        add     x_src_ptr, x_src, #8
.Lloop16_vects:
        ldr     x_srcn, [x_src_ptr], #8
        cmp     x_src_ptr, x_dst_ptr
        ldr     q1, [x_srcn, x_col]
        eor     v0.16b, v0.16b, v1.16b
        bne     .Lloop16_vects

.Lloop16_vects_end:
        cmp     x_src0, x_src0_end
        str     q0, [x_dst], #16
        add     w_col, w_col, #16
        bls     .Lloop16

.Lloop16_end:
        add     x_src0_end, x_src0_end, #16

.Lloop1_init:
        cbnz    w_len, .Lloop1
        mov     w_ret, #0
        ret

/* batch process (vects-1)*1 bytes */
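/*
 * The byte-wide tail advances x_src0 one byte per iteration and ends exactly
 * when x_src0 == x_src0_end, hence the bne (rather than bls) at the bottom.
 */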
.Lloop1:
        ldrb    w_xor, [x_src0], #1
        cbz     w_vects, .Lloop1_vects_end

        add     x_src_ptr, x_src, #8
.Lloop1_vects:
        ldr     x_srcn, [x_src_ptr], #8
        cmp     x_src_ptr, x_dst_ptr
        ldrb    w_in, [x_srcn, x_col]
        eor     w_xor, w_xor, w_in
        bne     .Lloop1_vects

.Lloop1_vects_end:
        cmp     x_src0, x_src0_end
        strb    w_xor, [x_dst], #1
        add     w_col, w_col, #1
        bne     .Lloop1

.Lloop1_end:
        mov     w_ret, #0
        ret