ceph/src/spdk/isa-l/raid/xor_gen_avx.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;; Optimized xor of N source vectors using AVX
  31 ;;; int xor_gen_avx(int vects, int len, void **array)
  32
  33 ;;; Generates xor parity vector from N (vects-1) sources in array of pointers
  34 ;;; (**array).  Last pointer is the dest.
  35 ;;; Vectors must be aligned to 32 bytes.  Length can be any value.
  36
  37 %include "reg_sizes.asm"
  38
  39 %ifidn __OUTPUT_FORMAT__, elf64
  40  %define arg0  rdi
  41  %define arg1  rsi
  42  %define arg2  rdx
  43  %define arg3  rcx
  44  %define arg4  r8
  45  %define arg5  r9
  46  %define tmp   r11
  47  %define tmp3  arg4
  48  %define func(x) x:
  49  %define return rax
  50  %define FUNC_SAVE
  51  %define FUNC_RESTORE
  52
  53 %elifidn __OUTPUT_FORMAT__, win64
  54  %define arg0  rcx
  55  %define arg1  rdx
  56  %define arg2  r8
  57  %define arg3  r9
  58  %define tmp   r11
  59  %define tmp3  r10
  60  %define func(x) proc_frame x
  61  %define return rax
  62  %define stack_size  2*32 + 8   ;must be an odd multiple of 8
  63
  64  %macro FUNC_SAVE 0
  65         alloc_stack     stack_size
  66         vmovdqu [rsp + 0*32], ymm6
  67         vmovdqu [rsp + 1*32], ymm7
  68         end_prolog
  69  %endmacro
  70  %macro FUNC_RESTORE 0
  71         vmovdqu ymm6, [rsp + 0*32]
  72         vmovdqu ymm7, [rsp + 1*32]
  73         add     rsp, stack_size
  74  %endmacro
  75
  76 %endif  ;output formats
  77
  78
  79 %define vec arg0
  80 %define len arg1
  81 %define ptr arg3
  82 %define tmp2 rax
  83 %define tmp2.b al
  84 %define pos tmp3
  85 %define PS 8
  86
  87 ;;; Use Non-temporal load/stor
  88 %ifdef NO_NT_LDST
  89  %define XLDR vmovdqa
  90  %define XSTR vmovdqa
  91 %else
  92  %define XLDR vmovdqa
  93  %define XSTR vmovntdq
  94 %endif
  95
  96
  97 default rel
  98 [bits 64]
  99
 100 section .text
 101
 102 align 16
 103 global xor_gen_avx:function
 104 func(xor_gen_avx)
 105
 106         FUNC_SAVE
 107         sub     vec, 2                  ;Keep as offset to last source
 108         jng     return_fail             ;Must have at least 2 sources
 109         cmp     len, 0
 110         je      return_pass
 111         test    len, (128-1)            ;Check alignment of length
 112         jnz     len_not_aligned
 113
 114
 115 len_aligned_128bytes:
 116         sub     len, 128
 117         mov     pos, 0
 118
 119 loop128:
 120         mov     tmp, vec                ;Back to last vector
 121         mov     tmp2, [arg2+vec*PS]     ;Fetch last pointer in array
 122         sub     tmp, 1                  ;Next vect
 123         XLDR    ymm0, [tmp2+pos]        ;Start with end of array in last vector
 124         XLDR    ymm1, [tmp2+pos+32]     ;Keep xor parity in xmm0-7
 125         XLDR    ymm2, [tmp2+pos+(2*32)]
 126         XLDR    ymm3, [tmp2+pos+(3*32)]
 127
 128 next_vect:
 129         mov     ptr, [arg2+tmp*PS]
 130         sub     tmp, 1
 131         XLDR    ymm4, [ptr+pos]         ;Get next vector (source)
 132         XLDR    ymm5, [ptr+pos+32]
 133         XLDR    ymm6, [ptr+pos+(2*32)]
 134         XLDR    ymm7, [ptr+pos+(3*32)]
 135         vxorpd  ymm0, ymm0, ymm4        ;Add to xor parity
 136         vxorpd  ymm1, ymm1, ymm5
 137         vxorpd  ymm2, ymm2, ymm6
 138         vxorpd  ymm3, ymm3, ymm7
 139         jge     next_vect               ;Loop for each source
 140
 141         mov     ptr, [arg2+PS+vec*PS]   ;Address of parity vector
 142         XSTR    [ptr+pos], ymm0         ;Write parity xor vector
 143         XSTR    [ptr+pos+(1*32)], ymm1
 144         XSTR    [ptr+pos+(2*32)], ymm2
 145         XSTR    [ptr+pos+(3*32)], ymm3
 146         add     pos, 128
 147         cmp     pos, len
 148         jle     loop128
 149
 150 return_pass:
 151         FUNC_RESTORE
 152         mov     return, 0
 153         ret
 154
 155
 156 ;;; Do one byte at a time for no alignment case
 157 loop_1byte:
 158         mov     tmp, vec                ;Back to last vector
 159         mov     ptr, [arg2+vec*PS]      ;Fetch last pointer in array
 160         mov     tmp2.b, [ptr+len-1]     ;Get array n
 161         sub     tmp, 1
 162 nextvect_1byte:
 163         mov     ptr, [arg2+tmp*PS]
 164         xor     tmp2.b, [ptr+len-1]
 165         sub     tmp, 1
 166         jge     nextvect_1byte
 167
 168         mov     tmp, vec
 169         add     tmp, 1                  ;Add back to point to last vec
 170         mov     ptr, [arg2+tmp*PS]
 171         mov     [ptr+len-1], tmp2.b     ;Write parity
 172         sub     len, 1
 173         test    len, (PS-1)
 174         jnz     loop_1byte
 175
 176         cmp     len, 0
 177         je      return_pass
 178         test    len, (128-1)            ;If not 0 and 128bit aligned
 179         jz      len_aligned_128bytes    ; then do aligned case. len = y * 128
 180
 181         ;; else we are 8-byte aligned so fall through to recheck
 182
 183
 184         ;; Unaligned length cases
 185 len_not_aligned:
 186         test    len, (PS-1)
 187         jne     loop_1byte
 188         mov     tmp3, len
 189         and     tmp3, (128-1)           ;Do the unaligned bytes 8 at a time
 190
 191         ;; Run backwards 8 bytes at a time for (tmp3) bytes
 192 loop8_bytes:
 193         mov     tmp, vec                ;Back to last vector
 194         mov     ptr, [arg2+vec*PS]      ;Fetch last pointer in array
 195         mov     tmp2, [ptr+len-PS]      ;Get array n
 196         sub     tmp, 1
 197 nextvect_8bytes:
 198         mov     ptr, [arg2+tmp*PS]      ;Get pointer to next vector
 199         xor     tmp2, [ptr+len-PS]
 200         sub     tmp, 1
 201         jge     nextvect_8bytes         ;Loop for each source
 202
 203         mov     tmp, vec
 204         add     tmp, 1                  ;Add back to point to last vec
 205         mov     ptr, [arg2+tmp*PS]
 206         mov     [ptr+len-PS], tmp2      ;Write parity
 207         sub     len, PS
 208         sub     tmp3, PS
 209         jg      loop8_bytes
 210
 211         cmp     len, 128                ;Now len is aligned to 128B
 212         jge     len_aligned_128bytes    ;We can do the rest aligned
 213
 214         cmp     len, 0
 215         je      return_pass
 216
 217 return_fail:
 218         FUNC_RESTORE
 219         mov     return, 1
 220         ret
 221
 222 endproc_frame
 223
 224 section .data
 225
 226 ;;;       func         core, ver, snum
 227 slversion xor_gen_avx, 02,   05,  0037
 228