;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized xor of N source vectors using AVX512
;;; int xor_gen_avx512(int vects, int len, void **array)

;;; Generates xor parity vector from N (vects-1) sources in array of pointers
;;; (**array). Last pointer is the dest.
;;; Vectors must be aligned to 32 bytes. Length can be any value.

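;;; A minimal usage sketch (hypothetical C caller; the buffer names are
;;; illustrative). The parity destination is passed as the last entry of
;;; the pointer array, so vects counts the sources plus the dest:
;;;
;;;     int xor_gen_avx512(int vects, int len, void **array);
;;;
;;;     void *array[4] = {src0, src1, src2, parity};  /* 3 sources + dest */
;;;     if (xor_gen_avx512(4, len, array))            /* nonzero = failure */
;;;             /* fewer than 2 sources were supplied */;
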
%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp3 arg4
%define func(x) x:
%define return rax
%define FUNC_SAVE
%define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define tmp r11
%define tmp3 r10
%define func(x) proc_frame x
%define return rax
%define stack_size 2*16 + 8     ;must be an odd multiple of 8

%macro FUNC_SAVE 0
        alloc_stack stack_size
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm7
        end_prolog
%endmacro
%macro FUNC_RESTORE 0
        vmovdqu xmm6, [rsp + 0*16]
        vmovdqu xmm7, [rsp + 1*16]
        add rsp, stack_size
%endmacro

%endif ;output formats


%define vec arg0
%define len arg1
%define ptr arg3
%define tmp2 rax
%define tmp2.b al
%define pos tmp3
%define PS 8

%define NO_NT_LDST
;;; Select temporal (default) or non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
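;;; Note: the non-temporal forms (vmovntdqa/vmovntdq) require their memory
;;; operands to be 64-byte aligned for zmm registers, while this routine
;;; only promises 32-byte-aligned vectors, so the temporal unaligned forms
;;; are the safe default here.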


default rel
[bits 64]

section .text

align 16
global xor_gen_avx512:function
func(xor_gen_avx512)
        FUNC_SAVE
        sub vec, 2              ;Keep as offset to last source
        jng return_fail         ;Must have at least 2 sources
        cmp len, 0
        je return_pass
        test len, (128-1)       ;Check alignment of length
        jnz len_not_aligned

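;; Main loop: len is a multiple of 128 here. Each iteration xors 128 bytes
;; (two zmm registers wide) across all sources, walking the pointer array
;; from the last source down to the first.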
len_aligned_128bytes:
        sub len, 128
        mov pos, 0

loop128:
        mov tmp, vec            ;Back to last vector
        mov tmp2, [arg2+vec*PS] ;Fetch last source pointer in array
        sub tmp, 1              ;Next vect
        XLDR zmm0, [tmp2+pos]   ;Start with end of array in last vector
        XLDR zmm1, [tmp2+pos+64] ;Keep xor parity in zmm0-1

next_vect:
        mov ptr, [arg2+tmp*PS]
        sub tmp, 1
        XLDR zmm4, [ptr+pos]    ;Get next vector (source)
        XLDR zmm5, [ptr+pos+64]
        vpxorq zmm0, zmm0, zmm4 ;Add to xor parity
        vpxorq zmm1, zmm1, zmm5
        jge next_vect           ;Loop for each source; flags from the
                                ; 'sub tmp, 1' survive the vector ops

        mov ptr, [arg2+PS+vec*PS] ;Address of parity (dest) vector
        XSTR [ptr+pos], zmm0    ;Write parity xor vector
        XSTR [ptr+pos+64], zmm1
        add pos, 128
        cmp pos, len
        jle loop128

return_pass:
        FUNC_RESTORE
        mov return, 0
        ret


;;; Do one byte at a time when length is not 8-byte aligned
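;;; Bytes are peeled off the tail (at len-1) until len becomes 8-byte
;;; aligned, then control returns to the aligned paths below.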
loop_1byte:
        mov tmp, vec            ;Back to last vector
        mov ptr, [arg2+vec*PS]  ;Fetch last source pointer in array
        mov tmp2.b, [ptr+len-1] ;Get last byte of last source
        sub tmp, 1
nextvect_1byte:
        mov ptr, [arg2+tmp*PS]
        xor tmp2.b, [ptr+len-1]
        sub tmp, 1
        jge nextvect_1byte

        mov tmp, vec
        add tmp, 1              ;Add back to point to last vec (the dest)
        mov ptr, [arg2+tmp*PS]
        mov [ptr+len-1], tmp2.b ;Write parity
        sub len, 1
        test len, (PS-1)
        jnz loop_1byte

        cmp len, 0
        je return_pass
        test len, (128-1)       ;If not 0 and 128-byte aligned
        jz len_aligned_128bytes ; then do aligned case. len = y * 128

        ;; else len is 8-byte aligned but not 128-byte aligned,
        ;; so fall through to the 8-byte tail loop below


;; Unaligned length cases
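;; First peel trailing bytes until len is 8-byte aligned, then handle
;; len mod 128 in 8-byte steps; the remaining multiple of 128 goes
;; through the vector loop above.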
len_not_aligned:
        test len, (PS-1)
        jne loop_1byte
        mov tmp3, len
        and tmp3, (128-1)       ;tmp3 = len mod 128, done 8 bytes at a time

;; Run backwards 8 bytes at a time for (tmp3) bytes
loop8_bytes:
        mov tmp, vec            ;Back to last vector
        mov ptr, [arg2+vec*PS]  ;Fetch last source pointer in array
        mov tmp2, [ptr+len-PS]  ;Get last qword of last source
        sub tmp, 1
nextvect_8bytes:
        mov ptr, [arg2+tmp*PS]  ;Get pointer to next vector
        xor tmp2, [ptr+len-PS]
        sub tmp, 1
        jge nextvect_8bytes     ;Loop for each source

        mov tmp, vec
        add tmp, 1              ;Add back to point to last vec (the dest)
        mov ptr, [arg2+tmp*PS]
        mov [ptr+len-PS], tmp2  ;Write parity
        sub len, PS
        sub tmp3, PS
        jg loop8_bytes

        cmp len, 128            ;Now len is aligned to 128B
        jge len_aligned_128bytes ;We can do the rest aligned

        cmp len, 0
        je return_pass


return_fail:
        FUNC_RESTORE
        mov return, 1
        ret

endproc_frame

%endif ; ifdef HAVE_AS_KNOWS_AVX512