ceph/src/isa-l/raid/pq_gen_avx512.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;; Optimized pq of N source vectors using AVX512
  31 ;;; int pq_gen_avx512(int vects, int len, void **array)
  32
  33 ;;; Generates the P and Q parity vectors from N (vects-2) sources in an array of
  34 ;;; pointers (**array).  The last two pointers are the P and Q destinations, respectively.
  35 ;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined.
  36 ;;; Length must be a multiple of 32 bytes.
  37
  38 %include "reg_sizes.asm"
  39
  40 %ifdef HAVE_AS_KNOWS_AVX512
  41
  42 %ifidn __OUTPUT_FORMAT__, elf64
     ;;; elf64 output format: name the System V AMD64 integer argument registers
     ;;; and give the routine a plain-label entry with no prologue or epilogue.
  43  %define arg0  rdi                ; 1st integer argument register
  44  %define arg1  rsi                ; 2nd integer argument register
  45  %define arg2  rdx                ; 3rd integer argument register
  46  %define arg3  rcx                ; 4th integer argument register
  47  %define arg4  r8                 ; 5th integer argument register
  48  %define arg5  r9                 ; 6th integer argument register
  49  %define tmp   r11                ; caller-saved scratch register
  50  %define tmp3  arg4               ; alias of arg4 (r8); no use of tmp3 is visible in this file
  51  %define return rax               ; register carrying the return value
  52  %define func(x) x:               ; a function entry is just a label for this format
  53  %define FUNC_SAVE                ; empty: no prologue is emitted for this format
  54  %define FUNC_RESTORE             ; empty: no epilogue is emitted for this format
  55 %endif
  56
  57 %ifidn __OUTPUT_FORMAT__, win64
     ;;; win64 output format: name the Microsoft x64 integer argument registers and
     ;;; define a prologue/epilogue pair that preserves xmm6-xmm9.  Those registers
     ;;; are callee-saved on this ABI, and the routine maps xtmp2, xs2, xzero and
     ;;; xpoly onto zmm6-zmm9.
     ;;; NOTE(review): proc_frame, alloc_stack and end_prolog are not defined in this
     ;;; file; presumably they are the assembler's Win64 unwind directives - confirm.
  58  %define arg0  rcx                ; 1st integer argument register
  59  %define arg1  rdx                ; 2nd integer argument register
  60  %define arg2  r8                 ; 3rd integer argument register
  61  %define arg3  r9                 ; 4th integer argument register
  62  %define tmp   r11                ; scratch register
  63  %define tmp3  r10                ; second scratch; no use of tmp3 is visible in this file
  64  %define return rax               ; register carrying the return value
  65  %define stack_size  4*16 + 8   ; four 16-byte xmm save slots + 8; must be an odd multiple of 8
  66  %define func(x) proc_frame x     ; a function entry opens a proc_frame
  67  %macro FUNC_SAVE 0
     ;;; Prologue: reserve stack_size bytes and spill xmm6-xmm9 into the four slots.
  68         alloc_stack     stack_size          ; reserve the save area
  69         vmovdqu [rsp + 0*16], xmm6          ; unaligned stores: slot alignment is not relied on
  70         vmovdqu [rsp + 1*16], xmm7
  71         vmovdqu [rsp + 2*16], xmm8
  72         vmovdqu [rsp + 3*16], xmm9
  73         end_prolog
  74  %endmacro
  75
  76  %macro FUNC_RESTORE 0
     ;;; Epilogue: reload xmm6-xmm9 from the same slots, then release the save area.
  77         vmovdqu xmm6, [rsp + 0*16]
  78         vmovdqu xmm7, [rsp + 1*16]
  79         vmovdqu xmm8, [rsp + 2*16]
  80         vmovdqu xmm9, [rsp + 3*16]
  81         add     rsp, stack_size             ; undo alloc_stack
  82  %endmacro
  83 %endif
  84
  85 %define vec    arg0    ; in: vects; after the entry adjustment, index of the last source pointer
  86 %define len    arg1    ; in: len, bytes per vector
  87 %define ptr    arg3    ; scratch: current source pointer, later the P destination pointer
  88 %define pos    rax     ; byte offset into every vector (same register as `return`)
  89
  90 %define xp1    zmm0    ; P accumulator, bytes 0-63 of the current 128-byte block
  91 %define xq1    zmm1    ; Q accumulator, bytes 0-63
  92 %define xtmp1  zmm2    ; per-byte xor value for q1: xpoly byte or zero
  93 %define xs1    zmm3    ; source data, bytes 0-63
  94
  95 %define xp2    zmm4    ; P accumulator, bytes 64-127 of the current 128-byte block
  96 %define xq2    zmm5    ; Q accumulator, bytes 64-127
  97 %define xtmp2  zmm6    ; per-byte xor value for q2: xpoly byte or zero
  98 %define xs2    zmm7    ; source data, bytes 64-127
  99
 100 %define xzero  zmm8    ; all-zero vector
 101 %define xpoly  zmm9    ; 0x1d broadcast to every byte
 102
     ;;; 256-bit views of the same registers, used by the 32-byte loop.
 103 %define xp1y   ymm0
 104 %define xq1y   ymm1
 105 %define xtmp1y ymm2
 106 %define xs1y   ymm3
 107 %define xzeroy ymm8
 108 %define xpolyy ymm9
 109
 110 %define NO_NT_LDST     ; defined unconditionally, so the first branch below is always taken
 111 ;;; Select the load/store instructions.  The non-temporal forms are assembled
     ;;; only when NO_NT_LDST is undefined, i.e. when the define above is removed.
 112 %ifdef NO_NT_LDST
 113  %define XLDR vmovdqu8          ; unaligned load, byte-granular (u8) form
 114  %define XSTR vmovdqu8          ; unaligned store
 115 %else
 116  %define XLDR vmovntdqa         ; non-temporal load
 117  %define XSTR vmovntdq          ; non-temporal store
 118 %endif
 119
 120 default rel                     ; memory operands without a base register default to RIP-relative
 121
 122 [bits 64]                       ; assemble 64-bit code
 123 section .text                   ; the routine below is placed in the code section
 124
 125 align 16
     ;;; ---------------------------------------------------------------------
     ;;; int pq_gen_avx512(int vects, int len, void **array)
     ;;;
     ;;; In:   vec (arg0) = vects, number of pointers in array
     ;;;       len (arg1) = bytes per vector
     ;;;       arg2       = array: the sources first, then the P and Q destinations
     ;;; Out:  rax = 0 on success (also when len == 0)
     ;;;       rax = 1 when there are fewer than 2 sources or len is not a
     ;;;               multiple of 32
     ;;; Uses: rax (pos), ptr, tmp, zmm0-zmm9, k1, k2, flags; vec and len are
     ;;;       modified in place.
     ;;;
     ;;; For every byte position, walking the sources from the last one down to
     ;;; source 0:   P ^= s   and   Q = 2*Q ^ s
     ;;; where 2*Q is a per-byte left shift by one, followed by an xor with 0x1d
     ;;; in every byte whose top bit was set before the shift.
     ;;; ---------------------------------------------------------------------
 126 global pq_gen_avx512:function
 127 func(pq_gen_avx512)
 128         FUNC_SAVE
 129         sub     vec, 3                  ;vec = vects-3: index of the last source pointer
 130         jng     return_fail             ;Must have at least 2 sources
 131         cmp     len, 0
 132         je      return_pass             ;Zero length: nothing to do, report success
 133         test    len, (32-1)             ;Check alignment of length
 134         jnz     return_fail             ;Length is not a multiple of 32
 135         mov     pos, 0                  ;Start at byte offset 0
 136         mov     tmp, 0x1d               ;Constant xor-ed in after a doubling overflows a byte
 137         vpbroadcastb xpoly, tmp         ;xpoly = 0x1d in every byte
 138         vpxorq  xzero, xzero, xzero     ;xzero = 0
 139         cmp     len, 128
 140         jl      loop32                  ;Less than 128 bytes (signed compare): 32-byte loop only
 141
 142 len_aligned_32bytes:                    ;(no jump to this label is visible in this file)
 143         sub     len, 2*64               ;len -= 128, so pos <= len means a full 128-byte block remains
 144
             ;; ------------------------------
             ;; Main loop: one 128-byte block of P and Q per iteration
 145 loop128:
 146         mov     ptr, [arg2+vec*8]       ;Fetch last source pointer
 147         mov     tmp, vec                ;tmp = index of the last source; counts down to 0
 148         XLDR    xs1, [ptr+pos]          ;Preload last source, bytes 0-63
 149         XLDR    xs2, [ptr+pos+64]       ;Preload last source, bytes 64-127
 150         vpxorq  xp1, xp1, xp1           ;p1 = 0
 151         vpxorq  xp2, xp2, xp2           ;p2 = 0
 152         vpxorq  xq1, xq1, xq1           ;q1 = 0
 153         vpxorq  xq2, xq2, xq2           ;q2 = 0
 154
 155 next_vect:
 156         sub     tmp, 1                  ;Step to the next lower source; sets the flags tested by jg below
 157         mov     ptr, [arg2+tmp*8]       ; get pointer to next vect
 158         vpxorq  xq1, xq1, xs1           ; q1 ^= s1
 159         vpxorq  xq2, xq2, xs2           ; q2 ^= s2
 160         vpxorq  xp1, xp1, xs1           ; p1 ^= s1
 161         vpxorq  xp2, xp2, xs2           ; p2 ^= s2
 162         vpcmpb  k1, xq1, xzero, 1       ; k1 = bytes of q1 that are < 0 as signed, i.e. top bit set
 163         vpcmpb  k2, xq2, xzero, 1       ; k2 = same test on q2
 164         vpblendmb xtmp1 {k1}, xzero, xpoly      ; xtmp1 = 0x1d where k1 is set, else 0
 165         vpblendmb xtmp2 {k2}, xzero, xpoly      ; xtmp2 = 0x1d where k2 is set, else 0
 166         XLDR    xs1, [ptr+pos]          ; Get next vector (source data1)
 167         XLDR    xs2, [ptr+pos+64]       ; Get next vector (source data2)
 168         vpaddb  xq1, xq1, xq1           ; q1 = q1<<1
 169         vpaddb  xq2, xq2, xq2           ; q2 = q2<<1
 170         vpxorq  xq1, xq1, xtmp1         ; q1 = q1<<1 ^ poly_masked
 171         vpxorq  xq2, xq2, xtmp2         ; q2 = q2<<1 ^ poly_masked
 172         jg      next_vect               ; Loop while tmp > 0; flags are still those of sub tmp, 1
 173
             ;; tmp == 0 here: xs1/xs2 hold source 0, which is folded in without a doubling
 174         mov     ptr, [arg2+8+vec*8]     ;Get address of P parity vector
 175         mov     tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
 176         vpxorq  xp1, xp1, xs1           ;p1 ^= s1[0] - last source is already loaded
 177         vpxorq  xq1, xq1, xs1           ;q1 ^= 1 * s1[0]
 178         vpxorq  xp2, xp2, xs2           ;p2 ^= s2[0]
 179         vpxorq  xq2, xq2, xs2           ;q2 ^= 1 * s2[0]
 180         XSTR    [ptr+pos], xp1          ;Write parity P1 vector
 181         XSTR    [ptr+pos+64], xp2       ;Write parity P2 vector
 182         XSTR    [tmp+pos], xq1          ;Write parity Q1 vector
 183         XSTR    [tmp+pos+64], xq2       ;Write parity Q2 vector
 184         add     pos, 2*64               ;Advance to the next 128-byte block
 185         cmp     pos, len
 186         jle     loop128                 ;Another full 128-byte block remains
 187
 188         ;; ------------------------------
 189         ;; Handle the remaining 32, 64 or 96 bytes, if any, 32 bytes at a time
 190         add     len, 2*64               ;Restore the true length
 191         cmp     pos, len
 192         je      return_pass             ;No tail: everything was covered by loop128
 193
 194 loop32:
 195         mov     ptr, [arg2+vec*8]       ;Fetch last source pointer
 196         mov     tmp, vec                ;tmp = index of the last source; counts down to 0
 197         XLDR    xs1y, [ptr+pos]         ;Preload last vector (source), 32 bytes
 198         vpxorq  xp1y, xp1y, xp1y        ;p = 0
 199         vpxorq  xq1y, xq1y, xq1y        ;q = 0
 200
 201 next_vect32:
 202         sub     tmp, 1                  ;Step to the next lower source; sets the flags tested by jg below
 203         mov     ptr, [arg2+tmp*8]       ; get pointer to next vect
 204         vpxorq  xq1y, xq1y, xs1y        ; q1 ^= s1
 205         vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = 0x1d where the top bit of the q byte is set, else 0x00
 206         vpxorq  xp1y, xp1y, xs1y        ; p ^= s
 207         vpaddb  xq1y, xq1y, xq1y        ; q = q<<1
 208         vpxorq  xq1y, xq1y, xtmp1y      ; q = q<<1 ^ poly_masked
 209         XLDR    xs1y, [ptr+pos]         ; Get next vector (source data)
 210         jg      next_vect32             ; Loop while tmp > 0; flags are still those of sub tmp, 1
 211
 212         mov     ptr, [arg2+8+vec*8]     ;Get address of P parity vector
 213         mov     tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
 214         vpxorq  xp1y, xp1y, xs1y        ;p ^= s[0] - last source is already loaded
 215         vpxorq  xq1y, xq1y, xs1y        ;q ^= 1 * s[0]
 216         XSTR    [ptr+pos], xp1y         ;Write parity P vector
 217         XSTR    [tmp+pos], xq1y         ;Write parity Q vector
 218         add     pos, 32                 ;Advance to the next 32-byte block
 219         cmp     pos, len
 220         jl      loop32                  ;More 32-byte blocks remain
 221
 222
 223 return_pass:
 224         mov     return, 0               ;Success
 225         FUNC_RESTORE
 226         ret
 227
 228 return_fail:
 229         mov     return, 1               ;Failure: too few vectors or bad length
 230         FUNC_RESTORE
 231         ret
 232
     ;;; NOTE(review): endproc_frame presumably closes the proc_frame opened by func()
     ;;; on win64; its definition for other output formats is not in this file.
 233 endproc_frame
 234
 235 %endif  ; ifdef HAVE_AS_KNOWS_AVX512