ceph/src/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, *dest);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifdef HAVE_AS_KNOWS_AVX512
  37
  38 %ifidn __OUTPUT_FORMAT__, elf64
  39  %define arg0  rdi
  40  %define arg1  rsi
  41  %define arg2  rdx
  42  %define arg3  rcx
  43  %define arg4  r8
  44  %define arg5  r9
  45
  46  %define tmp   r11
  47  %define tmp2  r10
  48  %define return rax
  49  %define PS     8
  50  %define LOG_PS 3
  51
  52  %define func(x) x: endbranch
  53  %define FUNC_SAVE
  54  %define FUNC_RESTORE
  55 %endif
  56
  57 %ifidn __OUTPUT_FORMAT__, win64
  58  %define arg0   rcx
  59  %define arg1   rdx
  60  %define arg2   r8
  61  %define arg3   r9
  62
  63  %define arg4   r12             ; must be saved, loaded and restored
  64  %define arg5   r15             ; must be saved and restored
  65  %define tmp    r11
  66  %define tmp2   r10
  67  %define return rax
  68  %define PS     8
  69  %define LOG_PS 3
  70  %define stack_size  0*16 + 3*8         ; must be an odd multiple of 8
  71  %define arg(x)      [rsp + stack_size + PS + PS*x]
  72
  73  %define func(x) proc_frame x
  74  %macro FUNC_SAVE 0
  75         alloc_stack     stack_size
  76         save_reg        r12,  0*8
  77         save_reg        r15,  1*8
  78         end_prolog
  79         mov     arg4, arg(4)
  80  %endmacro
  81
  82  %macro FUNC_RESTORE 0
  83         mov     r12,  [rsp + 0*8]
  84         mov     r15,  [rsp + 1*8]
  85         add     rsp, stack_size
  86  %endmacro
  87 %endif
  88
  89
  90 %define len    arg0
  91 %define vec    arg1
  92 %define mul_array arg2
  93 %define src    arg3
  94 %define dest1  arg4
  95 %define ptr    arg5
  96 %define vec_i  tmp2
  97 %define pos    return
  98
  99
 100 %ifndef EC_ALIGNED_ADDR
 101 ;;; Use Un-aligned load/store
 102  %define XLDR vmovdqu8
 103  %define XSTR vmovdqu8
 104 %else
 105 ;;; Use Non-temporal load/stor
 106  %ifdef NO_NT_LDST
 107   %define XLDR vmovdqa
 108   %define XSTR vmovdqa
 109  %else
 110   %define XLDR vmovntdqa
 111   %define XSTR vmovntdq
 112  %endif
 113 %endif
 114
 115 %define xmask0f   zmm5
 116 %define xgft1_lo  zmm4
 117 %define xgft1_loy ymm4
 118 %define xgft1_hi  zmm3
 119 %define x0        zmm0
 120 %define xgft1_loy ymm4
 121 %define x0y       ymm0
 122 %define xtmpa     zmm1
 123 %define xp1       zmm2
 124 %define xp1y      ymm2
 125
 126 default rel
 127 [bits 64]
 128 section .text
 129
 130 align 16
 131 mk_global gf_vect_dot_prod_avx512, function
 132 func(gf_vect_dot_prod_avx512)
 133         FUNC_SAVE
 134         xor     pos, pos
 135         mov     tmp, 0x0f
 136         vpbroadcastb xmask0f, tmp       ;Construct mask 0x0f0f0f...
 137         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 138         sub     len, 64
 139         jl      .len_lt_64
 140
 141 .loop64:
 142         vpxorq  xp1, xp1, xp1
 143         mov     tmp, mul_array
 144         xor     vec_i, vec_i
 145
 146 .next_vect:
 147         mov     ptr, [src+vec_i]
 148         XLDR    x0, [ptr+pos]           ;Get next source vector
 149         add     vec_i, PS
 150
 151         vpandq  xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
 152         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
 153         vpandq  x0, x0, xmask0f         ;Mask high src nibble in bits 4-0
 154
 155         vmovdqu8 xgft1_loy, [tmp]               ;Load array Ax{00}..{0f}, Ax{00}..{f0}
 156         add     tmp, 32
 157
 158         vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
 159         vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
 160
 161         vpshufb xgft1_hi, xgft1_hi, x0          ;Lookup mul table of high nibble
 162         vpshufb xgft1_lo, xgft1_lo, xtmpa       ;Lookup mul table of low nibble
 163         vpxorq  xgft1_hi, xgft1_hi, xgft1_lo    ;GF add high and low partials
 164         vpxorq  xp1, xp1, xgft1_hi              ;xp1 += partial
 165
 166         cmp     vec_i, vec
 167         jl      .next_vect
 168
 169         XSTR    [dest1+pos], xp1
 170
 171         add     pos, 64                 ;Loop on 64 bytes at a time
 172         cmp     pos, len
 173         jle     .loop64
 174
 175         lea     tmp, [len + 64]
 176         cmp     pos, tmp
 177         je      .return_pass
 178
 179         ;; Tail len
 180         mov     pos, len        ;Overlapped offset length-64
 181         jmp     .loop64         ;Do one more overlap pass
 182
 183
 184 .len_lt_64:                     ; 32-byte version
 185         add     len, 32
 186         jl      .return_fail
 187
 188 .loop32:
 189         vpxorq  xp1, xp1, xp1
 190         mov     tmp, mul_array
 191         xor     vec_i, vec_i
 192
 193 .next_vect2:
 194         mov     ptr, [src+vec_i]
 195         XLDR    x0y, [ptr+pos]          ;Get next source vector 32B
 196         add     vec_i, PS
 197         vpsraw  xtmpa, x0, 4            ;Shift to put high nibble into bits 4-0
 198         vshufi64x2 x0, x0, xtmpa, 0x44  ;put x0 = xl:xh
 199         vpandq  x0, x0, xmask0f         ;Mask bits 4-0
 200         vmovdqu8 xgft1_loy, [tmp]       ;Load array Ax{00}..{0f}, Ax{00}..{f0}
 201         add     tmp, 32
 202         vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x50   ;=AlAh:AlAh
 203         vpshufb    xgft1_lo, xgft1_lo, x0               ;Lookup mul table
 204         vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x0e   ;=xh:
 205         vpxorq  xgft1_hi, xgft1_hi, xgft1_lo    ;GF add high and low partials
 206         vpxorq  xp1, xp1, xgft1_hi              ;xp1 += partial
 207         cmp     vec_i, vec
 208         jl      .next_vect2
 209
 210         XSTR    [dest1+pos], xp1y
 211         add     pos, 32                 ;Loop on 32 bytes at a time
 212         cmp     pos, len
 213         jle     .loop32
 214
 215         lea     tmp, [len + 32]
 216         cmp     pos, tmp
 217         je      .return_pass
 218
 219         ;; Tail len
 220         mov     pos, len        ;Overlapped offset length-32
 221         jmp     .loop32         ;Do one more overlap pass
 222
 223 .return_pass:
 224         mov     return, 0
 225         FUNC_RESTORE
 226         ret
 227
 228 .return_fail:
 229         mov     return, 1
 230         FUNC_RESTORE
 231         ret
 232
 233 endproc_frame
 234
 235 %else
 236 %ifidn __OUTPUT_FORMAT__, win64
 237 global no_gf_vect_dot_prod_avx512
 238 no_gf_vect_dot_prod_avx512:
 239 %endif
 240 %endif  ; ifdef HAVE_AS_KNOWS_AVX512