;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
;;;
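;;; Editor's note (a sketch inferred from the erasure-code conventions used
;;; elsewhere in ISA-L, not authoritative documentation): this routine
;;; performs the multiply-and-add step of Reed-Solomon encoding, i.e.
;;; dest[i] ^= C * src[i] over GF(2^8) for one source vector. A plausible
;;; reading of the arguments:
;;;   len       - buffer length in bytes
;;;   vec       - number of source vectors in the set
;;;   vec_i     - index of this source vector; selects its 32-byte table
;;;   mul_array - GF(2^8) multiply tables, 32 bytes per source vector
;;;   src       - source buffer
;;;   dest      - destination/parity buffer, updated in place
;;;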
%include "reg_sizes.asm"
%ifdef HAVE_AS_KNOWS_AVX512
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define return rax
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12		; must be saved and loaded
 %define arg5   r15
 %define tmp    r11
 %define return rax
 %define stack_size 16*3 + 3*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0], xmm6
	vmovdqa	[rsp+16*1], xmm7
	vmovdqa	[rsp+16*2], xmm8
	save_reg	r12, 3*16 + 0*8
	save_reg	r15, 3*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	mov	r12, [rsp + 3*16 + 0*8]
	mov	r15, [rsp + 3*16 + 1*8]
	add	rsp, stack_size
%endmacro
%endif
;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest      arg5
%define pos       return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif
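;;; Editor's note: vmovdqu8 tolerates arbitrary alignment, while the
;;; non-temporal vmovntdqa/vmovntdq pair requires 64-byte aligned addresses
;;; and bypasses the cache, which can pay off when streaming through buffers
;;; much larger than the last-level cache.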
default rel
[bits 64]
section .text

%define x0       zmm0
%define xtmpa    zmm1
%define xtmph    zmm2
%define xtmpl    zmm3
%define xd       zmm4
%define xgft_hi  zmm6
%define xgft_lo  zmm7
%define xgft_loy ymm7
%define xmask0f  zmm8
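;;; Editor's note: the multiply tables and nibble mask live in zmm6-zmm8 for
;;; the whole loop; the low xmm halves of xmm6-xmm15 are callee-saved in the
;;; Win64 ABI, which is why FUNC_SAVE/FUNC_RESTORE spill xmm6-xmm8 above.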
align 16
global gf_vect_mad_avx512:function
func(gf_vect_mad_avx512)
	FUNC_SAVE
	sub	len, 64
	jl	.return_fail		;Fail if buffer shorter than one vector
	xor	pos, pos
	mov	tmp, 0x0f
	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
	sal	vec_i, 5		;Multiply by 32
	vmovdqu8 xgft_loy, [vec_i+mul_array]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
	vshufi64x2 xgft_hi, xgft_lo, xgft_lo, 0x55
	vshufi64x2 xgft_lo, xgft_lo, xgft_lo, 0x00
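;;; Editor's note: each constant C gets a 32-byte table pair in mul_array:
;;; bytes 0-15 hold C*{0x00..0x0f} (low-nibble products) and bytes 16-31
;;; hold C*{0x00,0x10,..,0xf0} (high-nibble products). The vshufi64x2
;;; immediates 0x00 and 0x55 broadcast the low and high 128-bit table into
;;; all four lanes of a zmm register, so vpshufb can look up 64 bytes at once.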
	mov	tmp, -1
	kmovq	k1, tmp			;Begin with all 64 byte lanes enabled

.loop64:
	XLDR	xd, [dest+pos]		;Get next dest vector
	XLDR	x0, [src+pos]		;Get next source vector
	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vpshufb	xtmph {k1}{z}, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl {k1}{z}, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xtmph, xtmph, xtmpl	;GF add high and low partials
	vpxorq	xd, xd, xtmph		;xd += partial

	XSTR	[dest+pos], xd		;Store result
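;;; Editor's note: the two masked lookups implement a GF(2^8) multiply by a
;;; constant C, exploiting C*b = C*(hi*16) ^ C*lo for b split into nibbles.
;;; E.g. with C = 2 (polynomial 0x11d) and b = 0x35: xgft_lo[0x5] = 0x0a,
;;; xgft_hi[0x3] = 0x60, and 0x0a ^ 0x60 = 0x6a = 2*0x35 in GF(2^8).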
	add	pos, 64			;Loop on 64 bytes at a time
	cmp	pos, len
	jle	.loop64

	lea	tmp, [len + 64]
	cmp	pos, tmp		;Exactly a multiple of 64 bytes?
	je	.return_pass
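;;; Editor's note: instead of a scalar remainder loop, the tail is handled
;;; by building a k-mask whose set bits cover only the not-yet-processed
;;; bytes, then re-running one overlapping 64-byte pass; the {z} masked
;;; lookups yield zero partials in the already-done lanes, so the final XOR
;;; stores those dest bytes back unchanged.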
	;; Tail len
	mov	pos, (1 << 63)
	lea	tmp, [len + 64 - 1]
	and	tmp, 63			;(Total length - 1) mod 64
	sarx	pos, pos, tmp		;Smear sign bit down over the tail lanes
	kmovq	k1, pos			;Mask selecting only unprocessed tail bytes
	mov	pos, len	;Overlapped offset length-64
	jmp	.loop64		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

%else
%ifidn __OUTPUT_FORMAT__, win64
global no_gf_vect_mad_avx512
no_gf_vect_mad_avx512:
%endif
%endif  ; ifdef HAVE_AS_KNOWS_AVX512