ceph/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_vect_mul_avx(len, mul_array, src, dest)
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
      ; ELF64 build: the first arguments arrive in rdi, rsi, rdx, rcx, r8, r9
      ; (System V AMD64 order) and the result is returned in rax.
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43  %define tmp   r11
  44  %define return rax
      ; func(x) is just a label here. The XMM registers are all volatile
      ; under this ABI, so FUNC_SAVE and FUNC_RESTORE expand to nothing.
  45  %define func(x) x:
  46  %define FUNC_SAVE
  47  %define FUNC_RESTORE
  48
  49 %elifidn __OUTPUT_FORMAT__, win64
      ; Win64 build: the first four arguments arrive in rcx, rdx, r8, r9 and
      ; the result is returned in rax. xmm6-xmm15 are callee-saved under this
      ; ABI; this file uses xmm6, xmm7, xmm13, xmm14 and xmm15 of those, so
      ; exactly these five are spilled and reloaded.
  50  %define arg0  rcx
  51  %define arg1  rdx
  52  %define arg2  r8
  53  %define arg3  r9
  54  %define return rax
      ; Five 16-byte save slots plus 8 bytes of padding. rsp is 8 mod 16 on
      ; entry, so lowering it by an odd multiple of 8 makes the save area
      ; 16-byte aligned, which the vmovdqa reloads below require.
  55  %define stack_size  5*16 + 8   ; must be an odd multiple of 8
      ; proc_frame / alloc_stack / save_xmm128 / end_prolog are not defined in
      ; this file -- presumably the assembler's Win64 unwind (SEH) directives;
      ; alloc_stack is expected to lower rsp by stack_size (FUNC_RESTORE adds
      ; the same amount back).
  56  %define func(x) proc_frame x
      ; Prologue: reserve the save area and spill the callee-saved XMM
      ; registers used by this file.
  57  %macro FUNC_SAVE 0
  58         alloc_stack     stack_size
  59         save_xmm128     xmm6, 0*16
  60         save_xmm128     xmm7, 1*16
  61         save_xmm128     xmm13, 2*16
  62         save_xmm128     xmm14, 3*16
  63         save_xmm128     xmm15, 4*16
  64         end_prolog
  65  %endmacro
  66
      ; Epilogue: reload the registers from the slots written by FUNC_SAVE
      ; (same register/offset pairs), then release the save area. Must be
      ; expanded on every path that leaves the function.
  67  %macro FUNC_RESTORE 0
  68         vmovdqa xmm6, [rsp + 0*16]
  69         vmovdqa xmm7, [rsp + 1*16]
  70         vmovdqa xmm13, [rsp + 2*16]
  71         vmovdqa xmm14, [rsp + 3*16]
  72         vmovdqa xmm15, [rsp + 4*16]
  73         add     rsp, stack_size
  74  %endmacro
  75
  76 %endif
  77
  78
  79 %define len   arg0              ; 1st argument: byte count (compared against the byte offset pos)
  80 %define mul_array arg1          ; 2nd argument: pointer to the two 16-byte multiply tables (+0, +16)
  81 %define src   arg2              ; 3rd argument: source buffer, read at [src+pos]
  82 %define dest  arg3              ; 4th argument: destination buffer, written at [dest+pos-32]
  83 %define pos   return            ; running byte offset; lives in the return register (rax)
  84
  85
  86 ;;; Use non-temporal loads/stores unless NO_NT_LDST is defined.
     ;;; XLDR/XSTR are the instructions used for every src load and dest store.
     ;;; Both variants are aligned-access forms, so src and dest are expected
     ;;; to be 16-byte aligned either way.
  87 %ifdef NO_NT_LDST
  88  %define XLDR vmovdqa
  89  %define XSTR vmovdqa
  90 %else
  91  %define XLDR vmovntdqa
  92  %define XSTR vmovntdq
  93 %endif
  94
  95 default rel                     ; static data such as [mask0f] is addressed RIP-relative
  96
  97 [bits 64]
  98 section .text
  99
     ;;; Loop-invariant registers, loaded once before the loop.
 100 %define xmask0f  xmm15          ; 0x0f in every byte; isolates one nibble per byte
 101 %define xgft_lo  xmm14          ; 16-byte table from mul_array+0, indexed by the low nibble
 102 %define xgft_hi  xmm13          ; 16-byte table from mul_array+16, indexed by the high nibble
 103
     ;;; Per-iteration registers. Lane 1 handles [src+pos], lane 2 handles
     ;;; [src+pos+16]; the two lanes are independent of each other.
 104 %define x0     xmm0             ; lane 1: source bytes, then their high nibbles
 105 %define xtmp1a xmm1             ; lane 1: low nibbles
 106 %define xtmp1b xmm2             ; lane 1: high-nibble products, then the final result
 107 %define xtmp1c xmm3             ; lane 1: low-nibble products
 108 %define x1     xmm4             ; lane 2: source bytes, then their high nibbles
 109 %define xtmp2a xmm5             ; lane 2: low nibbles
 110 %define xtmp2b xmm6             ; lane 2: high-nibble products, then the final result
 111 %define xtmp2c xmm7             ; lane 2: low-nibble products
 112
 113 align 16
     ;;;
     ;;; gf_vect_mul_avx(len, mul_array, src, dest)
     ;;;
     ;;; Multiplies every byte of src by one GF(2^8) constant and writes the
     ;;; products to dest. The constant is supplied as two 16-entry lookup
     ;;; tables (one per source nibble); each product is the XOR of the two
     ;;; table entries selected by the byte's low and high nibble.
     ;;;
     ;;; In:    len       = byte count; must be a non-zero multiple of 32
     ;;;        mul_array = 32 bytes: low-nibble table at +0, high-nibble
     ;;;                    table at +16 (read unaligned)
     ;;;        src, dest = buffers of len bytes, accessed with XLDR/XSTR
     ;;;                    (aligned-access instructions: 16-byte alignment)
     ;;; Out:   return    = 0 on success, 1 if len is rejected; a rejected
     ;;;                    call reads and writes no buffer memory
     ;;; Clobb: pos/return, xmm0-xmm7, xmm13-xmm15, flags (FUNC_SAVE /
     ;;;        FUNC_RESTORE preserve the callee-saved XMM registers on win64)
     ;;;
 114 global gf_vect_mul_avx:function
 115 func(gf_vect_mul_avx)
 116         FUNC_SAVE
             ; The loop below is a do-while that always consumes a full
             ; 32-byte step, so reject any length it cannot cover exactly
             ; before touching src, dest or the tables.
             test    len, len
             jz      return_fail             ;Empty buffer: nothing valid to process
             test    len, 31
             jnz     return_fail             ;Not a multiple of 32: last step would overrun
 117         xor     pos, pos                ;pos = 0
 118         vmovdqa xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 119         vmovdqu xgft_lo, [mul_array]    ;Load array Cx{00}, Cx{01}, Cx{02}, ...
 120         vmovdqu xgft_hi, [mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
 121
 122 loop32:
 123         XLDR    x0, [src+pos]           ;Get next source vector
 124         XLDR    x1, [src+pos+16]        ;Get next source vector + 16B ahead
 125         add     pos, 32                 ;Loop on 32 bytes at a time
 126         cmp     pos, len                ;Flags are consumed by jb below; the vector
                                             ; ops and stores in between do not write them
 127         vpand   xtmp1a, x0, xmask0f     ;Mask low src nibble in bits 3-0
 128         vpand   xtmp2a, x1, xmask0f
 129         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0
 130         vpsraw  x1, x1, 4
 131         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 132         vpand   x1, x1, xmask0f         ; (also drops the bits shifted in from above)
 133         vpshufb xtmp1b, xgft_hi, x0     ;Lookup mul table of high nibble
 134         vpshufb xtmp1c, xgft_lo, xtmp1a ;Lookup mul table of low nibble
 135         vpshufb xtmp2b, xgft_hi, x1     ;Lookup mul table of high nibble
 136         vpshufb xtmp2c, xgft_lo, xtmp2a ;Lookup mul table of low nibble
 137         vpxor   xtmp1b, xtmp1b, xtmp1c  ;GF add high and low partials
 138         vpxor   xtmp2b, xtmp2b, xtmp2c
 139         XSTR    [dest+pos-32], xtmp1b   ;Store result (pos was already advanced)
 140         XSTR    [dest+pos-16], xtmp2b   ;Store +16B result
 141         jb      loop32                  ;Unsigned compare: pos and len are byte counts
 142
 143
 144 return_pass:
 145         FUNC_RESTORE
 146         sub     pos, len                ;pos == len here, so return = 0
 147         ret
 148
 149 return_fail:
 150         FUNC_RESTORE
 151         mov     return, 1
 152         ret
 153
 154 endproc_frame
 155
 156 section .data
 157
 158 align 16
 159
     ;;; 16-byte constant with 0x0f in every byte. It is loaded into xmask0f
     ;;; with an aligned vmovdqa, hence the align 16 above.
 160 mask0f:
 161 ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 162
     ;;; slversion is not defined in this file -- presumably a macro from
     ;;; reg_sizes.asm that records the version fields named in the column
     ;;; header below; confirm against that include.
 163 ;;;       func             core, ver, snum
 164 slversion gf_vect_mul_avx, 01,   03,  0036