ceph/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64      ; 64-bit ELF output: every argument lives in a register
     ;;; Integer argument registers, in call order (arg0..arg4).
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42
     ;;; Scratch registers and the return-value register.
  43  %define tmp   r11
  44  %define tmp2  r10
  45  %define tmp3  r9
  46  %define return rax
     ;;; SLDR/SSTR (stack load / stack store) have an empty body for this
     ;;; format: no argument has to be fetched from or written back to memory.
  47  %macro  SLDR 2
  48  %endmacro
  49  %define SSTR SLDR
     ;;; PS = pointer size in bytes; used to scale indexes into the
     ;;; source-pointer array.
  50  %define PS 8
     ;;; func(x) simply emits the label x.  FUNC_SAVE / FUNC_RESTORE are
     ;;; empty: this format defines no prologue or epilogue.
  51  %define func(x) x:
  52  %define FUNC_SAVE
  53  %define FUNC_RESTORE
  54 %endif
  55
  56 %ifidn __OUTPUT_FORMAT__, win64      ; 64-bit Windows output: four register arguments, the fifth on the stack
  57  %define arg0   rcx
  58  %define arg1   rdx
  59  %define arg2   r8
  60  %define arg3   r9
  61
  62  %define arg4   r12             ; pushed in FUNC_SAVE, then loaded from the stack slot arg(4)
  63  %define tmp    r11
  64  %define tmp2   r10
  65  %define tmp3   rdi             ; pushed in FUNC_SAVE and popped in FUNC_RESTORE
  66  %define return rax
     ;;; SLDR/SSTR (stack load / stack store) have an empty body for this
     ;;; format: once FUNC_SAVE has run, every argument is in a register.
  67  %macro  SLDR 2
  68  %endmacro
  69  %define SSTR SLDR
  70  %define PS 8
     ;;; frame_size = bytes pushed by FUNC_SAVE (two 8-byte registers).
     ;;; arg(x) addresses stack argument slot x relative to rsp *after* those
     ;;; pushes; the extra PS presumably skips the return address.
  71  %define frame_size 2*8
  72  %define arg(x)      [rsp + frame_size + PS + PS*x]
  73
     ;;; proc_frame, rex_push_reg, push_reg and end_prolog are not defined in
     ;;; this file -- presumably assembler/include-provided directives that
     ;;; also record unwind information; confirm against reg_sizes.asm.
  74  %define func(x) proc_frame x
  75  %macro FUNC_SAVE 0
  76         rex_push_reg    r12
  77         push_reg        rdi
  78         end_prolog
  79         mov     arg4, arg(4)            ; fetch the fifth argument from the stack into r12
  80  %endmacro
  81
     ;;; Pops in the reverse order of the pushes in FUNC_SAVE.
  82  %macro FUNC_RESTORE 0
  83         pop     rdi
  84         pop     r12
  85  %endmacro
  86 %endif
  87
  88 %ifidn __OUTPUT_FORMAT__, elf32      ; 32-bit ELF output: all arguments arrive on the stack
  89
     ;;; Stack layout once FUNC_SAVE has run (addresses grow upwards):
  90 ;;;================== High Address;
  91 ;;;     arg4
  92 ;;;     arg3
  93 ;;;     arg2
  94 ;;;     arg1
  95 ;;;     arg0
  96 ;;;     return
  97 ;;;<================= esp of caller
  98 ;;;     ebp
  99 ;;;<================= ebp = esp
 100 ;;;     esi
 101 ;;;     edi
 102 ;;;     ebx
 103 ;;;<================= esp of callee
 104 ;;;
 105 ;;;================== Low Address;
 106
     ;;; PS = pointer size in bytes.  LOG_PS is not referenced in the code
     ;;; visible in this file.
 107  %define PS 4
 108  %define LOG_PS 2
 109  %define func(x) x:
     ;;; arg(x): stack slot of argument x relative to ebp; PS*2 skips the
     ;;; saved ebp and the return address shown in the diagram above.
 110  %define arg(x) [ebp + PS*2 + PS*x]
 111
     ;;; arg0, arg1 and arg4 all alias the single register `trans` (ecx), so
     ;;; each must be reloaded from its stack slot (the *_m name) with SLDR
     ;;; right before it is used.  arg2 is used directly as a memory operand,
     ;;; and arg3 is kept in ebx (loaded once by FUNC_SAVE).
 112  %define trans   ecx                    ;shared transit register for arguments that stay on the stack
 113  %define arg0    trans
 114  %define arg0_m  arg(0)
 115  %define arg1    trans
 116  %define arg1_m  arg(1)
 117  %define arg2    arg2_m
 118  %define arg2_m  arg(2)
 119  %define arg3    ebx
 120  %define arg4    trans
 121  %define arg4_m  arg(4)
 122  %define tmp     edx
 123  %define tmp2    edi
 124  %define tmp3    esi
 125  %define return  eax
 126  %macro SLDR 2  ;; stack load/restore: a plain mov between a register and a stack slot
 127         mov %1, %2
 128  %endmacro
 129  %define SSTR SLDR
 130
     ;;; Prologue: set up the ebp frame, save esi/edi/ebx, then load arg3
     ;;; into ebx.
 131  %macro FUNC_SAVE 0
 132         push    ebp
 133         mov     ebp, esp
 134         push    esi
 135         push    edi
 136         push    ebx
 137         mov     arg3, arg(3)
 138  %endmacro
 139
     ;;; Epilogue: pops in the reverse order of FUNC_SAVE, then drops the frame.
 140  %macro FUNC_RESTORE 0
 141         pop     ebx
 142         pop     edi
 143         pop     esi
 144         mov     esp, ebp
 145         pop     ebp
 146  %endmacro
 147
 148 %endif  ; output formats
 149
 150 %define len   arg0                 ; byte length to process
     ;;; Symbolic names for the five arguments, in call order.
 151 %define vec   arg1                 ; number of source pointers read from src[]
 152 %define mul_array arg2             ; lookup tables, consumed 32 bytes per source
 153 %define src   arg3                 ; array of source buffer pointers
 154 %define dest  arg4                 ; destination buffer
 155
 156 %define vec_i tmp2                 ; index of the current source
 157 %define ptr   tmp3                 ; pointer to the current source buffer
 158 %define pos   return               ; byte offset into the buffers; shares the return register
 159
     ;;; Stack-slot names used with SLDR/SSTR.  They are only defined for the
     ;;; 32-bit build; the 64-bit SLDR/SSTR macros have empty bodies.
 160  %ifidn PS,4                            ;32-bit code
 161         %define  vec_m  arg1_m
 162         %define  len_m  arg0_m
 163         %define  dest_m arg4_m
 164  %endif
 165
 166 %ifndef EC_ALIGNED_ADDR              ; selects the instructions behind XLDR (source load) and XSTR (dest store)
 167 ;;; Use unaligned load/store
 168  %define XLDR vmovdqu
 169  %define XSTR vmovdqu
 170 %else
 171 ;;; EC_ALIGNED_ADDR is defined: use aligned load/store if NO_NT_LDST is defined, otherwise non-temporal load/store
 172  %ifdef NO_NT_LDST
 173   %define XLDR vmovdqa
 174   %define XSTR vmovdqa
 175  %else
 176   %define XLDR vmovntdqa
 177   %define XSTR vmovntdq
 178  %endif
 179 %endif
 180
 181 %ifidn PS,8                     ; 64-bit code
 182  default rel                    ; memory references without a base register (e.g. [mask0f]) become RIP-relative
 183   [bits 64]
 184 %endif
 185
 186 section .text
 187
     ;;; XMM register roles used by gf_vect_dot_prod_avx.
 188 %define xmask0f  xmm5            ; 0x0f in every byte (loaded from mask0f)
 189 %define xgft_lo  xmm4            ; table indexed by the low nibble, then its lookup result
 190 %define xgft_hi  xmm3            ; table indexed by the high nibble, then the combined partial
 191
 192 %define x0     xmm0              ; 16 source bytes, later their high nibbles
 193 %define xtmpa  xmm1              ; low nibbles of the source bytes
 194 %define xp     xmm2              ; running xor accumulator for one 16-byte output block
 195
 196 align 16
     ;;; ------------------------------------------------------------------
     ;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest)
     ;;;   (named len, vec, mul_array, src and dest in the code below)
     ;;;
     ;;; Works on 16 bytes at a time.  For every 16-byte offset `pos`, and for
     ;;; each of the `vec` source pointers read from src[], the source bytes
     ;;; are split into low and high nibbles, each nibble selects a byte
     ;;; (vpshufb) from that source's pair of 16-byte tables in mul_array
     ;;; (32 bytes per source), and the results of all sources are xor-ed
     ;;; together and stored to dest+pos.
     ;;;
     ;;; Return value: 0 on success, 1 when len - 16 is negative (signed).
     ;;;
     ;;; When len is not a multiple of 16, one extra pass runs at offset
     ;;; len-16, overlapping bytes that were already written.
     ;;;
     ;;; NOTE(review): the source loop tests vec only after its first
     ;;; iteration, so vec >= 1 appears to be assumed -- confirm with callers.
     ;;; NOTE(review): the overlapped tail offset is not 16-byte aligned in
     ;;; general; with EC_ALIGNED_ADDR defined, XLDR/XSTR are aligned-only
     ;;; instructions -- presumably len is then a multiple of 16; confirm.
     ;;; ------------------------------------------------------------------
 197 global gf_vect_dot_prod_avx:function
 198 func(gf_vect_dot_prod_avx)
 199         FUNC_SAVE
     ;;; From here on `len` holds (length - 16), i.e. the last valid block
     ;;; offset.  The 32-bit build writes it back to its stack slot because
     ;;; the register is shared with other arguments.  SSTR expands to nothing
     ;;; or to a mov, neither of which alters flags, so the jl below still
     ;;; tests the result of the sub.
 200         SLDR    len, len_m
 201         sub     len, 16
 202         SSTR    len_m, len
 203         jl      .return_fail
 204         xor     pos, pos
 205         vmovdqa xmask0f, [mask0f]       ;Load mask of lower nibble in each byte (mask0f is 16-byte aligned)
 206
     ;;; Outer loop: one 16-byte output block per iteration.
 207 .loop16:
 208         vpxor   xp, xp                  ;Clear the accumulator
 209         mov     tmp, mul_array          ;tmp walks the tables, 32 bytes per source
 210         xor     vec_i, vec_i
 211
     ;;; Inner loop: accumulate the contribution of source vec_i.
 212 .next_vect:
 213
 214         mov     ptr, [src+vec_i*PS]     ;ptr = src[vec_i]
 215         vmovdqu xgft_lo, [tmp]          ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 216         vmovdqu xgft_hi, [tmp+16]       ;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 217         XLDR    x0, [ptr+pos]           ;Get next source vector
 218
 219         add     tmp, 32                 ;Advance to the next source's tables
 220         add     vec_i, 1
 221
 222         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 3-0
 223         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0 (stray upper bits are masked next)
 224         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 225
 226         vpshufb xgft_hi, xgft_hi, x0    ;Lookup mul table of high nibble
 227         vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
 228         vpxor   xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
 229         vpxor   xp, xp, xgft_hi         ;xp += partial
 230
     ;;; Reload vec (32-bit build only) and loop while vec_i < vec (signed).
 231         SLDR    vec, vec_m
 232         cmp     vec_i, vec
 233         jl      .next_vect
 234
     ;;; All sources accumulated: store the 16-byte result block.
 235         SLDR    dest, dest_m
 236         XSTR    [dest+pos], xp
 237
 238         add     pos, 16                 ;Loop on 16 bytes at a time
 239         SLDR    len, len_m
 240         cmp     pos, len                ;len is (length - 16) here
 241         jle     .loop16
 242
 243         lea     tmp, [len + 16]         ;tmp = original length
 244         cmp     pos, tmp
 245         je      .return_pass            ;pos == length: everything has been written
 246
 247         ;; Tail len
 248         mov     pos, len        ;Overlapped offset length-16
 249         jmp     .loop16         ;Do one more overlap pass
 250
 251 .return_pass:
 252         mov     return, 0
 253         FUNC_RESTORE
 254         ret
 255
 256 .return_fail:
 257         mov     return, 1
 258         FUNC_RESTORE
 259         ret
 260
     ;;; Closes the proc_frame opened by func() in the win64 build; not
     ;;; defined in this file for the other output formats.
 261 endproc_frame
 262
 263 section .data
 264
 265 align 16                            ; mask0f is read with vmovdqa, which needs 16-byte alignment
 266
     ;;; 128-bit constant with 0x0f in every byte: selects the low nibble of
     ;;; each byte.  ddq is not defined in this file -- presumably a 128-bit
     ;;; data directive or macro supplied by the assembler or reg_sizes.asm.
 267 mask0f:
 268 ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 269
     ;;; slversion is not defined in this file -- presumably a version-tagging
     ;;; macro from reg_sizes.asm; its arguments follow the column header below.
 270 ;;;       func                 core, ver, snum
 271 slversion gf_vect_dot_prod_avx, 02,  05,  0061