ceph/src/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43
  44  %define tmp   r11
  45  %define tmp2  r10
  46  %define tmp3  r9
  47  %define tmp4  r12              ; must be saved and restored
  48  %define return rax
  49  %macro  SLDR 2
  50  %endmacro
  51  %define SSTR SLDR
  52  %define PS 8
  53  %define LOG_PS 3
  54
  55  %define func(x) x:
  56  %macro FUNC_SAVE 0
  57         push    r12
  58  %endmacro
  59  %macro FUNC_RESTORE 0
  60         pop     r12
  61  %endmacro
  62 %endif
  63
  64 %ifidn __OUTPUT_FORMAT__, win64
  65  %define arg0   rcx
  66  %define arg1   rdx
  67  %define arg2   r8
  68  %define arg3   r9
  69
  70  %define arg4   r12             ; must be saved, loaded and restored
  71  %define tmp    r11
  72  %define tmp2   r10
  73  %define tmp3   r13             ; must be saved and restored
  74  %define tmp4   r14             ; must be saved and restored
  75  %define return rax
  76  %macro  SLDR 2
  77  %endmacro
  78  %define SSTR SLDR
  79  %define PS     8
  80  %define LOG_PS 3
  81  %define stack_size  3*16 + 3*8         ; must be an odd multiple of 8
  82  %define arg(x)      [rsp + stack_size + PS + PS*x]
  83
  84  %define func(x) proc_frame x
  85  %macro FUNC_SAVE 0
  86         alloc_stack     stack_size
  87         save_xmm128     xmm6, 0*16
  88         save_xmm128     xmm7, 1*16
  89         save_xmm128     xmm8, 2*16
  90         save_reg        r12,  3*16 + 0*8
  91         save_reg        r13,  3*16 + 1*8
  92         save_reg        r14,  3*16 + 2*8
  93         end_prolog
  94         mov     arg4, arg(4)
  95  %endmacro
  96
  97  %macro FUNC_RESTORE 0
  98         vmovdqa xmm6, [rsp + 0*16]
  99         vmovdqa xmm7, [rsp + 1*16]
 100         vmovdqa xmm8, [rsp + 2*16]
 101         mov     r12,  [rsp + 3*16 + 0*8]
 102         mov     r13,  [rsp + 3*16 + 1*8]
 103         mov     r14,  [rsp + 3*16 + 2*8]
 104         add     rsp, stack_size
 105  %endmacro
 106 %endif
 107
 108 %ifidn __OUTPUT_FORMAT__, elf32
 109
 110 ;;;================== High Address;
 111 ;;;     arg4
 112 ;;;     arg3
 113 ;;;     arg2
 114 ;;;     arg1
 115 ;;;     arg0
 116 ;;;     return
 117 ;;;<================= esp of caller
 118 ;;;     ebp
 119 ;;;<================= ebp = esp
 120 ;;;     var0
 121 ;;;     esi
 122 ;;;     edi
 123 ;;;     ebx
 124 ;;;<================= esp of callee
 125 ;;;
 126 ;;;================== Low Address;
 127
 128  %define PS 4
 129  %define LOG_PS 2
 130  %define func(x) x:
 131  %define arg(x) [ebp + PS*2 + PS*x]
 132  %define var(x) [ebp - PS - PS*x]
 133
 134  %define trans   ecx
 135  %define trans2  esi
 136  %define arg0    trans          ;trans and trans2 are for the variables in stack
 137  %define arg0_m  arg(0)
 138  %define arg1    ebx
 139  %define arg2    arg2_m
 140  %define arg2_m  arg(2)
 141  %define arg3    trans
 142  %define arg3_m  arg(3)
 143  %define arg4    trans
 144  %define arg4_m  arg(4)
 145  %define tmp     edx
 146  %define tmp2    edi
 147  %define tmp3    trans2
 148  %define tmp4    trans2
 149  %define tmp4_m  var(0)
 150  %define return  eax
 151  %macro SLDR 2  ;; stack load/restore
 152         mov %1, %2
 153  %endmacro
 154  %define SSTR SLDR
 155
 156  %macro FUNC_SAVE 0
 157         push    ebp
 158         mov     ebp, esp
 159         sub     esp, PS*1       ;1 local variable
 160         push    esi
 161         push    edi
 162         push    ebx
 163         mov     arg1, arg(1)
 164  %endmacro
 165
 166  %macro FUNC_RESTORE 0
 167         pop     ebx
 168         pop     edi
 169         pop     esi
 170         add     esp, PS*1       ;1 local variable
 171         pop     ebp
 172  %endmacro
 173
 174 %endif  ; output formats
 175
 176 %define len   arg0
 177 %define vec   arg1
 178 %define mul_array arg2
 179 %define src   arg3
 180 %define dest1  arg4
 181
 182 %define vec_i tmp2
 183 %define ptr   tmp3
 184 %define dest2 tmp4
 185 %define pos   return
 186
 187  %ifidn PS,4                            ;32-bit code
 188         %define  len_m  arg0_m
 189         %define  src_m  arg3_m
 190         %define  dest1_m arg4_m
 191         %define  dest2_m tmp4_m
 192  %endif
 193
 194 %ifndef EC_ALIGNED_ADDR
 195 ;;; Use Un-aligned load/store
 196  %define XLDR vmovdqu
 197  %define XSTR vmovdqu
 198 %else
 199 ;;; Use Non-temporal load/stor
 200  %ifdef NO_NT_LDST
 201   %define XLDR vmovdqa
 202   %define XSTR vmovdqa
 203  %else
 204   %define XLDR vmovntdqa
 205   %define XSTR vmovntdq
 206  %endif
 207 %endif
 208
 209 %ifidn PS,8                     ; 64-bit code
 210  default rel
 211   [bits 64]
 212 %endif
 213
 214 section .text
 215
 216 %ifidn PS,8                     ;64-bit code
 217  %define xmask0f   xmm8
 218  %define xgft1_lo  xmm7
 219  %define xgft1_hi  xmm6
 220  %define xgft2_lo  xmm5
 221  %define xgft2_hi  xmm4
 222
 223  %define x0     xmm0
 224  %define xtmpa  xmm1
 225  %define xp1    xmm2
 226  %define xp2    xmm3
 227 %else                           ;32-bit code
 228  %define xmask0f   xmm4
 229  %define xgft1_lo  xmm7
 230  %define xgft1_hi  xmm6
 231  %define xgft2_lo  xgft1_lo
 232  %define xgft2_hi  xgft1_hi
 233
 234  %define x0     xmm0
 235  %define xtmpa  xmm1
 236  %define xp1    xmm2
 237  %define xp2    xmm3
 238 %endif
 239
 240 align 16
 241 global gf_2vect_dot_prod_avx:function
 242
 243 func(gf_2vect_dot_prod_avx)
 244         FUNC_SAVE
 245         SLDR    len, len_m
 246         sub     len, 16
 247         SSTR    len_m, len
 248         jl      .return_fail
 249         xor     pos, pos
 250         vmovdqa xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 251         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 252         SLDR    dest1, dest1_m
 253         mov     dest2, [dest1+PS]
 254         SSTR    dest2_m, dest2
 255         mov     dest1, [dest1]
 256         SSTR    dest1_m, dest1
 257
 258 .loop16:
 259         vpxor   xp1, xp1
 260         vpxor   xp2, xp2
 261         mov     tmp, mul_array
 262         xor     vec_i, vec_i
 263
 264 .next_vect:
 265         SLDR    src, src_m
 266         mov     ptr, [src+vec_i]
 267
 268         vmovdqu xgft1_lo, [tmp]         ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 269         vmovdqu xgft1_hi, [tmp+16]      ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 270  %ifidn PS,8                            ; 64-bit code
 271         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 272         vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16]  ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 273         add     tmp, 32
 274         add     vec_i, PS
 275  %endif
 276         XLDR    x0, [ptr+pos]           ;Get next source vector
 277
 278         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
 279         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
 280         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 4-0
 281
 282         vpshufb xgft1_hi, x0            ;Lookup mul table of high nibble
 283         vpshufb xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 284         vpxor   xgft1_hi, xgft1_lo      ;GF add high and low partials
 285         vpxor   xp1, xgft1_hi           ;xp1 += partial
 286
 287  %ifidn PS,4                            ; 32-bit code
 288         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 289         vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16]  ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 290         add     tmp, 32
 291         add     vec_i, PS
 292  %endif
 293         vpshufb xgft2_hi, x0            ;Lookup mul table of high nibble
 294         vpshufb xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 295         vpxor   xgft2_hi, xgft2_lo      ;GF add high and low partials
 296         vpxor   xp2, xgft2_hi           ;xp2 += partial
 297
 298         cmp     vec_i, vec
 299         jl      .next_vect
 300
 301         SLDR    dest1, dest1_m
 302         SLDR    dest2, dest2_m
 303         XSTR    [dest1+pos], xp1
 304         XSTR    [dest2+pos], xp2
 305
 306         SLDR    len, len_m
 307         add     pos, 16                 ;Loop on 16 bytes at a time
 308         cmp     pos, len
 309         jle     .loop16
 310
 311         lea     tmp, [len + 16]
 312         cmp     pos, tmp
 313         je      .return_pass
 314
 315         ;; Tail len
 316         mov     pos, len        ;Overlapped offset length-16
 317         jmp     .loop16         ;Do one more overlap pass
 318
 319 .return_pass:
 320         mov     return, 0
 321         FUNC_RESTORE
 322         ret
 323
 324 .return_fail:
 325         mov     return, 1
 326         FUNC_RESTORE
 327         ret
 328
 329 endproc_frame
 330
 331 section .data
 332
 333 align 16
 334 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
 335
 336 ;;;       func                  core, ver, snum
 337 slversion gf_2vect_dot_prod_avx, 02,  05,  0191