;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
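;;;
;;; Computes three GF(2^8) dot products over the same vec source buffers
;;; of len bytes each: conceptually dests[k][i] is the XOR over j of
;;; gf_mul(c[k][j], buffs[j][i]), where each constant c[k][j] has been
;;; pre-expanded in g_tbls into a 32-byte pair of 16-entry nibble-lookup
;;; tables (low half, then high half). A rough scalar sketch of the
;;; result (gf_mul and c[][] are illustrative, not symbols in this file):
;;;
;;;	for (k = 0; k < 3; k++)
;;;		for (i = 0; i < len; i++) {
;;;			unsigned char s = 0;
;;;			for (j = 0; j < vec; j++)
;;;				s ^= gf_mul(c[k][j], buffs[j][i]);
;;;			dests[k][i] = s;
;;;		}
;;;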
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r12		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3

 %macro SLDR 2			;; stack load/restore, no-op in 64-bit
 %endmacro
 %define SSTR SLDR
 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %macro SLDR 2			;; stack load/restore, no-op in 64-bit
 %endmacro
 %define SSTR SLDR
 %define stack_size  6*16 + 5*8	; must be an odd multiple of 8: rsp enters at
				; 8 mod 16, so this restores the 16-byte
				; alignment needed by the vmovdqa saves below
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	save_reg	r12,  6*16 + 0*8
	save_reg	r13,  6*16 + 1*8
	save_reg	r14,  6*16 + 2*8
	save_reg	r15,  6*16 + 3*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro
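;; xmm6-xmm15 and r12-r15 are callee-saved in the Microsoft x64 ABI, so
;; every one this routine touches is spilled in the prolog above and
;; restored below; the proc_frame/alloc_stack/save_reg macros also record
;; the matching SEH unwind information for the prolog.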
 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	mov	r12,  [rsp + 6*16 + 0*8]
	mov	r13,  [rsp + 6*16 + 1*8]
	mov	r14,  [rsp + 6*16 + 2*8]
	mov	r15,  [rsp + 6*16 + 3*8]
	add	rsp, stack_size
 %endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
 %define PS 4
 %define LOG_PS 2
 %define func(x) x: endbranch
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]
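;; With PS = 4 this places arg(0) at [ebp + 8] (past the saved ebp and
;; the return address) and the two locals at var(0) = [ebp - 4] and
;; var(1) = [ebp - 8].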
 %define trans	 ecx
 %define trans2	 esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m	 var(0)
 %define tmp4	 trans2
 %define tmp4_m	 var(1)
 %define return	 eax
 %macro SLDR 2			;; stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*2	;2 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro
 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*2	;2 local variables
	pop	ebp
 %endmacro
%endif				; output formats
%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define pos       return
%ifidn PS,4				;32-bit code
 %define len_m   arg0_m
 %define src_m   arg3_m
 %define dest1_m arg4_m
 %define dest2_m tmp3_m
 %define dest3_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
 %define XLDR vmovntdqa
 %define XSTR vmovntdq
%endif
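;; Building with EC_ALIGNED_ADDR asserts that all buffers are 16-byte
;; aligned, allowing the non-temporal forms, which hint the CPU to stream
;; data past the caches; that helps when the buffers are much larger than
;; the cache. The vmovdqu default trades a small penalty for working at
;; any address.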
%ifidn PS,8				; 64-bit code
 default rel
 [bits 64]
%endif

section .text
%ifidn PS,8				;64-bit code
 %define xmask0f  xmm11
 %define xgft1_lo xmm10
 %define xgft1_hi xmm9
 %define xgft2_lo xmm8
 %define xgft2_hi xmm7
 %define xgft3_lo xmm6
 %define xgft3_hi xmm5
 %define x0       xmm0
 %define xtmpa    xmm1
 %define xp1      xmm2
 %define xp2      xmm3
 %define xp3      xmm4
%else					;32-bit code
 %define xmask0f  xmm7
 %define xgft1_lo xmm6
 %define xgft1_hi xmm5
 %define xgft2_lo xgft1_lo
 %define xgft2_hi xgft1_hi
 %define xgft3_lo xgft1_lo
 %define xgft3_hi xgft1_hi
 %define x0       xmm0
 %define xtmpa    xmm1
 %define xp1      xmm2
 %define xp2      xmm3
 %define xp3      xmm4
%endif
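;; 32-bit builds see only xmm0-xmm7, so the B and C table pairs alias the
;; A pair and are reloaded from g_tbls inside the loop (the PS=4 blocks
;; below); 64-bit builds keep all three table pairs resident across the
;; whole inner loop.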
mk_global gf_3vect_dot_prod_avx, function
func(gf_3vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of three pointers
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1
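;; g_tbls holds vec 32-byte tables for output 1, then vec for output 2,
;; then vec for output 3. vec was pre-scaled by PS above, so while tmp
;; walks the output-1 tables 32 bytes per source, tmp+vec*(32/PS) and
;; tmp+vec*(64/PS) land on the matching output-2 and output-3 tables.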
.loop16:
	mov	tmp, mul_array
	xor	vec_i, vec_i
	vpxor	xp1, xp1		;zero the three accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;next source buffer
	add	vec_i, PS

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
 %ifidn PS,8				; 64-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	add	tmp, 32
 %endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 3-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 3-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 3-0
	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial
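;; The GF(2^8) multiply by a constant c works nibble-by-nibble: with
;; b = (hi << 4) | lo, multiplication distributes over XOR, so
;; c*b = table_lo[lo] ^ table_hi[hi], and vpshufb performs all sixteen
;; byte-lane lookups at once. Per-byte sketch (names illustrative):
;;
;;	prod = tbl_lo[b & 0x0f] ^ tbl_hi[b >> 4];
;;	xp1 ^= prod;			/* accumulate dot product */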
 %ifidn PS,4				; 32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
 %endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial
 %ifidn PS,4				; 32-bit code
	sal	vec, 1			;double vec so vec*(32/PS) reaches the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1
	add	tmp, 32
 %endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial
	cmp	vec_i, vec
	jl	.next_vect		;accumulate every source before storing

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass
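;; Tail handling: if the length is not a multiple of 16, back pos up to
;; len (length-16) and run one more full 16-byte pass over the end of
;; every buffer. Recomputing the overlap is safe: each 16-byte output
;; chunk depends only on the same offsets of the inputs, so the
;; overlapping bytes are rewritten with identical values.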
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame
section .data

align 16
mask0f:	dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
;;;       func                  core, ver, snum
slversion gf_3vect_dot_prod_avx, 02,  05,  0192