ceph/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
     ;; 64-bit ELF register map.  arg0-arg4 are the registers the five
     ;; function arguments are read from; arg5 (r9) is only used as a
     ;; scratch pointer (it is aliased to `ptr` further down).
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43
  44  %define tmp   r11
  45  %define tmp.w r11d
  46  %define tmp.b r11b
  47  %define tmp2  r10
  48  %define tmp3  r13              ; must be saved and restored
  49  %define tmp4  r12              ; must be saved and restored
  50  %define tmp5  r14              ; must be saved and restored
  51  %define tmp6  r15              ; must be saved and restored
  52  %define return rax
     ;; SLDR/SSTR are the "stack load"/"stack store" hooks used by the
     ;; function body.  The macro body is empty here, so every
     ;; "SLDR x, x_m" / "SSTR x_m, x" assembles to nothing in this build.
  53  %macro  SLDR   2
  54  %endmacro
  55  %define SSTR   SLDR
     ;; PS is the size in bytes of one entry of the pointer arrays the
     ;; function walks; LOG_PS is log2(PS), used as a shift count.
  56  %define PS     8
  57  %define LOG_PS 3
  58
     ;; func(x) just emits the label x.
  59  %define func(x) x:
     ;; FUNC_SAVE pushes r12-r15 (the registers behind tmp3-tmp6);
     ;; FUNC_RESTORE pops them in the reverse order.
  60  %macro FUNC_SAVE 0
  61         push    r12
  62         push    r13
  63         push    r14
  64         push    r15
  65  %endmacro
  66  %macro FUNC_RESTORE 0
  67         pop     r15
  68         pop     r14
  69         pop     r13
  70         pop     r12
  71  %endmacro
  72 %endif
  73
  74 %ifidn __OUTPUT_FORMAT__, win64
     ;; 64-bit Windows register map.  arg0-arg3 arrive in rcx, rdx, r8, r9;
     ;; the fifth argument (arg4) is loaded from the stack into r12 at the
     ;; end of FUNC_SAVE.  arg5 (r15) is only used as a scratch pointer.
  75  %define arg0   rcx
  76  %define arg1   rdx
  77  %define arg2   r8
  78  %define arg3   r9
  79
  80  %define arg4   r12             ; must be saved, loaded and restored
  81  %define arg5   r15             ; must be saved and restored
  82  %define tmp    r11
  83  %define tmp.w  r11d
  84  %define tmp.b  r11b
  85  %define tmp2   r10
  86  %define tmp3   r13             ; must be saved and restored
  87  %define tmp4   r14             ; must be saved and restored
  88  %define tmp5   rdi             ; must be saved and restored
  89  %define tmp6   rsi             ; must be saved and restored
  90  %define return rax
     ;; SLDR/SSTR have an empty body here, so every "SLDR x, x_m" /
     ;; "SSTR x_m, x" in the function assembles to nothing in this build.
  91  %macro  SLDR   2
  92  %endmacro
  93  %define SSTR   SLDR
     ;; PS is the size in bytes of one pointer-array entry; LOG_PS = log2(PS).
  94  %define PS     8
  95  %define LOG_PS 3
     ;; Frame layout (offsets from rsp after FUNC_SAVE):
     ;;   [0*16 .. 9*16)        xmm6-xmm14, one 16-byte slot each
     ;;   [9*16 .. 9*16 + 6*8)  r12, r13, r14, r15, rdi, rsi
     ;;   last 8-byte slot      not used by FUNC_SAVE/FUNC_RESTORE
     ;; NOTE(review): the "odd multiple of 8" rule presumably keeps rsp
     ;; 16-byte aligned for the vmovdqa saves, given the 8-byte return
     ;; address already on the stack -- confirm.
  96  %define stack_size  9*16 + 7*8         ; must be an odd multiple of 8
     ;; arg(x) addresses the stack slot of argument x once the frame is
     ;; allocated; this assumes alloc_stack only lowers rsp by stack_size
     ;; (FUNC_RESTORE undoes it with "add rsp, stack_size").
  97  %define arg(x)      [rsp + stack_size + PS + PS*x]
  98
     ;; proc_frame, alloc_stack, save_reg and end_prolog are not defined in
     ;; this file (presumably provided by the assembler or reg_sizes.asm).
  99  %define func(x) proc_frame x
     ;; FUNC_SAVE: allocate the frame, save xmm6-xmm14 and the six GPRs
     ;; listed above, then load the fifth argument into arg4.
 100  %macro FUNC_SAVE 0
 101         alloc_stack     stack_size
 102         vmovdqa [rsp + 0*16], xmm6
 103         vmovdqa [rsp + 1*16], xmm7
 104         vmovdqa [rsp + 2*16], xmm8
 105         vmovdqa [rsp + 3*16], xmm9
 106         vmovdqa [rsp + 4*16], xmm10
 107         vmovdqa [rsp + 5*16], xmm11
 108         vmovdqa [rsp + 6*16], xmm12
 109         vmovdqa [rsp + 7*16], xmm13
 110         vmovdqa [rsp + 8*16], xmm14
 111         save_reg        r12,  9*16 + 0*8
 112         save_reg        r13,  9*16 + 1*8
 113         save_reg        r14,  9*16 + 2*8
 114         save_reg        r15,  9*16 + 3*8
 115         save_reg        rdi,  9*16 + 4*8
 116         save_reg        rsi,  9*16 + 5*8
 117         end_prolog
 118         mov     arg4, arg(4)
 119  %endmacro
 120
     ;; FUNC_RESTORE: reload everything FUNC_SAVE stored from the same
     ;; offsets, then release the frame.
 121  %macro FUNC_RESTORE 0
 122         vmovdqa xmm6, [rsp + 0*16]
 123         vmovdqa xmm7, [rsp + 1*16]
 124         vmovdqa xmm8, [rsp + 2*16]
 125         vmovdqa xmm9, [rsp + 3*16]
 126         vmovdqa xmm10, [rsp + 4*16]
 127         vmovdqa xmm11, [rsp + 5*16]
 128         vmovdqa xmm12, [rsp + 6*16]
 129         vmovdqa xmm13, [rsp + 7*16]
 130         vmovdqa xmm14, [rsp + 8*16]
 131         mov     r12,  [rsp + 9*16 + 0*8]
 132         mov     r13,  [rsp + 9*16 + 1*8]
 133         mov     r14,  [rsp + 9*16 + 2*8]
 134         mov     r15,  [rsp + 9*16 + 3*8]
 135         mov     rdi,  [rsp + 9*16 + 4*8]
 136         mov     rsi,  [rsp + 9*16 + 5*8]
 137         add     rsp, stack_size
 138  %endmacro
 139 %endif
 140
 141 %ifidn __OUTPUT_FORMAT__, elf32
 142
     ;;; 32-bit ELF build.  Stack frame after FUNC_SAVE:
 143 ;;;================== High Address;
 144 ;;;     arg4
 145 ;;;     arg3
 146 ;;;     arg2
 147 ;;;     arg1
 148 ;;;     arg0
 149 ;;;     return
 150 ;;;<================= esp of caller
 151 ;;;     ebp
 152 ;;;<================= ebp = esp
 153 ;;;     var0
 154 ;;;     var1
 155 ;;;     var2
 156 ;;;     var3
 157 ;;;     esi
 158 ;;;     edi
 159 ;;;     ebx
 160 ;;;<================= esp of callee
 161 ;;;
 162 ;;;================== Low Address;
 163
     ;; PS is the size in bytes of one pointer-array entry; LOG_PS = log2(PS).
 164  %define PS     4
 165  %define LOG_PS 2
 166  %define func(x) x:
     ;; arg(x): incoming argument x, above the saved ebp and return address.
     ;; var(x): local slot x, below the saved ebp.
 167  %define arg(x) [ebp + PS*2 + PS*x]
 168  %define var(x) [ebp - PS - PS*x]
 169
     ;; There are too few registers to keep every value live, so most
     ;; symbols share one of two transit registers (ecx, esi) and have a
     ;; "_m" stack home that SLDR/SSTR load from / store to.
 170  %define trans   ecx
 171  %define trans2  esi
 172  %define arg0    trans          ;trans and trans2 are for the variables in stack
 173  %define arg0_m  arg(0)
 174  %define arg1    ebx
     ;; arg2 is never held in a register: it is used directly as a memory
     ;; operand.
 175  %define arg2    arg2_m
 176  %define arg2_m  arg(2)
 177  %define arg3    trans
 178  %define arg3_m  arg(3)
 179  %define arg4    trans
 180  %define arg4_m  arg(4)
 181  %define arg5    trans2
 182  %define tmp     edx
 183  %define tmp.w   edx
 184  %define tmp.b   dl
 185  %define tmp2    edi
     ;; tmp3-tmp6 all alias esi; each has its own local slot var(0)-var(3).
 186  %define tmp3    trans2
 187  %define tmp3_m  var(0)
 188  %define tmp4    trans2
 189  %define tmp4_m  var(1)
 190  %define tmp5    trans2
 191  %define tmp5_m  var(2)
 192  %define tmp6    trans2
 193  %define tmp6_m  var(3)
 194  %define return  eax
     ;; SLDR and SSTR are the same plain mov; only the operand order at the
     ;; call site makes it a load (reg, mem) or a store (mem, reg).
 195  %macro SLDR 2                          ;stack load/restore
 196         mov %1, %2
 197  %endmacro
 198  %define SSTR SLDR
 199
     ;; FUNC_SAVE: build the ebp frame, reserve the four var() slots, save
     ;; esi/edi/ebx and load argument 1 into its register (ebx).
 200  %macro FUNC_SAVE 0
 201         push    ebp
 202         mov     ebp, esp
 203         sub     esp, PS*4               ;4 local variables
 204         push    esi
 205         push    edi
 206         push    ebx
 207         mov     arg1, arg(1)
 208  %endmacro
 209
     ;; FUNC_RESTORE: undo FUNC_SAVE in reverse order.
 210  %macro FUNC_RESTORE 0
 211         pop     ebx
 212         pop     edi
 213         pop     esi
 214         add     esp, PS*4               ;4 local variables
 215         pop     ebp
 216  %endmacro
 217
 218 %endif  ; output formats
 219
 220 %define len    arg0
     ;; Symbolic names used by the function body, mapped onto the per-format
     ;; argument/temporary registers defined above.  Argument order follows
     ;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests):
     ;;   len       number of bytes processed in each buffer
     ;;   vec       number of source buffers (scaled by PS inside the function)
     ;;   mul_array base of the multiplication tables (g_tbls)
     ;;   src       array of source buffer pointers (buffs)
     ;;   dest1     dests on entry, then replaced by dests[0]
     ;;   ptr       current source buffer pointer
     ;;   vec_i     byte index into the src pointer array
     ;;   dest2-4   dests[1], dests[2], dests[3]
     ;;   vskip3    vec*96, byte offset from the first to the fourth table set
     ;;   pos       current byte offset within the buffers
 221 %define vec    arg1
 222 %define mul_array arg2
 223 %define src    arg3
 224 %define dest1  arg4
 225 %define ptr    arg5
 226 %define vec_i  tmp2
 227 %define dest2  tmp3
 228 %define dest3  tmp4
 229 %define dest4  tmp5
 230 %define vskip3 tmp6
 231 %define pos    return
 232
     ;; 32-bit only: stack homes for the values that share a transit
     ;; register.  In 64-bit builds these names stay undefined and are only
     ;; ever passed to the empty SLDR/SSTR macros.
 233  %ifidn PS,4                            ;32-bit code
 234         %define  len_m  arg0_m
 235         %define  src_m  arg3_m
 236         %define  dest1_m arg4_m
 237         %define  dest2_m tmp3_m
 238         %define  dest3_m tmp4_m
 239         %define  dest4_m tmp5_m
 240         %define  vskip3_m tmp6_m
 241  %endif
 242
 243 %ifndef EC_ALIGNED_ADDR
 244 ;;; Use Un-aligned load/store
 245  %define XLDR vmovdqu
 246  %define XSTR vmovdqu
 247 %else
 248 ;;; Use aligned load/store: plain if NO_NT_LDST is defined, else non-temporal
 249  %ifdef NO_NT_LDST
 250   %define XLDR vmovdqa
 251   %define XSTR vmovdqa
 252  %else
 253   %define XLDR vmovntdqa
 254   %define XSTR vmovntdq
 255  %endif
 256 %endif
 257
 258 %ifidn PS,8                             ;64-bit code
     ;; 64-bit builds only: make memory references RIP-relative by default
     ;; and select 64-bit instruction encoding.
 259  default rel
 260   [bits 64]
 261 %endif
 262
 263
     ;; Everything up to the "section .data" at the end of the file is code.
 264 section .text
 265
 266 %ifidn PS,8                             ;64-bit code
     ;; 64-bit builds have ymm0-ymm14 available: the 0x0f mask and each of
     ;; the four lo/hi table registers get a register of their own.
 267  %define xmask0f   ymm14
 268  %define xmask0fx  xmm14
 269  %define xgft1_lo  ymm13
 270  %define xgft1_hi  ymm12
 271  %define xgft2_lo  ymm11
 272  %define xgft2_hi  ymm10
 273  %define xgft3_lo  ymm9
 274  %define xgft3_hi  ymm8
 275  %define xgft4_lo  ymm7
 276  %define xgft4_hi  ymm6
 277
     ;; x0/xtmpa hold the split source nibbles; xp1-xp4 are the four
     ;; output accumulators.
 278  %define x0     ymm0
 279  %define xtmpa  ymm1
 280  %define xp1    ymm2
 281  %define xp2    ymm3
 282  %define xp3    ymm4
 283  %define xp4    ymm5
 284 %else
     ;; 32-bit builds only use ymm0-ymm7: all four table pairs share
     ;; ymm6/ymm7, and ymm7 doubles as the 0x0f mask, so the function body
     ;; rebuilds the mask and reloads each table right before using it.
 285  %define ymm_trans ymm7                 ;reuse xmask0f and xgft1_hi
 286  %define xmask0f   ymm_trans
 287  %define xmask0fx  xmm7
 288  %define xgft1_lo  ymm6
 289  %define xgft1_hi  ymm_trans
 290  %define xgft2_lo  xgft1_lo
 291  %define xgft2_hi  xgft1_hi
 292  %define xgft3_lo  xgft1_lo
 293  %define xgft3_hi  xgft1_hi
 294  %define xgft4_lo  xgft1_lo
 295  %define xgft4_hi  xgft1_hi
 296
 297  %define x0     ymm0
 298  %define xtmpa  ymm1
 299  %define xp1    ymm2
 300  %define xp2    ymm3
 301  %define xp3    ymm4
 302  %define xp4    ymm5
 303 %endif
 304 align 16
     ;;
     ;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
     ;;
     ;; For each 32-byte block at offset pos, walks the `vec` source
     ;; buffers, splits every source byte into its low and high nibble,
     ;; looks both up (vpshufb) in four 32-byte tables per source and XORs
     ;; the results into four accumulators, which are then stored to
     ;; dests[0..3] at offset pos.
     ;;
     ;; Table addressing used below, for source i and output k (k = 0..3):
     ;;   g_tbls + 32*i + 32*vec*k
     ;; Each 32-byte table is loaded as one ymm register; its two 16-byte
     ;; halves serve the low-nibble and high-nibble lookups.
     ;;
     ;; Returns 0 on success, or 1 when len < 32 (no output is stored).
     ;; If len is not a multiple of 32, one extra pass is run at offset
     ;; len-32, overlapping bytes that were already produced.
     ;;
     ;; SLDR/SSTR lines are real moves only in the 32-bit build; in the
     ;; 64-bit builds they assemble to nothing.
 305 global gf_4vect_dot_prod_avx2:function
 306 func(gf_4vect_dot_prod_avx2)
 307         FUNC_SAVE
 308         SLDR    len, len_m
 309         sub     len, 32                 ;len = length-32 = last valid block offset
 310         SSTR    len_m, len              ;(a mov in 32-bit code: flags of the sub survive)
 311         jl      .return_fail            ;length < 32: fail
 312         xor     pos, pos
 313         mov     tmp.b, 0x0f
 314         vpinsrb xmask0fx, xmask0fx, tmp.w, 0
 315         vpbroadcastb xmask0f, xmask0fx  ;Construct mask 0x0f0f0f...
 316         mov     vskip3, vec
 317         imul    vskip3, 96              ;vskip3 = vec*96: offset to the 4th table set
 318         SSTR    vskip3_m, vskip3
 319         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 320         SLDR    dest1, dest1_m          ;dest1 = dests (pointer array) for now
 321         mov     dest2, [dest1+PS]       ;dest2 = dests[1]
 322         SSTR    dest2_m, dest2
 323         mov     dest3, [dest1+2*PS]     ;dest3 = dests[2]
 324         SSTR    dest3_m, dest3
 325         mov     dest4, [dest1+3*PS]     ;dest4 = dests[3]
 326         SSTR    dest4_m, dest4
 327         mov     dest1, [dest1]          ;dest1 = dests[0], read last as it overwrites dest1
 328         SSTR    dest1_m, dest1
 329
     ;; Outer loop: one 32-byte output block per iteration.
 330 .loop32:
 331         vpxor   xp1, xp1                ;Clear the four accumulators
 332         vpxor   xp2, xp2
 333         vpxor   xp3, xp3
 334         vpxor   xp4, xp4
 335         mov     tmp, mul_array          ;tmp walks the tables, 32 bytes per source
 336         xor     vec_i, vec_i
 337
     ;; Inner loop: one source buffer per iteration.
 338 .next_vect:
 339         SLDR    src, src_m
 340         mov     ptr, [src+vec_i]        ;ptr = next source buffer pointer
 341         XLDR    x0, [ptr+pos]           ;Get next source vector
 342
 343         add     vec_i, PS
 344  %ifidn PS,8                            ;64-bit code
     ;; xgft4_lo is used as a scratch register for the low nibbles here;
     ;; it is loaded with its table only after the nibble split is done.
 345         vpand   xgft4_lo, x0, xmask0f   ;Mask low src nibble in bits 3-0
 346         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0
 347         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 348         vperm2i128 xtmpa, xgft4_lo, x0, 0x30    ;swap xtmpa from 1lo|2lo to 1lo|2hi
 349         vperm2i128 x0, xgft4_lo, x0, 0x12       ;swap x0 from    1hi|2hi to 1hi|2lo
 350
 351         vmovdqu xgft1_lo, [tmp]                 ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 352                                                 ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 353         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 354                                                 ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 355         vmovdqu xgft3_lo, [tmp+vec*(64/PS)]     ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 356                                                 ;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 357         vmovdqu xgft4_lo, [tmp+vskip3]          ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
 358                                                 ;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 359
 360         vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 361         vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 362         vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
 363         vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
 364         add     tmp, 32                 ;Advance to the next source's tables
 365  %else                                  ;32-bit code
     ;; The mask register is shared with the xgft*_hi registers in 32-bit
     ;; code, so the mask is rebuilt for every source vector.  ecx is free
     ;; here: src was already consumed by the ptr load above.
 366         mov     cl, 0x0f                ;use ecx as a temp variable
 367         vpinsrb xmask0fx, xmask0fx, ecx, 0
 368         vpbroadcastb xmask0f, xmask0fx  ;Construct mask 0x0f0f0f...
 369
 370         vpand   xgft4_lo, x0, xmask0f   ;Mask low src nibble in bits 3-0
 371         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0
 372         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 373         vperm2i128 xtmpa, xgft4_lo, x0, 0x30    ;swap xtmpa from 1lo|2lo to 1lo|2hi
 374         vperm2i128 x0, xgft4_lo, x0, 0x12       ;swap x0 from    1hi|2hi to 1hi|2lo
 375
 376         vmovdqu xgft1_lo, [tmp]                 ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 377                                                 ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 378         vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 379  %endif
 380
 381         vpshufb xgft1_hi, x0            ;Lookup mul table of high nibble
 382         vpshufb xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 383         vpxor   xgft1_hi, xgft1_lo      ;GF add high and low partials
 384         vpxor   xp1, xgft1_hi           ;xp1 += partial
 385
 386  %ifidn PS,4                            ; 32-bit code
 387         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 388                                                 ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 389         vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 390  %endif
 391         vpshufb xgft2_hi, x0            ;Lookup mul table of high nibble
 392         vpshufb xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 393         vpxor   xgft2_hi, xgft2_lo      ;GF add high and low partials
 394         vpxor   xp2, xgft2_hi           ;xp2 += partial
 395
 396  %ifidn PS,4                            ; 32-bit code
 397         sal     vec, 1                  ;Double vec so vec*(32/PS) reaches the C tables
 398         vmovdqu xgft3_lo, [tmp+vec*(32/PS)]     ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 399                                                 ;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 400         vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
 401         sar     vec, 1                  ;Restore vec
 402  %endif
 403         vpshufb xgft3_hi, x0            ;Lookup mul table of high nibble
 404         vpshufb xgft3_lo, xtmpa         ;Lookup mul table of low nibble
 405         vpxor   xgft3_hi, xgft3_lo      ;GF add high and low partials
 406         vpxor   xp3, xgft3_hi           ;xp3 += partial
 407
 408  %ifidn PS,4                            ; 32-bit code
 409         SLDR    vskip3, vskip3_m
 410         vmovdqu xgft4_lo, [tmp+vskip3]          ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
 411                                                 ;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 412         vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
 413         add     tmp, 32                 ;Advance to the next source's tables
 414  %endif
 415         vpshufb xgft4_hi, x0            ;Lookup mul table of high nibble
 416         vpshufb xgft4_lo, xtmpa         ;Lookup mul table of low nibble
 417         vpxor   xgft4_hi, xgft4_lo      ;GF add high and low partials
 418         vpxor   xp4, xgft4_hi           ;xp4 += partial
 419
 420         cmp     vec_i, vec              ;vec was scaled by PS, like vec_i
 421         jl      .next_vect
 422
     ;; All sources folded in: store the four accumulators for this block.
 423         SLDR    dest1, dest1_m
 424         SLDR    dest2, dest2_m
 425         XSTR    [dest1+pos], xp1
 426         XSTR    [dest2+pos], xp2
 427         SLDR    dest3, dest3_m
 428         XSTR    [dest3+pos], xp3
 429         SLDR    dest4, dest4_m
 430         XSTR    [dest4+pos], xp4
 431
 432         SLDR    len, len_m
 433         add     pos, 32                 ;Loop on 32 bytes at a time
 434         cmp     pos, len                ;len still holds length-32
 435         jle     .loop32                 ;Another full block fits
 436
 437         lea     tmp, [len + 32]         ;tmp = original length
 438         cmp     pos, tmp
 439         je      .return_pass            ;Length was a multiple of 32: done
 440
 441         ;; Tail len
 442         mov     pos, len        ;Overlapped offset length-32
 443         jmp     .loop32         ;Do one more overlap pass
 444
 445 .return_pass:
 446         mov     return, 0
 447         FUNC_RESTORE
 448         ret
 449
 450 .return_fail:
 451         mov     return, 1
 452         FUNC_RESTORE
 453         ret
 454
     ;; Closes the proc_frame opened by func() in the win64 build.
     ;; NOTE(review): this line is unconditional, so endproc_frame must be
     ;; defined elsewhere for the other output formats (presumably in
     ;; reg_sizes.asm) -- confirm.
 455 endproc_frame
 456
 457 section .data
 458
     ;;; slversion is not defined in this file (presumably a macro from
     ;;; reg_sizes.asm); it is invoked here, in .data, with the function
     ;;; name and the core/ver/snum values below.
     ;;; NOTE(review): confirm what the macro emits.
 459 ;;;       func                   core, ver, snum
 460 slversion gf_4vect_dot_prod_avx2, 04,  05,  0198