ceph/src/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43
  44  %define tmp   r11
  45  %define tmp.w r11d
  46  %define tmp.b r11b
  47  %define tmp2  r10
  48  %define tmp3  r9
  49  %define tmp4  r12              ; must be saved and restored
  50  %define return rax
  51  %macro  SLDR   2
  52  %endmacro
  53  %define SSTR   SLDR
  54  %define PS     8
  55  %define LOG_PS 3
  56
  57  %define func(x) x: endbranch
  58  %macro FUNC_SAVE 0
  59         push    r12
  60  %endmacro
  61  %macro FUNC_RESTORE 0
  62         pop     r12
  63  %endmacro
  64 %endif
  65
  66 %ifidn __OUTPUT_FORMAT__, win64
  67  %define arg0   rcx
  68  %define arg1   rdx
  69  %define arg2   r8
  70  %define arg3   r9
  71
  72  %define arg4   r12             ; must be saved, loaded and restored
  73  %define tmp    r11
  74  %define tmp.w  r11d
  75  %define tmp.b  r11b
  76  %define tmp2   r10
  77  %define tmp3   r13             ; must be saved and restored
  78  %define tmp4   r14             ; must be saved and restored
  79  %define return rax
  80  %macro  SLDR   2
  81  %endmacro
  82  %define SSTR   SLDR
  83  %define PS     8
  84  %define LOG_PS 3
  85  %define stack_size  3*16 + 3*8         ; must be an odd multiple of 8
  86  %define arg(x)      [rsp + stack_size + PS + PS*x]
  87
  88  %define func(x) proc_frame x
  89  %macro FUNC_SAVE 0
  90         alloc_stack     stack_size
  91         vmovdqa [rsp + 0*16], xmm6
  92         vmovdqa [rsp + 1*16], xmm7
  93         vmovdqa [rsp + 2*16], xmm8
  94         save_reg        r12,  3*16 + 0*8
  95         save_reg        r13,  3*16 + 1*8
  96         save_reg        r14,  3*16 + 2*8
  97         end_prolog
  98         mov     arg4, arg(4)
  99  %endmacro
 100
 101  %macro FUNC_RESTORE 0
 102         vmovdqa xmm6, [rsp + 0*16]
 103         vmovdqa xmm7, [rsp + 1*16]
 104         vmovdqa xmm8, [rsp + 2*16]
 105         mov     r12,  [rsp + 3*16 + 0*8]
 106         mov     r13,  [rsp + 3*16 + 1*8]
 107         mov     r14,  [rsp + 3*16 + 2*8]
 108         add     rsp, stack_size
 109  %endmacro
 110 %endif
 111
 112 %ifidn __OUTPUT_FORMAT__, elf32
 113
 114 ;;;================== High Address;
 115 ;;;     arg4
 116 ;;;     arg3
 117 ;;;     arg2
 118 ;;;     arg1
 119 ;;;     arg0
 120 ;;;     return
 121 ;;;<================= esp of caller
 122 ;;;     ebp
 123 ;;;<================= ebp = esp
 124 ;;;     var0
 125 ;;;     esi
 126 ;;;     edi
 127 ;;;     ebx
 128 ;;;<================= esp of callee
 129 ;;;
 130 ;;;================== Low Address;
 131
 132  %define PS 4
 133  %define LOG_PS 2
 134  %define func(x) x: endbranch
 135  %define arg(x) [ebp + PS*2 + PS*x]
 136  %define var(x) [ebp - PS - PS*x]
 137
 138  %define trans   ecx
 139  %define trans2  esi
 140  %define arg0    trans                  ;trans and trans2 are for the variables in stack
 141  %define arg0_m  arg(0)
 142  %define arg1    ebx
 143  %define arg2    arg2_m
 144  %define arg2_m  arg(2)
 145  %define arg3    trans
 146  %define arg3_m  arg(3)
 147  %define arg4    trans
 148  %define arg4_m  arg(4)
 149  %define tmp     edx
 150  %define tmp.w   edx
 151  %define tmp.b   dl
 152  %define tmp2    edi
 153  %define tmp3    trans2
 154  %define tmp4    trans2
 155  %define tmp4_m  var(0)
 156  %define return  eax
 157  %macro SLDR     2                      ;stack load/restore
 158         mov %1, %2
 159  %endmacro
 160  %define SSTR SLDR
 161
 162  %macro FUNC_SAVE 0
 163         push    ebp
 164         mov     ebp, esp
 165         sub     esp, PS*1               ;1 local variable
 166         push    esi
 167         push    edi
 168         push    ebx
 169         mov     arg1, arg(1)
 170  %endmacro
 171
 172  %macro FUNC_RESTORE 0
 173         pop     ebx
 174         pop     edi
 175         pop     esi
 176         add     esp, PS*1               ;1 local variable
 177         pop     ebp
 178  %endmacro
 179
 180 %endif  ; output formats
 181
 182 %define len   arg0
 183 %define vec   arg1
 184 %define mul_array arg2
 185 %define src   arg3
 186 %define dest1 arg4
 187
 188 %define vec_i tmp2
 189 %define ptr   tmp3
 190 %define dest2 tmp4
 191 %define pos   return
 192
 193 %ifidn PS,4                             ;32-bit code
 194  %define  len_m   arg0_m
 195  %define  src_m   arg3_m
 196  %define  dest1_m arg4_m
 197  %define  dest2_m tmp4_m
 198 %endif
 199
 200 %ifndef EC_ALIGNED_ADDR
 201 ;;; Use Un-aligned load/store
 202  %define XLDR vmovdqu
 203  %define XSTR vmovdqu
 204 %else
 205
 206 ;;; Use Non-temporal load/stor
 207  %ifdef NO_NT_LDST
 208   %define XLDR vmovdqa
 209   %define XSTR vmovdqa
 210  %else
 211   %define XLDR vmovntdqa
 212   %define XSTR vmovntdq
 213  %endif
 214 %endif
 215
 216 %ifidn PS,8                             ;64-bit code
 217  default rel
 218  [bits 64]
 219 %endif
 220
 221 section .text
 222
 223 %ifidn PS,8                             ;64-bit code
 224  %define xmask0f   ymm8
 225  %define xmask0fx  xmm8
 226  %define xgft1_lo  ymm7
 227  %define xgft1_hi  ymm6
 228  %define xgft2_lo  ymm5
 229  %define xgft2_hi  ymm4
 230
 231  %define x0     ymm0
 232  %define xtmpa  ymm1
 233  %define xp1    ymm2
 234  %define xp2    ymm3
 235 %else                                   ;32-bit code
 236  %define xmask0f   ymm7
 237  %define xmask0fx  xmm7
 238  %define xgft1_lo  ymm5
 239  %define xgft1_hi  ymm4
 240  %define xgft2_lo  xgft1_lo
 241  %define xgft2_hi  xgft1_hi
 242
 243  %define x0     ymm0
 244  %define xtmpa  ymm1
 245  %define xp1    ymm2
 246  %define xp2    ymm3
 247
 248 %endif
 249
 250 align 16
 251 mk_global gf_2vect_dot_prod_avx2, function
 252
 253 func(gf_2vect_dot_prod_avx2)
 254         FUNC_SAVE
 255         SLDR    len, len_m
 256         sub     len, 32
 257         SSTR    len_m, len
 258         jl      .return_fail
 259         xor     pos, pos
 260         mov     tmp.b, 0x0f
 261         vpinsrb xmask0fx, xmask0fx, tmp.w, 0
 262         vpbroadcastb xmask0f, xmask0fx  ;Construct mask 0x0f0f0f...
 263
 264         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 265         SLDR    dest1, dest1_m
 266         mov     dest2, [dest1+PS]
 267         SSTR    dest2_m, dest2
 268         mov     dest1, [dest1]
 269         SSTR    dest1_m, dest1
 270
 271 .loop32:
 272         vpxor   xp1, xp1
 273         vpxor   xp2, xp2
 274         mov     tmp, mul_array
 275         xor     vec_i, vec_i
 276
 277 .next_vect:
 278         SLDR    src, src_m
 279         mov     ptr, [src+vec_i]
 280
 281         vmovdqu xgft1_lo, [tmp]         ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 282                                         ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 283         vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
 284         vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
 285  %ifidn PS,8                            ; 64-bit code
 286         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 287                                                 ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 288         vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
 289         vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 290
 291         XLDR    x0, [ptr+pos]           ;Get next source vector
 292         add     tmp, 32
 293         add     vec_i, PS
 294  %else
 295         XLDR    x0, [ptr+pos]           ;Get next source vector
 296  %endif
 297
 298         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
 299         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
 300         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 4-0
 301
 302         vpshufb xgft1_hi, x0            ;Lookup mul table of high nibble
 303         vpshufb xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 304         vpxor   xgft1_hi, xgft1_lo      ;GF add high and low partials
 305         vpxor   xp1, xgft1_hi           ;xp1 += partial
 306
 307  %ifidn PS,4                            ; 32-bit code
 308         vmovdqu xgft2_lo, [tmp+vec*(32/PS)]     ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 309                                                 ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 310         vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
 311         vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 312         add     tmp, 32
 313         add     vec_i, PS
 314  %endif
 315         vpshufb xgft2_hi, x0            ;Lookup mul table of high nibble
 316         vpshufb xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 317         vpxor   xgft2_hi, xgft2_lo      ;GF add high and low partials
 318         vpxor   xp2, xgft2_hi           ;xp2 += partial
 319
 320         cmp     vec_i, vec
 321         jl      .next_vect
 322
 323         SLDR    dest1, dest1_m
 324         SLDR    dest2, dest2_m
 325         XSTR    [dest1+pos], xp1
 326         XSTR    [dest2+pos], xp2
 327
 328         SLDR    len, len_m
 329         add     pos, 32                 ;Loop on 32 bytes at a time
 330         cmp     pos, len
 331         jle     .loop32
 332
 333         lea     tmp, [len + 32]
 334         cmp     pos, tmp
 335         je      .return_pass
 336
 337         ;; Tail len
 338         mov     pos, len        ;Overlapped offset length-16
 339         jmp     .loop32         ;Do one more overlap pass
 340
 341 .return_pass:
 342         mov     return, 0
 343         FUNC_RESTORE
 344         ret
 345
 346 .return_fail:
 347         mov     return, 1
 348         FUNC_RESTORE
 349         ret
 350
 351 endproc_frame
 352
 353 section .data
 354
 355 ;;;       func                   core, ver, snum
 356 slversion gf_2vect_dot_prod_avx2, 04,  05,  0196