ceph/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
     ;;; ELF64 build: the first six arguments arrive in rdi, rsi, rdx, rcx,
     ;;; r8, r9.  The scratch aliases tmp3..tmp6 map onto r12-r15, which
     ;;; FUNC_SAVE pushes and FUNC_RESTORE pops in the reverse order.
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43
  44  %define tmp   r11
  45  %define tmp.w r11d
  46  %define tmp.b r11b
  47  %define tmp2  r10
  48  %define tmp3  r13              ; must be saved and restored
  49  %define tmp4  r12              ; must be saved and restored
  50  %define tmp5  r14              ; must be saved and restored
  51  %define tmp6  r15              ; must be saved and restored
  52  %define return rax
  53  %define PS 8                   ; pointer size in bytes
  54  %define LOG_PS 3               ; log2(PS), used as a shift count
  55
  56  %define func(x) x:             ; plain label, no unwind information
      ;; Prologue: save the four registers used as tmp3..tmp6.
  57  %macro FUNC_SAVE 0
  58         push    r12
  59         push    r13
  60         push    r14
  61         push    r15
  62  %endmacro
      ;; Epilogue: pops must mirror the pushes in FUNC_SAVE.
  63  %macro FUNC_RESTORE 0
  64         pop     r15
  65         pop     r14
  66         pop     r13
  67         pop     r12
  68  %endmacro
  69 %endif
  70
  71 %ifidn __OUTPUT_FORMAT__, win64
     ;;; Win64 build: the first four arguments arrive in rcx, rdx, r8, r9.
     ;;; The fifth argument is read from the caller's stack into r12 at the
     ;;; end of FUNC_SAVE.  arg5 is not an incoming argument here; r15 is
     ;;; only used as a scratch register under that name.
  72  %define arg0   rcx
  73  %define arg1   rdx
  74  %define arg2   r8
  75  %define arg3   r9
  76
  77  %define arg4   r12             ; must be saved, loaded and restored
  78  %define arg5   r15             ; must be saved and restored
  79  %define tmp    r11
  80  %define tmp.w  r11d
  81  %define tmp.b  r11b
  82  %define tmp2   r10
  83  %define tmp3   r13             ; must be saved and restored
  84  %define tmp4   r14             ; must be saved and restored
  85  %define tmp5   rdi             ; must be saved and restored
  86  %define tmp6   rsi             ; must be saved and restored
  87  %define return rax
  88  %define PS     8               ; pointer size in bytes
  89  %define LOG_PS 3               ; log2(PS), used as a shift count
      ;; Frame layout: 10 x 16 bytes for xmm6-xmm15, then 7 x 8-byte slots of
      ;; which FUNC_SAVE uses six (r12-r15, rdi, rsi).  The size is an odd
      ;; multiple of 8 so that rsp is 16-byte aligned after the allocation,
      ;; as the vmovdqa saves below require.
  90  %define stack_size  10*16 + 7*8                ; must be an odd multiple of 8
      ;; arg(x): x-th argument slot on the caller's stack, addressed past
      ;; this frame (stack_size) and the return address (PS).
  91  %define arg(x)      [rsp + stack_size + PS + PS*x]
  92
  93  %define func(x) proc_frame x
      ;; Prologue: allocate the frame, save xmm6-xmm15 and the integer
      ;; registers this file uses, then load the fifth argument into arg4.
  94  %macro FUNC_SAVE 0
  95         alloc_stack     stack_size
  96         vmovdqa [rsp + 0*16], xmm6
  97         vmovdqa [rsp + 1*16], xmm7
  98         vmovdqa [rsp + 2*16], xmm8
  99         vmovdqa [rsp + 3*16], xmm9
 100         vmovdqa [rsp + 4*16], xmm10
 101         vmovdqa [rsp + 5*16], xmm11
 102         vmovdqa [rsp + 6*16], xmm12
 103         vmovdqa [rsp + 7*16], xmm13
 104         vmovdqa [rsp + 8*16], xmm14
 105         vmovdqa [rsp + 9*16], xmm15
 106         save_reg        r12,  10*16 + 0*8
 107         save_reg        r13,  10*16 + 1*8
 108         save_reg        r14,  10*16 + 2*8
 109         save_reg        r15,  10*16 + 3*8
 110         save_reg        rdi,  10*16 + 4*8
 111         save_reg        rsi,  10*16 + 5*8
 112         end_prolog
 113         mov     arg4, arg(4)            ; fifth argument lives on the stack
 114  %endmacro
 115
      ;; Epilogue: reload everything FUNC_SAVE stored, then release the frame.
 116  %macro FUNC_RESTORE 0
 117         vmovdqa xmm6, [rsp + 0*16]
 118         vmovdqa xmm7, [rsp + 1*16]
 119         vmovdqa xmm8, [rsp + 2*16]
 120         vmovdqa xmm9, [rsp + 3*16]
 121         vmovdqa xmm10, [rsp + 4*16]
 122         vmovdqa xmm11, [rsp + 5*16]
 123         vmovdqa xmm12, [rsp + 6*16]
 124         vmovdqa xmm13, [rsp + 7*16]
 125         vmovdqa xmm14, [rsp + 8*16]
 126         vmovdqa xmm15, [rsp + 9*16]
 127         mov     r12,  [rsp + 10*16 + 0*8]
 128         mov     r13,  [rsp + 10*16 + 1*8]
 129         mov     r14,  [rsp + 10*16 + 2*8]
 130         mov     r15,  [rsp + 10*16 + 3*8]
 131         mov     rdi,  [rsp + 10*16 + 4*8]
 132         mov     rsi,  [rsp + 10*16 + 5*8]
 133         add     rsp, stack_size
 134  %endmacro
 135 %endif
 136
 137 %define len    arg0            ; byte count; reduced by 32 on entry
     ;;; Symbolic names for the arguments and scratch registers.  tmp, ptr
     ;;; and vec_i are reused as destination pointers 3..5 once the inner
     ;;; loop over the sources has finished.
 138 %define vec    arg1            ; number of sources; scaled to vec*PS on entry
 139 %define mul_array arg2         ; base of the 32-byte multiplication tables
 140 %define src    arg3            ; array of source buffer pointers
 141 %define dest   arg4            ; array of five destination buffer pointers
 142 %define ptr    arg5            ; current source pointer / 4th destination
 143 %define vec_i  tmp2            ; byte offset into src[] / 5th destination
 144 %define dest1  tmp3            ; cached dest[0]
 145 %define dest2  tmp4            ; cached dest[1]
 146 %define vskip1 tmp5            ; vec*32: stride between per-destination tables
 147 %define vskip3 tmp6            ; vec*96: three table strides
 148 %define pos    return          ; byte offset into the buffers (shares rax)
 149
 150
 151 %ifndef EC_ALIGNED_ADDR
 152 ;;; Use unaligned load/store
 153  %define XLDR vmovdqu
 154  %define XSTR vmovdqu
 155 %else
 156 ;;; EC_ALIGNED_ADDR is defined: use aligned load/store (NO_NT_LDST),
     ;;; otherwise non-temporal load/store
 157  %ifdef NO_NT_LDST
 158   %define XLDR vmovdqa
 159   %define XSTR vmovdqa
 160  %else
 161   %define XLDR vmovntdqa
 162   %define XSTR vmovntdq
 163  %endif
 164 %endif
 165
 166 default rel
 167
 168 [bits 64]
 169 section .text
 170
     ;;; Vector register roles.  Only four lo/hi table pairs are named; the
     ;;; function reloads the xgft1 pair for the fifth destination.
 171 %define xmask0f   ymm15        ; 0x0f in every byte (nibble mask)
 172 %define xmask0fx  xmm15        ; low 128 bits of xmask0f
 173 %define xgft1_lo  ymm14        ; table as loaded from memory
 174 %define xgft1_hi  ymm13        ; same table with its 128-bit halves swapped
 175 %define xgft2_lo  ymm12
 176 %define xgft2_hi  ymm11
 177 %define xgft3_lo  ymm10
 178 %define xgft3_hi  ymm9
 179 %define xgft4_lo  ymm8
 180 %define xgft4_hi  ymm7
 181
 182
 183 %define x0     ymm0            ; source bytes, then one set of nibble indices
 184 %define xtmpa  ymm1            ; the other set of nibble indices
 185 %define xp1    ymm2            ; accumulator for destination 1
 186 %define xp2    ymm3            ; accumulator for destination 2
 187 %define xp3    ymm4            ; accumulator for destination 3
 188 %define xp4    ymm5            ; accumulator for destination 4
 189 %define xp5    ymm6            ; accumulator for destination 5
 190
 191 align 16
     ;;; ------------------------------------------------------------------
     ;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
     ;;;
     ;;; For every byte position, looks up each source byte in five
     ;;; per-source nibble tables and XORs the results over all `vec`
     ;;; sources into five destination buffers, 32 bytes per iteration.
     ;;;
     ;;; In:   len       byte count of every buffer
     ;;;       vec       number of source buffers
     ;;;       mul_array tables, 32 bytes per (destination, source) pair;
     ;;;                 destination k's tables start at mul_array + k*vec*32
     ;;;       src       array of `vec` source pointers
     ;;;       dest      array of five destination pointers
     ;;; Out:  rax = 0 on success, 1 if len < 32 (nothing is written).
     ;;;
     ;;; A length that is not a multiple of 32 is finished with one extra
     ;;; pass at offset len-32, which recomputes and rewrites bytes already
     ;;; produced by the previous pass.
     ;;;
     ;;; NOTE(review): the source loop tests its condition at the bottom, so
     ;;; src[0] is read even when vec == 0 -- confirm callers never pass 0.
     ;;; NOTE(review): no vzeroupper is issued before ret -- confirm whether
     ;;; callers rely on legacy-SSE code right after this call.
     ;;; ------------------------------------------------------------------
 192 global gf_5vect_dot_prod_avx2:function
 193 func(gf_5vect_dot_prod_avx2)
 194         FUNC_SAVE
 195         sub     len, 32                 ;len now holds the last valid 32-byte offset
 196         jl      .return_fail            ;fewer than 32 bytes: not handled here
 197         xor     pos, pos
 198         mov     tmp.b, 0x0f
 199         vpinsrb xmask0fx, xmask0fx, tmp.w, 0
 200         vpbroadcastb xmask0f, xmask0fx  ;Construct mask 0x0f0f0f...
 201         mov     vskip1, vec
 202         imul    vskip1, 32              ;vskip1 = vec*32, stride between destinations' tables
 203         mov     vskip3, vec
 204         imul    vskip3, 96              ;vskip3 = vec*96 (scale 3 is not encodable in an address)
 205         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 206         mov     dest1, [dest]
 207         mov     dest2, [dest+PS]
 208
 209
     ;; Outer loop: one 32-byte column of all five outputs per pass.
 210 .loop32:
 211         mov     tmp, mul_array          ;tmp walks the tables, 32 bytes per source
 212         xor     vec_i, vec_i
 213         vpxor   xp1, xp1                ;clear the five accumulators
 214         vpxor   xp2, xp2
 215         vpxor   xp3, xp3
 216         vpxor   xp4, xp4
 217         vpxor   xp5, xp5
 218
 219
     ;; Inner loop: fold one source buffer into all five accumulators.
 220 .next_vect:
 221         mov     ptr, [src+vec_i]
 222         XLDR    x0, [ptr+pos]           ;Get next source vector
 223         add     vec_i, PS
 224
 225         vpand   xgft4_lo, x0, xmask0f   ;Low nibble of each src byte in bits 3-0 (xgft4_lo used as scratch)
 226         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0
 227         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0 (drops bits shifted in by vpsraw)
 228         vperm2i128 xtmpa, xgft4_lo, x0, 0x30    ;xtmpa = low nibbles of bytes 0-15  | high nibbles of bytes 16-31
 229         vperm2i128 x0, xgft4_lo, x0, 0x12       ;x0    = high nibbles of bytes 0-15 | low nibbles of bytes 16-31
 230
 231         vmovdqu xgft1_lo, [tmp]                 ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 232                                                 ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 233         vmovdqu xgft2_lo, [tmp+vskip1*1]        ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 234                                                 ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 235         vmovdqu xgft3_lo, [tmp+vskip1*2]        ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 236                                                 ;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 237         vmovdqu xgft4_lo, [tmp+vskip3]          ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
 238                                                 ;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 239
             ;; Each *_hi register is the table with its 128-bit halves
             ;; swapped, so it lines up with the index layout in x0 while
             ;; the unswapped *_lo register lines up with xtmpa.
 240         vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 241         vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 242         vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
 243         vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
 244
 245         vpshufb xgft1_hi, x0            ;Lookup mul table of high nibble
 246         vpshufb xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 247         vpxor   xgft1_hi, xgft1_lo      ;GF add high and low partials
 248         vpxor   xp1, xgft1_hi           ;xp1 += partial
 249
 250         vpshufb xgft2_hi, x0            ;Lookup mul table of high nibble
 251         vpshufb xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 252         vpxor   xgft2_hi, xgft2_lo      ;GF add high and low partials
 253         vpxor   xp2, xgft2_hi           ;xp2 += partial
 254
             ;; The xgft1 pair is free again; reuse it for the fifth table.
 255         vmovdqu xgft1_lo, [tmp+vskip1*4]        ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
 256                                                 ;     "     Ex{00}, Ex{10}, ..., Ex{f0}
 257         vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 258         add     tmp, 32                 ;advance to the next source's tables
 259
 260         vpshufb xgft3_hi, x0            ;Lookup mul table of high nibble
 261         vpshufb xgft3_lo, xtmpa         ;Lookup mul table of low nibble
 262         vpxor   xgft3_hi, xgft3_lo      ;GF add high and low partials
 263         vpxor   xp3, xgft3_hi           ;xp3 += partial
 264
 265         vpshufb xgft4_hi, x0            ;Lookup mul table of high nibble
 266         vpshufb xgft4_lo, xtmpa         ;Lookup mul table of low nibble
 267         vpxor   xgft4_hi, xgft4_lo      ;GF add high and low partials
 268         vpxor   xp4, xgft4_hi           ;xp4 += partial
 269
 270         vpshufb xgft1_hi, x0            ;Lookup mul table of high nibble
 271         vpshufb xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 272         vpxor   xgft1_hi, xgft1_lo      ;GF add high and low partials
 273         vpxor   xp5, xgft1_hi           ;xp5 += partial
 274
 275         cmp     vec_i, vec              ;vec was scaled by PS above
 276         jl      .next_vect
 277
             ;; tmp, ptr and vec_i are dead here and are reloaded at the top
             ;; of .loop32 / .next_vect, so reuse them for dest[2..4].
 278         mov     tmp, [dest+2*PS]
 279         mov     ptr, [dest+3*PS]
 280         mov     vec_i, [dest+4*PS]
 281
 282         XSTR    [dest1+pos], xp1
 283         XSTR    [dest2+pos], xp2
 284         XSTR    [tmp+pos], xp3
 285         XSTR    [ptr+pos], xp4
 286         XSTR    [vec_i+pos], xp5
 287
 288         add     pos, 32                 ;Loop on 32 bytes at a time
 289         cmp     pos, len                ;len holds (original length - 32)
 290         jle     .loop32
 291
 292         lea     tmp, [len + 32]         ;tmp = original length
 293         cmp     pos, tmp
 294         je      .return_pass            ;every byte has been written
 295
 296         ;; Tail len
 297         mov     pos, len        ;Overlapped offset length-32
 298         jmp     .loop32         ;Do one more overlap pass
 299
 300 .return_pass:
 301         FUNC_RESTORE
 302         mov     return, 0
 303         ret
 304
 305 .return_fail:
 306         FUNC_RESTORE
 307         mov     return, 1
 308         ret
 309
 310 endproc_frame
 311
 312 section .data
 313
     ;;; Version record for this routine.  slversion is not defined in this
     ;;; file; presumably it comes from reg_sizes.asm -- TODO confirm.
 314 ;;;       func                  core, ver, snum
 315 slversion gf_5vect_dot_prod_avx2, 04,  04,  0199