ceph/src/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, elf64
  37  %define arg0  rdi
  38  %define arg1  rsi
  39  %define arg2  rdx
  40  %define arg3  rcx
  41  %define arg4  r8
  42  %define arg5  r9
  43
  44  %define tmp   r11
  45  %define tmp2  r10
  46  %define tmp3  r13              ; must be saved and restored
  47  %define tmp4  r12              ; must be saved and restored
  48  %define tmp5  r14              ; must be saved and restored
  49  %define tmp6  r15              ; must be saved and restored
  50  %define return rax
  51  %define PS 8
  52  %define LOG_PS 3
  53
  54  %define func(x) x:
  55  %macro FUNC_SAVE 0
  56         push    r12
  57         push    r13
  58         push    r14
  59         push    r15
  60  %endmacro
  61  %macro FUNC_RESTORE 0
  62         pop     r15
  63         pop     r14
  64         pop     r13
  65         pop     r12
  66  %endmacro
  67 %endif
  68
  69 %ifidn __OUTPUT_FORMAT__, win64
  70  %define arg0   rcx
  71  %define arg1   rdx
  72  %define arg2   r8
  73  %define arg3   r9
  74
  75  %define arg4   r12             ; must be saved, loaded and restored
  76  %define arg5   r15             ; must be saved and restored
  77  %define tmp    r11
  78  %define tmp2   r10
  79  %define tmp3   r13             ; must be saved and restored
  80  %define tmp4   r14             ; must be saved and restored
  81  %define tmp5   rdi             ; must be saved and restored
  82  %define tmp6   rsi             ; must be saved and restored
  83  %define return rax
  84  %define PS     8
  85  %define LOG_PS 3
  86  %define stack_size  10*16 + 7*8                ; must be an odd multiple of 8
  87  %define arg(x)      [rsp + stack_size + PS + PS*x]
  88
  89  %define func(x) proc_frame x
  90  %macro FUNC_SAVE 0
  91         alloc_stack     stack_size
  92         save_xmm128     xmm6, 0*16
  93         save_xmm128     xmm7, 1*16
  94         save_xmm128     xmm8, 2*16
  95         save_xmm128     xmm9, 3*16
  96         save_xmm128     xmm10, 4*16
  97         save_xmm128     xmm11, 5*16
  98         save_xmm128     xmm12, 6*16
  99         save_xmm128     xmm13, 7*16
 100         save_xmm128     xmm14, 8*16
 101         save_xmm128     xmm15, 9*16
 102         save_reg        r12,  10*16 + 0*8
 103         save_reg        r13,  10*16 + 1*8
 104         save_reg        r14,  10*16 + 2*8
 105         save_reg        r15,  10*16 + 3*8
 106         save_reg        rdi,  10*16 + 4*8
 107         save_reg        rsi,  10*16 + 5*8
 108         end_prolog
 109         mov     arg4, arg(4)
 110  %endmacro
 111
 112  %macro FUNC_RESTORE 0
 113         movdqa  xmm6, [rsp + 0*16]
 114         movdqa  xmm7, [rsp + 1*16]
 115         movdqa  xmm8, [rsp + 2*16]
 116         movdqa  xmm9, [rsp + 3*16]
 117         movdqa  xmm10, [rsp + 4*16]
 118         movdqa  xmm11, [rsp + 5*16]
 119         movdqa  xmm12, [rsp + 6*16]
 120         movdqa  xmm13, [rsp + 7*16]
 121         movdqa  xmm14, [rsp + 8*16]
 122         movdqa  xmm15, [rsp + 9*16]
 123         mov     r12,  [rsp + 10*16 + 0*8]
 124         mov     r13,  [rsp + 10*16 + 1*8]
 125         mov     r14,  [rsp + 10*16 + 2*8]
 126         mov     r15,  [rsp + 10*16 + 3*8]
 127         mov     rdi,  [rsp + 10*16 + 4*8]
 128         mov     rsi,  [rsp + 10*16 + 5*8]
 129         add     rsp, stack_size
 130  %endmacro
 131 %endif
 132
 133 %define len    arg0
 134 %define vec    arg1
 135 %define mul_array arg2
 136 %define src    arg3
 137 %define dest   arg4
 138 %define ptr    arg5
 139 %define vec_i  tmp2
 140 %define dest1  tmp3
 141 %define dest2  tmp4
 142 %define vskip1 tmp5
 143 %define vskip3 tmp6
 144 %define pos    return
 145
 146
 147 %ifndef EC_ALIGNED_ADDR
 148 ;;; Use Un-aligned load/store
 149  %define XLDR movdqu
 150  %define XSTR movdqu
 151 %else
 152 ;;; Use Non-temporal load/stor
 153  %ifdef NO_NT_LDST
 154   %define XLDR movdqa
 155   %define XSTR movdqa
 156  %else
 157   %define XLDR movntdqa
 158   %define XSTR movntdq
 159  %endif
 160 %endif
 161
 162 default rel
 163
 164 [bits 64]
 165 section .text
 166
 167 %define xmask0f   xmm15
 168 %define xgft1_lo  xmm2
 169 %define xgft1_hi  xmm3
 170 %define xgft2_lo  xmm4
 171 %define xgft2_hi  xmm5
 172 %define xgft3_lo  xmm10
 173 %define xgft3_hi  xmm6
 174 %define xgft4_lo  xmm8
 175 %define xgft4_hi  xmm7
 176
 177
 178 %define x0     xmm0
 179 %define xtmpa  xmm1
 180 %define xp1    xmm9
 181 %define xp2    xmm11
 182 %define xp3    xmm12
 183 %define xp4    xmm13
 184 %define xp5    xmm14
 185
 186 align 16
 187 global gf_5vect_dot_prod_sse:function
 188 func(gf_5vect_dot_prod_sse)
 189         FUNC_SAVE
 190         sub     len, 16
 191         jl      .return_fail
 192         xor     pos, pos
 193         movdqa  xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 194         mov     vskip1, vec
 195         imul    vskip1, 32
 196         mov     vskip3, vec
 197         imul    vskip3, 96
 198         sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
 199         mov     dest1, [dest]
 200         mov     dest2, [dest+PS]
 201
 202
 203 .loop16:
 204         mov     tmp, mul_array
 205         xor     vec_i, vec_i
 206         pxor    xp1, xp1
 207         pxor    xp2, xp2
 208         pxor    xp3, xp3
 209         pxor    xp4, xp4
 210         pxor    xp5, xp5
 211
 212
 213 .next_vect:
 214         mov     ptr, [src+vec_i]
 215         add     vec_i, PS
 216         XLDR    x0, [ptr+pos]           ;Get next source vector
 217
 218         movdqu  xgft1_lo, [tmp]                 ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 219         movdqu  xgft1_hi, [tmp+16]              ;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 220         movdqu  xgft2_lo, [tmp+vskip1*1]        ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 221         movdqu  xgft2_hi, [tmp+vskip1*1+16]     ;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 222         movdqu  xgft3_lo, [tmp+vskip1*2]        ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 223         movdqu  xgft3_hi, [tmp+vskip1*2+16]     ;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 224         movdqu  xgft4_lo, [tmp+vskip3]          ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
 225         movdqu  xgft4_hi, [tmp+vskip3+16]       ;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 226
 227         movdqa  xtmpa, x0               ;Keep unshifted copy of src
 228         psraw   x0, 4                   ;Shift to put high nibble into bits 4-0
 229         pand    x0, xmask0f             ;Mask high src nibble in bits 4-0
 230         pand    xtmpa, xmask0f          ;Mask low src nibble in bits 4-0
 231
 232         pshufb  xgft1_hi, x0            ;Lookup mul table of high nibble
 233         pshufb  xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 234         pxor    xgft1_hi, xgft1_lo      ;GF add high and low partials
 235         pxor    xp1, xgft1_hi           ;xp1 += partial
 236
 237         pshufb  xgft2_hi, x0            ;Lookup mul table of high nibble
 238         pshufb  xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 239         pxor    xgft2_hi, xgft2_lo      ;GF add high and low partials
 240         pxor    xp2, xgft2_hi           ;xp2 += partial
 241
 242         movdqu  xgft1_lo, [tmp+vskip1*4]        ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
 243         movdqu  xgft1_hi, [tmp+vskip1*4+16]     ;     "     Ex{00}, Ex{10}, ..., Ex{f0}
 244         add     tmp, 32
 245
 246         pshufb  xgft3_hi, x0            ;Lookup mul table of high nibble
 247         pshufb  xgft3_lo, xtmpa         ;Lookup mul table of low nibble
 248         pxor    xgft3_hi, xgft3_lo      ;GF add high and low partials
 249         pxor    xp3, xgft3_hi           ;xp3 += partial
 250
 251         pshufb  xgft4_hi, x0            ;Lookup mul table of high nibble
 252         pshufb  xgft4_lo, xtmpa         ;Lookup mul table of low nibble
 253         pxor    xgft4_hi, xgft4_lo      ;GF add high and low partials
 254         pxor    xp4, xgft4_hi           ;xp4 += partial
 255
 256         pshufb  xgft1_hi, x0            ;Lookup mul table of high nibble
 257         pshufb  xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 258         pxor    xgft1_hi, xgft1_lo      ;GF add high and low partials
 259         pxor    xp5, xgft1_hi           ;xp5 += partial
 260
 261         cmp     vec_i, vec
 262         jl      .next_vect
 263
 264         mov     tmp, [dest+2*PS]
 265         mov     ptr, [dest+3*PS]
 266         mov     vec_i, [dest+4*PS]
 267
 268         XSTR    [dest1+pos], xp1
 269         XSTR    [dest2+pos], xp2
 270         XSTR    [tmp+pos], xp3
 271         XSTR    [ptr+pos], xp4
 272         XSTR    [vec_i+pos], xp5
 273
 274         add     pos, 16                 ;Loop on 16 bytes at a time
 275         cmp     pos, len
 276         jle     .loop16
 277
 278         lea     tmp, [len + 16]
 279         cmp     pos, tmp
 280         je      .return_pass
 281
 282         ;; Tail len
 283         mov     pos, len        ;Overlapped offset length-16
 284         jmp     .loop16         ;Do one more overlap pass
 285
 286 .return_pass:
 287         FUNC_RESTORE
 288         mov     return, 0
 289         ret
 290
 291 .return_fail:
 292         FUNC_RESTORE
 293         mov     return, 1
 294         ret
 295
 296 endproc_frame
 297
 298 section .data
 299
 300 align 16
 301 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
 302
 303 ;;;       func                  core, ver, snum
 304 slversion gf_5vect_dot_prod_sse, 00,  05,  0065