;;; ceph/src/isa-l/erasure_code/gf_4vect_mad_sse.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
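;;; Performs a GF(2^8) multiply-and-add of one source buffer into four output
;;; (parity) buffers: for each output p, dest[p][i] ^= gf_mul(cp, src[i]) over
;;; len bytes, where each constant cp is encoded as a 32-byte pair of 16-entry
;;; nibble lookup tables in mul_array.
;;;
;;; Rough C-equivalent sketch (illustrative only; assumes the table layout
;;; produced by ec_init_tables(): for output p and source index vec_i, a
;;; 16-byte low-nibble product table followed by a 16-byte high-nibble
;;; product table at offset 32*(p*vec + vec_i)):
;;;
;;;   /* dest points to an array of four destination buffer pointers */
;;;   for (int p = 0; p < 4; p++) {
;;;           unsigned char *tbl = mul_array + 32 * (p * vec + vec_i);
;;;           for (int i = 0; i < len; i++)
;;;                   dest[p][i] ^= tbl[src[i] & 0x0f] ^ tbl[16 + (src[i] >> 4)];
;;;   }
;;;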

%include "reg_sizes.asm"

%define PS 8

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12
 %define arg5   r15
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

%macro FUNC_SAVE 0
        sub     rsp, stack_size
        movdqa  [rsp+16*0], xmm6
        movdqa  [rsp+16*1], xmm7
        movdqa  [rsp+16*2], xmm8
        movdqa  [rsp+16*3], xmm9
        movdqa  [rsp+16*4], xmm10
        movdqa  [rsp+16*5], xmm11
        movdqa  [rsp+16*6], xmm12
        movdqa  [rsp+16*7], xmm13
        movdqa  [rsp+16*8], xmm14
        movdqa  [rsp+16*9], xmm15
        save_reg        r12, 10*16 + 0*8
        save_reg        r13, 10*16 + 1*8
        save_reg        r15, 10*16 + 2*8
        end_prolog
        mov     arg4, arg(4)
        mov     arg5, arg(5)
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp+16*0]
        movdqa  xmm7, [rsp+16*1]
        movdqa  xmm8, [rsp+16*2]
        movdqa  xmm9, [rsp+16*3]
        movdqa  xmm10, [rsp+16*4]
        movdqa  xmm11, [rsp+16*5]
        movdqa  xmm12, [rsp+16*6]
        movdqa  xmm13, [rsp+16*7]
        movdqa  xmm14, [rsp+16*8]
        movdqa  xmm15, [rsp+16*9]
        mov     r12, [rsp + 10*16 + 0*8]
        mov     r13, [rsp + 10*16 + 1*8]
        mov     r15, [rsp + 10*16 + 2*8]
        add     rsp, stack_size
%endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12
 %define return rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
        push    r12
 %endmacro
 %macro FUNC_RESTORE 0
        pop     r12
 %endmacro
%endif

;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif
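;;; With EC_ALIGNED_ADDR undefined (the default) unaligned moves are used, so
;;; callers need not align their buffers; with EC_ALIGNED_ADDR defined, the
;;; non-temporal (streaming) variants are used unless NO_NT_LDST is also
;;; defined, in which case plain aligned moves are used instead.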

default rel

[bits 64]
section .text

%define xmask0f  xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1    xmm10
%define xd2    xmm11
%define xd3    xtmph1
%define xd4    xtmpl1
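;;; Only 16 XMM registers are available, so the dest3/dest4 data vectors alias
;;; the dest1 temporaries; xd3 and xd4 are loaded only after xtmph1/xtmpl1 have
;;; been consumed (see the "Reuse" comments in the main loop below).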

align 16
global gf_4vect_mad_sse:function
func(gf_4vect_mad_sse)
        FUNC_SAVE
        sub     len, 16
        jl      .return_fail
        xor     pos, pos
        movdqa  xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
        mov     tmp, vec

        sal     vec_i, 5                ;Multiply by 32
        lea     tmp3, [mul_array + vec_i]

        sal     tmp, 6                  ;Multiply by 64

        movdqu  xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
        sal     vec, 5                  ;Multiply by 32
        add     tmp, vec
        movdqu  xgft4_lo, [tmp3+tmp]    ;Load array Dx{00}, Dx{01}, Dx{02}, ...
        movdqu  xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
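        ;; Table addressing: tmp3 = mul_array + 32*vec_i points at the 32-byte
        ;; low/high table pair for dest1; the pair for output p lies a further
        ;; 32*vec*p bytes on (one pair per source vector). From here on the vec
        ;; register holds vec*32, so [tmp3+vec] and [tmp3+2*vec] in the loop
        ;; address the dest2 and dest3 tables directly. This is assumed to be
        ;; the layout produced by ec_init_tables().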

        mov     dest2, [dest1+PS]       ; reuse mul_array
        mov     dest3, [dest1+2*PS]
        mov     dest4, [dest1+3*PS]     ; reuse vec_i
        mov     dest1, [dest1]
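        ;; dest1 (arg5) arrived as a pointer to an array of four destination
        ;; buffer pointers, now loaded into registers. dest2 and dest4 reuse
        ;; the mul_array and vec_i registers, which are free once tmp3 and the
        ;; scaled vec are all that is needed to address the tables.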

.loop16:
        XLDR    x0, [src+pos]           ;Get next source vector
        movdqu  xtmph1, [tmp3+16]       ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
        movdqu  xtmpl1, [tmp3]          ;Load array Ax{00}, Ax{01}, Ax{02}, ...
        movdqu  xtmph2, [tmp3+vec+16]   ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
        movdqu  xtmpl2, [tmp3+vec]      ;Load array Bx{00}, Bx{01}, Bx{02}, ...
        movdqu  xtmpl3, [tmp3+2*vec]    ;Load array Cx{00}, Cx{01}, Cx{02}, ...

        movdqa  xtmph3, xgft3_hi
        movdqa  xtmpl4, xgft4_lo
        movdqa  xtmph4, xgft4_hi

        XLDR    xd1, [dest1+pos]        ;Get next dest vector
        XLDR    xd2, [dest2+pos]        ;Get next dest vector

        movdqa  xtmpa, x0               ;Keep unshifted copy of src
        psraw   x0, 4                   ;Shift to put high nibble into bits 3-0
        pand    x0, xmask0f             ;Mask high src nibble in bits 3-0
        pand    xtmpa, xmask0f          ;Mask low src nibble in bits 3-0
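        ;; Each GF(2^8) product is two PSHUFB lookups: the low-nibble table
        ;; indexed by xtmpa and the high-nibble table indexed by x0, XORed
        ;; together and then XORed into the destination vector (the
        ;; multiply-and-add). The same pattern repeats for all four outputs.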

        ; dest1
        pshufb  xtmph1, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl1, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph1, xtmpl1          ;GF add high and low partials
        pxor    xd1, xtmph1

        XLDR    xd3, [dest3+pos]        ;Reuse xtmph1, Get next dest vector
        XLDR    xd4, [dest4+pos]        ;Reuse xtmpl1, Get next dest vector

        ; dest2
        pshufb  xtmph2, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl2, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph2, xtmpl2          ;GF add high and low partials
        pxor    xd2, xtmph2

        ; dest3
        pshufb  xtmph3, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl3, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph3, xtmpl3          ;GF add high and low partials
        pxor    xd3, xtmph3

        ; dest4
        pshufb  xtmph4, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl4, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph4, xtmpl4          ;GF add high and low partials
        pxor    xd4, xtmph4

        XSTR    [dest1+pos], xd1        ;Store result
        XSTR    [dest2+pos], xd2        ;Store result
        XSTR    [dest3+pos], xd3        ;Store result
        XSTR    [dest4+pos], xd4        ;Store result

        add     pos, 16                 ;Loop on 16 bytes at a time
        cmp     pos, len
        jle     .loop16

        lea     tmp, [len + 16]
        cmp     pos, tmp
        je      .return_pass

.lessthan16:
        ;; Tail len
        ;; Do one more overlap pass
        mov     tmp, len                ;Overlapped offset length-16

        XLDR    x0, [src+tmp]           ;Get next source vector

        movdqu  xtmph1, [tmp3+16]       ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
        movdqu  xtmpl1, [tmp3]          ;Load array Ax{00}, Ax{01}, Ax{02}, ...
        movdqu  xtmph2, [tmp3+vec+16]   ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
        movdqu  xtmpl2, [tmp3+vec]      ;Load array Bx{00}, Bx{01}, Bx{02}, ...
        movdqu  xtmpl3, [tmp3+2*vec]    ;Load array Cx{00}, Cx{01}, Cx{02}, ...

        XLDR    xd1, [dest1+tmp]        ;Get next dest vector
        XLDR    xd2, [dest2+tmp]        ;Get next dest vector
        XLDR    xtmph4, [dest3+tmp]     ;Reuse xtmph4. Get next dest vector

        sub     len, pos

        movdqa  xtmpl4, [constip16]     ;Load const of i + 16
        pinsrb  xtmph3, len.w, 15
        pshufb  xtmph3, xmask0f         ;Broadcast len to all bytes
        pcmpgtb xtmph3, xtmpl4
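        ;; The pinsrb/pshufb/pcmpgtb sequence builds a per-byte mask in xtmph3:
        ;; len now holds (remaining - 16); it is broadcast to every byte and
        ;; compared (signed) against constip16 = {-1, -2, ..., -16}. Bytes of
        ;; this 16-byte window already handled by the main loop get 0x00 and
        ;; bytes of the unprocessed tail get 0xff, so the PANDs below keep the
        ;; partial products only for bytes not yet updated.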

        XLDR    xtmpl4, [dest4+tmp]     ;Get next dest vector

        movdqa  xtmpa, x0               ;Keep unshifted copy of src
        psraw   x0, 4                   ;Shift to put high nibble into bits 3-0
        pand    x0, xmask0f             ;Mask high src nibble in bits 3-0
        pand    xtmpa, xmask0f          ;Mask low src nibble in bits 3-0

        ; dest1
        pshufb  xtmph1, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl1, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph1, xtmpl1          ;GF add high and low partials
        pand    xtmph1, xtmph3
        pxor    xd1, xtmph1

        ; dest2
        pshufb  xtmph2, x0              ;Lookup mul table of high nibble
        pshufb  xtmpl2, xtmpa           ;Lookup mul table of low nibble
        pxor    xtmph2, xtmpl2          ;GF add high and low partials
        pand    xtmph2, xtmph3
        pxor    xd2, xtmph2

        ; dest3
        pshufb  xgft3_hi, x0            ;Lookup mul table of high nibble
        pshufb  xtmpl3, xtmpa           ;Lookup mul table of low nibble
        pxor    xgft3_hi, xtmpl3        ;GF add high and low partials
        pand    xgft3_hi, xtmph3
        pxor    xtmph4, xgft3_hi

        ; dest4
        pshufb  xgft4_hi, x0            ;Lookup mul table of high nibble
        pshufb  xgft4_lo, xtmpa         ;Lookup mul table of low nibble
        pxor    xgft4_hi, xgft4_lo      ;GF add high and low partials
        pand    xgft4_hi, xtmph3
        pxor    xtmpl4, xgft4_hi

        XSTR    [dest1+tmp], xd1        ;Store result
        XSTR    [dest2+tmp], xd2        ;Store result
        XSTR    [dest3+tmp], xtmph4     ;Store result
        XSTR    [dest4+tmp], xtmpl4     ;Store result

.return_pass:
        FUNC_RESTORE
        mov     return, 0
        ret

.return_fail:
        FUNC_RESTORE
        mov     return, 1
        ret

endproc_frame

section .data

align 16

mask0f:
        dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
        dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
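;;; mask0f: 0x0f in every byte; isolates nibbles and, used as a PSHUFB control,
;;; broadcasts byte 15 to all lanes.
;;; constip16: the signed bytes -1..-16, used to build the tail-pass mask.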

;;; func        core, ver, snum
slversion gf_4vect_mad_sse, 00, 01, 0209