ceph/src/isa-l/erasure_code/gf_3vect_mad_avx.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
     ;;; PS: size in bytes of one pointer-sized slot. Used in this file to step
     ;;; through the win64 stack argument slots and through the array of
     ;;; destination pointers passed in 'dest'.
  36 %define PS 8
  37
  38 %ifidn __OUTPUT_FORMAT__, win64
     ;;; win64 output: the first four arguments are mapped to rcx, rdx, r8, r9.
     ;;; arg4 and arg5 are read from the stack into r12 and r15 by FUNC_SAVE,
     ;;; so those two registers are saved and restored around the function body.
  39  %define arg0  rcx
  40  %define arg0.w ecx
  41  %define arg1  rdx
  42  %define arg2  r8
  43  %define arg3  r9
  44  %define arg4  r12
  45  %define arg5  r15
  46  %define tmp   r11
  47  %define return rax
  48  %define return.w eax
     ;;; Local frame: ten 16-byte slots for xmm6-xmm15 followed by three 8-byte
     ;;; slots. Only the first two 8-byte slots are written (r12, r15); the
     ;;; third is presumably padding that keeps rsp 16-byte aligned for the
     ;;; vmovdqa saves below - NOTE(review): confirm.
  49  %define stack_size 16*10 + 3*8
     ;;; arg(x): stack slot of argument x as seen after 'sub rsp, stack_size'.
     ;;; rsp + stack_size is the value of rsp on entry; one further PS-sized
     ;;; slot is skipped (the return address under the Microsoft x64 convention)
     ;;; and the slots are then indexed by argument number.
  50  %define arg(x)      [rsp + stack_size + PS + PS*x]
     ;;; func(x) opens the function via proc_frame, which is not defined in
     ;;; this file (presumably supplied by reg_sizes.asm or the assembler).
  51  %define func(x) proc_frame x
  52
     ;;; FUNC_SAVE: allocate the local frame, store xmm6-xmm15 and r12/r15 in it,
     ;;; then load arg4 and arg5 from their stack slots.
     ;;; save_reg and end_prolog are not defined in this file.
  53 %macro FUNC_SAVE 0
  54         sub     rsp, stack_size
  55         vmovdqa [rsp+16*0],xmm6
  56         vmovdqa [rsp+16*1],xmm7
  57         vmovdqa [rsp+16*2],xmm8
  58         vmovdqa [rsp+16*3],xmm9
  59         vmovdqa [rsp+16*4],xmm10
  60         vmovdqa [rsp+16*5],xmm11
  61         vmovdqa [rsp+16*6],xmm12
  62         vmovdqa [rsp+16*7],xmm13
  63         vmovdqa [rsp+16*8],xmm14
  64         vmovdqa [rsp+16*9],xmm15
  65         save_reg        r12,  10*16 + 0*8
  66         save_reg        r15,  10*16 + 1*8
  67         end_prolog
  68         mov     arg4, arg(4)            ;arg4 (r12) = fifth argument, read from the stack
  69         mov     arg5, arg(5)            ;arg5 (r15) = sixth argument, read from the stack
  70 %endmacro
  71
     ;;; FUNC_RESTORE: reload xmm6-xmm15, r12 and r15 from the local frame and
     ;;; release the frame. Must mirror the offsets used by FUNC_SAVE.
  72 %macro FUNC_RESTORE 0
  73         vmovdqa xmm6, [rsp+16*0]
  74         vmovdqa xmm7, [rsp+16*1]
  75         vmovdqa xmm8, [rsp+16*2]
  76         vmovdqa xmm9, [rsp+16*3]
  77         vmovdqa xmm10, [rsp+16*4]
  78         vmovdqa xmm11, [rsp+16*5]
  79         vmovdqa xmm12, [rsp+16*6]
  80         vmovdqa xmm13, [rsp+16*7]
  81         vmovdqa xmm14, [rsp+16*8]
  82         vmovdqa xmm15, [rsp+16*9]
  83         mov     r12,  [rsp + 10*16 + 0*8]
  84         mov     r15,  [rsp + 10*16 + 1*8]
  85         add     rsp, stack_size
  86 %endmacro
  87
  88 %elifidn __OUTPUT_FORMAT__, elf64
     ;;; elf64 output: all six arguments are mapped to registers
     ;;; (rdi, rsi, rdx, rcx, r8, r9), so no frame is set up and
     ;;; FUNC_SAVE / FUNC_RESTORE expand to nothing.
  89  %define arg0  rdi
  90  %define arg0.w edi
  91  %define arg1  rsi
  92  %define arg2  rdx
  93  %define arg3  rcx
  94  %define arg4  r8
  95  %define arg5  r9
  96  %define tmp   r11
  97  %define return rax
  98  %define return.w eax
  99
 100  %define func(x) x:
 101  %define FUNC_SAVE
 102  %define FUNC_RESTORE
 103 %endif
 104
 105 ;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
     ;;; Symbolic names for the six arguments, in call order. 'dest' arrives as
     ;;; dest1 and is later replaced by the first destination pointer it points to.
 106 %define len   arg0
 107 %define len.w arg0.w
 108 %define vec    arg1
 109 %define vec_i    arg2
 110 %define mul_array arg3
 111 %define src   arg4
 112 %define dest1 arg5
     ;;; pos (the running byte offset) lives in the return register; the function
     ;;; sets the return value explicitly just before each 'ret'.
 113 %define pos   return
 114 %define pos.w return.w
 115
     ;;; dest2 and dest3 reuse the mul_array and vec_i registers. They may only be
     ;;; written after every use of mul_array / vec_i (i.e. after the tables are loaded).
 116 %define dest2 mul_array
 117 %define dest3 vec_i
 118
 119 %ifndef EC_ALIGNED_ADDR
 120 ;;; Default: use unaligned load/store (valid for any buffer address)
 121  %define XLDR vmovdqu
 122  %define XSTR vmovdqu
 123 %else
 124 ;;; EC_ALIGNED_ADDR defined: use aligned load/store if NO_NT_LDST is defined, otherwise non-temporal load/store
 125  %ifdef NO_NT_LDST
 126   %define XLDR vmovdqa
 127   %define XSTR vmovdqa
 128  %else
 129   %define XLDR vmovntdqa
 130   %define XSTR vmovntdq
 131  %endif
 132 %endif
 133
 134
 135 default rel
 136
 137 [bits 64]
 138 section .text
 139
     ;;; Registers that stay loaded for the whole main loop: the nibble mask and
     ;;; the low-nibble / high-nibble lookup table for each of the three destinations.
 140 %define xmask0f  xmm15
 141 %define xgft1_lo  xmm14
 142 %define xgft1_hi  xmm13
 143 %define xgft2_lo  xmm12
 144 %define xgft2_hi  xmm11
 145 %define xgft3_lo  xmm10
 146 %define xgft3_hi  xmm9
 147
     ;;; Per-iteration working registers: x0 holds the source vector (later its
     ;;; high nibbles), xtmpa the low nibbles, xtmph*/xtmpl* the table lookups.
 148 %define x0      xmm0
 149 %define xtmpa   xmm1
 150 %define xtmph1  xmm2
 151 %define xtmpl1  xmm3
 152 %define xtmph2  xmm4
 153 %define xtmpl2  xmm5
 154 %define xtmph3  xmm6
 155 %define xtmpl3  xmm7
 156 %define xd1     xmm8
     ;;; xd2 and xd3 share registers with xtmpl1 and xtmph1, so they may only be
     ;;; loaded once the dest1 partial has been folded into xd1.
 157 %define xd2     xtmpl1
 158 %define xd3     xtmph1
 159
 160 align 16
     ;;;
     ;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
     ;;;
     ;;; For every source byte, looks up one product per destination in three
     ;;; 32-byte tables (16 low-nibble entries followed by 16 high-nibble
     ;;; entries) and XORs it into the matching byte of three destination buffers.
     ;;;
     ;;;   len       - number of bytes to process; must be >= 16 (signed compare)
     ;;;   vec       - table stride: table k (k = 0..2) starts at
     ;;;               mul_array + 32*(vec_i + k*vec)
     ;;;   vec_i     - index of the first table
     ;;;   mul_array - base address of the tables
     ;;;   src       - source buffer
     ;;;   dest      - address of an array of three destination pointers
     ;;;
     ;;; Returns 0 on success and 1 when len < 16 (nothing is written then).
     ;;;
     ;;; The buffers are processed 16 bytes at a time. A length that is not a
     ;;; multiple of 16 is completed by one extra pass over the last 16 bytes,
     ;;; with the bytes already handled by the main loop masked out.
     ;;;
     ;;; Overwrites the len, vec, vec_i, mul_array and dest argument registers,
     ;;; tmp, the return register and xmm0-xmm15. In the win64 build
     ;;; FUNC_SAVE / FUNC_RESTORE save and restore xmm6-xmm15, r12 and r15.
     ;;;
 161 global gf_3vect_mad_avx:function
 162 func(gf_3vect_mad_avx)
 163         FUNC_SAVE
 164         sub     len, 16                 ;len = length-16, offset of the last full 16-byte vector
 165         jl      .return_fail            ;Fewer than 16 bytes: fail without touching memory
 166         xor     pos, pos
 167         vmovdqa xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 168
 169         sal     vec_i, 5                ;Multiply by 32
 170         sal     vec, 5                  ;Table stride in bytes (32 bytes per table entry)
 171         lea     tmp, [mul_array + vec_i]
 172         vmovdqu xgft1_lo, [tmp]         ;Load array Ax{00}, Ax{01}, Ax{02}, ...
 173         vmovdqu xgft1_hi, [tmp+16]      ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
 174         vmovdqu xgft2_lo, [tmp+vec]     ;Load array Bx{00}, Bx{01}, Bx{02}, ...
 175         vmovdqu xgft2_hi, [tmp+vec+16]  ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
 176         vmovdqu xgft3_lo, [tmp+2*vec]   ;Load array Cx{00}, Cx{01}, Cx{02}, ...
 177         vmovdqu xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
     ;;; All tables are loaded, so the mul_array and vec_i registers can now be
     ;;; reused for destination pointers. dest1 is loaded last because it is
     ;;; also the base of the pointer array.
 178         mov     dest2, [dest1+PS]       ; reuse mul_array
 179         mov     dest3, [dest1+2*PS]     ; reuse vec_i
 180         mov     dest1, [dest1]
 181
 182 .loop16:
 183         XLDR    x0, [src+pos]           ;Get next source vector
 184         XLDR    xd1, [dest1+pos]                ;Get next dest vector
 185
 186         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
 187         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
 188         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 4-0
 189
 190         ; dest1
 191         vpshufb xtmph1, xgft1_hi, x0            ;Lookup mul table of high nibble
 192         vpshufb xtmpl1, xgft1_lo, xtmpa         ;Lookup mul table of low nibble
 193         vpxor   xtmph1, xtmph1, xtmpl1          ;GF add high and low partials
 194         vpxor   xd1, xd1, xtmph1                ;xd1 += partial
 195
     ;;; xtmpl1 / xtmph1 are free again from here on, so they can hold xd2 / xd3.
 196         XLDR    xd2, [dest2+pos]        ;reuse xtmpl1. Get next dest vector
 197         XLDR    xd3, [dest3+pos]        ;reuse xtmph1. Get next dest vector
 198
 199         ; dest2
 200         vpshufb xtmph2, xgft2_hi, x0            ;Lookup mul table of high nibble
 201         vpshufb xtmpl2, xgft2_lo, xtmpa         ;Lookup mul table of low nibble
 202         vpxor   xtmph2, xtmph2, xtmpl2          ;GF add high and low partials
 203         vpxor   xd2, xd2, xtmph2                ;xd2 += partial
 204
 205         ; dest3
 206         vpshufb xtmph3, xgft3_hi, x0            ;Lookup mul table of high nibble
 207         vpshufb xtmpl3, xgft3_lo, xtmpa         ;Lookup mul table of low nibble
 208         vpxor   xtmph3, xtmph3, xtmpl3          ;GF add high and low partials
 209         vpxor   xd3, xd3, xtmph3                ;xd3 += partial
 210
 211         XSTR    [dest1+pos], xd1
 212         XSTR    [dest2+pos], xd2
 213         XSTR    [dest3+pos], xd3
 214
 215         add     pos, 16                 ;Loop on 16 bytes at a time
 216         cmp     pos, len
 217         jle     .loop16                 ;Continue while a full vector still fits (pos <= length-16)
 218
 219         lea     tmp, [len + 16]         ;tmp = original length
 220         cmp     pos, tmp
 221         je      .return_pass            ;Length was a multiple of 16: no tail
 222
 223 .lessthan16:
 224         ;; Tail len
 225         ;; Do one more overlap pass
     ;;; The tail is reached only when the length is not a multiple of 16, so
     ;;; the offset length-16 is never 16-byte aligned relative to the buffer
     ;;; starts. The aligned / non-temporal XLDR and XSTR variants would fault
     ;;; here, so the tail always uses unaligned moves.
 226         mov     tmp, len                ;Overlapped offset length-16
 227         vmovdqu x0, [src+tmp]           ;Get next source vector
 228         vmovdqu xd1, [dest1+tmp]        ;Get next dest vector
 229         vmovdqu xd2, [dest2+tmp]        ;reuse xtmpl1. Get next dest vector
 230         vmovdqu xd3, [dest3+tmp]        ;reuse xtmph1. Get next dest vector
 231
 232         sub     len, pos                ;len = (length-16) - pos, in the range -15..-1
 233
     ;;; Build a byte mask that is 0xff only for the trailing bytes that the
     ;;; main loop has not processed yet. VEX-encoded load: keeps the routine
     ;;; free of legacy SSE instructions.
 234         vmovdqa xtmph3, [constip16]     ;Load const: byte i = -(i+1)
 235         vpinsrb xtmpl3, xtmpl3, len.w, 15       ;Low byte of len into byte 15
 236         vpshufb xtmpl3, xtmpl3, xmask0f         ;Broadcast len to all bytes
 237         vpcmpgtb        xtmpl3, xtmpl3, xtmph3  ;Byte i = 0xff where len > -(i+1) (signed)
 238
 239         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
 240         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
 241         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 4-0
 242
     ;;; The table registers are overwritten in place below; they are not needed
     ;;; again after this final pass.
 243         ; dest1
 244         vpshufb xgft1_hi, xgft1_hi, x0          ;Lookup mul table of high nibble
 245         vpshufb xgft1_lo, xgft1_lo, xtmpa       ;Lookup mul table of low nibble
 246         vpxor   xgft1_hi, xgft1_hi, xgft1_lo    ;GF add high and low partials
 247         vpand   xgft1_hi, xgft1_hi, xtmpl3      ;Zero the partials of already-processed bytes
 248         vpxor   xd1, xd1, xgft1_hi
 249
 250         ; dest2
 251         vpshufb xgft2_hi, xgft2_hi, x0          ;Lookup mul table of high nibble
 252         vpshufb xgft2_lo, xgft2_lo, xtmpa       ;Lookup mul table of low nibble
 253         vpxor   xgft2_hi, xgft2_hi, xgft2_lo    ;GF add high and low partials
 254         vpand   xgft2_hi, xgft2_hi, xtmpl3      ;Zero the partials of already-processed bytes
 255         vpxor   xd2, xd2, xgft2_hi
 256
 257         ; dest3
 258         vpshufb xgft3_hi, xgft3_hi, x0          ;Lookup mul table of high nibble
 259         vpshufb xgft3_lo, xgft3_lo, xtmpa       ;Lookup mul table of low nibble
 260         vpxor   xgft3_hi, xgft3_hi, xgft3_lo    ;GF add high and low partials
 261         vpand   xgft3_hi, xgft3_hi, xtmpl3      ;Zero the partials of already-processed bytes
 262         vpxor   xd3, xd3, xgft3_hi
 263
 264         vmovdqu [dest1+tmp], xd1
 265         vmovdqu [dest2+tmp], xd2
 266         vmovdqu [dest3+tmp], xd3
 267
 268 .return_pass:
 269         mov     return, 0
 270         FUNC_RESTORE
 271         ret
 272
 273 .return_fail:
 274         mov     return, 1
 275         FUNC_RESTORE
 276         ret
 277
 278 endproc_frame
 279
 280 section .data
 281
     ;;; Both constants are read with vmovdqa/movdqa, so they must stay 16-byte aligned.
 282 align 16
     ;;; mask0f: 0x0f in every byte. Used to isolate nibbles, and also as the
     ;;; vpshufb control that broadcasts byte 15 to all lanes in the tail pass.
 283 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
     ;;; constip16: byte i holds 0xff - i, i.e. -(i+1) as a signed byte
     ;;; (0xff in byte 0 down to 0xf0 in byte 15). Compared against the
     ;;; broadcast negative tail count to build the tail byte mask.
 284 constip16:
 285         dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
 286
     ;;; slversion is not defined in this file (presumably a macro from
     ;;; reg_sizes.asm); the column header below names its arguments.
 287 ;;;       func             core, ver, snum
 288 slversion gf_3vect_mad_avx, 02,  01,  0207