ceph/src/isa-l/erasure_code/gf_5vect_mad_avx.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35 ;;; PS is the size in bytes of one pointer / stack slot; it scales arg(x) and the dest pointer-array offsets.
  36 %define PS 8
  37 ;;; Register map for the win64 output format. arg0-arg3 are the registers that carry the first four arguments.
  38 %ifidn __OUTPUT_FORMAT__, win64
  39  %define arg0  rcx
  40  %define arg0.w ecx             ; low 32 bits of arg0
  41  %define arg1  rdx
  42  %define arg2  r8
  43  %define arg3  r9
  44  %define arg4  r12               ; filled from the stack by FUNC_SAVE (mov arg4, arg(4))
  45  %define arg5  r15               ; filled from the stack by FUNC_SAVE (mov arg5, arg(5))
  46  %define tmp   r11
  47  %define tmp2   r10
  48  %define tmp3   r13              ; saved/restored by FUNC_SAVE / FUNC_RESTORE
  49  %define tmp4   r14              ; saved/restored by FUNC_SAVE / FUNC_RESTORE
  50  %define return rax
  51  %define return.w eax
  52  %define stack_size 16*10 + 5*8  ; ten 16-byte xmm slots + five qword slots; FUNC_SAVE writes only four of the qwords (fifth is presumably alignment padding for movdqa - confirm)
  53  %define arg(x)      [rsp + stack_size + PS + PS*x]  ; x-th qword above the return address once rsp has been lowered by stack_size
  54  %define func(x) proc_frame x    ; function entry is declared through proc_frame (closed by endproc_frame)
  55
  56 %macro FUNC_SAVE 0
  57         alloc_stack     stack_size              ; win64 prologue. Same 'sub rsp, stack_size' as before, but now recorded in the unwind data so the save_reg offsets below are meaningful
  58         save_xmm128     xmm6,  16*0             ; xmm6-xmm15 are spilled with the proc_frame macro (movdqa [rsp+off], reg plus its unwind code)
  59         save_xmm128     xmm7,  16*1
  60         save_xmm128     xmm8,  16*2
  61         save_xmm128     xmm9,  16*3
  62         save_xmm128     xmm10, 16*4
  63         save_xmm128     xmm11, 16*5
  64         save_xmm128     xmm12, 16*6
  65         save_xmm128     xmm13, 16*7
  66         save_xmm128     xmm14, 16*8
  67         save_xmm128     xmm15, 16*9
  68         save_reg        r12,  10*16 + 0*8       ; GPRs used as arg4/arg5/tmp3/tmp4 go above the xmm area
  69         save_reg        r13,  10*16 + 1*8
  70         save_reg        r14,  10*16 + 2*8
  71         save_reg        r15,  10*16 + 3*8
  72         end_prolog                              ; nothing after this point is described to the unwinder
  73         mov     arg4, arg(4)                    ; fifth argument (src) is read from the caller's stack
  74         mov     arg5, arg(5)                    ; sixth argument (dest) is read from the caller's stack
  75 %endmacro
  76
  77 %macro FUNC_RESTORE 0
  78         mov     r15,  [rsp + 10*16 + 3*8]       ; win64 epilogue: reload the four saved GPRs ...
  79         mov     r14,  [rsp + 10*16 + 2*8]
  80         mov     r13,  [rsp + 10*16 + 1*8]
  81         mov     r12,  [rsp + 10*16 + 0*8]
  82         movdqa  xmm15, [rsp+16*9]               ; ... then xmm6-xmm15 from their 16-byte slots
  83         movdqa  xmm14, [rsp+16*8]
  84         movdqa  xmm13, [rsp+16*7]
  85         movdqa  xmm12, [rsp+16*6]
  86         movdqa  xmm11, [rsp+16*5]
  87         movdqa  xmm10, [rsp+16*4]
  88         movdqa  xmm9, [rsp+16*3]
  89         movdqa  xmm8, [rsp+16*2]
  90         movdqa  xmm7, [rsp+16*1]
  91         movdqa  xmm6, [rsp+16*0]
  92         add     rsp, stack_size                 ; release the save area last, after every slot has been read
  93 %endmacro
  94
  95 %elifidn __OUTPUT_FORMAT__, elf64
  96  %define return rax              ; result register and its 32-bit view
  97  %define return.w eax
  98  %define arg0  rdi               ; all six arguments arrive in registers on this target
  99  %define arg0.w edi
 100  %define arg1  rsi
 101  %define arg2  rdx
 102  %define arg3  rcx
 103  %define arg4  r8
 104  %define arg5  r9
 105  %define tmp   r11               ; scratch registers that need no saving here
 106  %define tmp2   r10
 107  %define tmp3   r12              ; pushed/popped by FUNC_SAVE / FUNC_RESTORE below
 108  %define tmp4   r13              ; pushed/popped by FUNC_SAVE / FUNC_RESTORE below
 109 ;;; Entry is a plain label on elf64; the prologue/epilogue only preserve r12 and r13.
 110  %define func(x) x:
 111  %macro FUNC_SAVE 0
 112         push    r12             ; tmp3
 113         push    r13             ; tmp4
 114  %endmacro
 115  %macro FUNC_RESTORE 0
 116         pop     r13             ; reverse order of FUNC_SAVE
 117         pop     r12
 118  %endmacro
 119 %endif
 120
 121 ;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest) - symbolic names for the six arguments and the derived pointers
 122 %define len   arg0              ; byte count; the function subtracts 16 from it on entry
 123 %define len.w arg0.w            ; 32-bit view, used for vpinsrb in the tail pass
 124 %define vec    arg1             ; scaled by 32 in the function and then used as the stride between per-destination tables
 125 %define vec_i    arg2           ; scaled by 32 in the function to select the table block inside mul_array
 126 %define mul_array arg3          ; base of the lookup tables
 127 %define src   arg4              ; source buffer
 128 %define dest1  arg5             ; on entry: address of an array of five pointers; replaced by its first element
 129 %define pos   return            ; running byte offset lives in the return register
 130 %define pos.w return.w
 131
 132 %define dest2 tmp4
 133 %define dest3 mul_array         ; reuses the mul_array register: only valid once tmp3 holds the table base
 134 %define dest4 tmp2
 135 %define dest5 vec_i             ; reuses the vec_i register: only valid once vec_i has been folded into tmp3
 136
 137
 138 %ifndef EC_ALIGNED_ADDR
 139 ;;; EC_ALIGNED_ADDR not defined: use unaligned load/store for the src/dest buffers
 140  %define XLDR vmovdqu
 141  %define XSTR vmovdqu
 142 %else
 143 ;;; EC_ALIGNED_ADDR defined: aligned load/store (when NO_NT_LDST is defined), otherwise non-temporal load/store
 144  %ifdef NO_NT_LDST
 145   %define XLDR vmovdqa
 146   %define XSTR vmovdqa
 147  %else
 148   %define XLDR vmovntdqa
 149   %define XSTR vmovntdq
 150  %endif
 151 %endif
 152
 153 default rel
 154
 155 [bits 64]
 156 section .text
 157 ;;; Registers that stay loaded for the whole call (set up once before .loop16)
 158 %define xmask0f  xmm15           ; 0x0f in every byte
 159 %define xgft5_hi xmm14           ; high-nibble table for dest5
 160 %define xgft4_lo xmm13           ; low-nibble table for dest4
 161 %define xgft4_hi xmm12           ; high-nibble table for dest4
 162 ;;; Per-iteration registers; the tables for dest1-dest3 and the dest5 low table are reloaded from memory on every pass
 163 %define x0      xmm0             ; source block, later its high nibbles
 164 %define xtmpa   xmm1             ; low nibbles of the source block
 165 %define xtmph1  xmm2
 166 %define xtmpl1  xmm3
 167 %define xtmph2  xmm4
 168 %define xtmpl2  xmm5
 169 %define xtmph3  xmm6
 170 %define xtmpl3  xmm7
 171 %define xtmph5  xmm8             ; doubles as the byte mask in the tail pass
 172 %define xtmpl5  xmm9
 173 %define xd1     xmm10
 174 %define xd2     xmm11
 175 %define xd3     xtmpl1           ; shares xmm3: loaded only after the dest1 lookup has consumed xtmpl1
 176 %define xd4     xtmph1           ; shares xmm2: loaded only after the dest1 result has been folded into xd1
 177 %define xd5     xtmpl2           ; shares xmm5: loaded only after the dest2 lookup has consumed xtmpl2
 178 ;;; gf_5vect_mad_avx: for every 16-byte block of src, look up five GF products (tables A..E selected by vec_i) and XOR them into the five buffers whose pointers are stored at dest.
 179 ;;; Returns 0 on success and 1 when len < 16. A length that is not a multiple of 16 is finished by one extra pass over the last 16 bytes, masked so that no byte is updated twice.
 180 align 16
 181 global gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
 182 func(gf_5vect_mad_avx)
 183         FUNC_SAVE
 184         sub     len, 16                 ;len now holds (length - 16): offset of the last full 16-byte window
 185         jl      .return_fail            ;Fewer than 16 bytes: nothing is processed, return 1
 186         xor     pos, pos
 187         vmovdqa xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 188         mov     tmp, vec
 189         sal     vec_i, 5                ;Multiply by 32
 190         lea     tmp3, [mul_array + vec_i]       ;tmp3 = mul_array + 32*vec_i, base of the A tables (low 16 bytes, then high 16 bytes)
 191         sal     tmp, 6                  ;Multiply by 64
 192         vmovdqu xgft5_hi, [tmp3+2*tmp+16]       ;Load array Ex{00}, Ex{10}, ..., Ex{f0} (at tmp3 + 4*32*vec + 16)
 193         sal     vec, 5                  ;Multiply by 32
 194         add     tmp, vec                ;tmp = 96*vec = 3*32*vec, offset of the D tables
 195         vmovdqu xgft4_hi, [tmp3+tmp+16] ;Load array Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
 196         vmovdqu xgft4_lo, [tmp3+tmp]    ;Load array Dx{00}, Dx{01}, Dx{02}, ...
 197         ;; Fetch the five output pointers from the array at dest1; dest1 itself is overwritten last because it is the array base
 198         mov     dest3, [dest1+2*PS]     ; reuse mul_array
 199         mov     dest4, [dest1+3*PS]
 200         mov     dest5, [dest1+4*PS]     ; reuse vec_i
 201         mov     dest2, [dest1+PS]
 202         mov     dest1, [dest1]
 203
 204 .loop16:                                ;Main loop: one full 16-byte block at offset pos per iteration
 205         XLDR    x0, [src+pos]           ;Get next source vector
 206
 207         vmovdqu xtmph1, [tmp3+16]       ;Load array Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
 208         vmovdqu xtmpl1, [tmp3]          ;Load array Ax{00}, Ax{01}, Ax{02}, ...
 209         vmovdqu xtmph2, [tmp3+vec+16]   ;Load array Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
 210         vmovdqu xtmpl2, [tmp3+vec]      ;Load array Bx{00}, Bx{01}, Bx{02}, ...
 211         vmovdqu xtmph3, [tmp3+2*vec+16] ;Load array Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
 212         vmovdqu xtmpl3, [tmp3+2*vec]    ;Load array Cx{00}, Cx{01}, Cx{02}, ...
 213         vmovdqu xtmpl5, [tmp3+4*vec]    ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
 214
 215         XLDR    xd1, [dest1+pos]        ;Get next dest vector
 216         XLDR    xd2, [dest2+pos]        ;Get next dest vector
 217
 218         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 3-0
 219         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0 (word shift; stray bits are cleared by the mask below)
 220         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 221
 222         ; dest1
 223         vpshufb xtmph1, xtmph1, x0              ;Lookup mul table of high nibble
 224         vpshufb xtmpl1, xtmpl1, xtmpa           ;Lookup mul table of low nibble
 225         vpxor   xtmph1, xtmph1, xtmpl1          ;GF add high and low partials
 226         vpxor   xd1, xd1, xtmph1                ;Accumulate the product into dest1 data
 227
 228         XLDR    xd3, [dest3+pos]        ;Reuse xtmpl1, Get next dest vector
 229         XLDR    xd4, [dest4+pos]        ;Reuse xtmph1, Get next dest vector
 230
 231         ; dest2
 232         vpshufb xtmph2, xtmph2, x0              ;Lookup mul table of high nibble
 233         vpshufb xtmpl2, xtmpl2, xtmpa           ;Lookup mul table of low nibble
 234         vpxor   xtmph2, xtmph2, xtmpl2          ;GF add high and low partials
 235         vpxor   xd2, xd2, xtmph2                ;Accumulate the product into dest2 data
 236
 237         XLDR    xd5, [dest5+pos]        ;Reuse xtmpl2. Get next dest vector
 238
 239         ; dest3
 240         vpshufb xtmph3, xtmph3, x0              ;Lookup mul table of high nibble
 241         vpshufb xtmpl3, xtmpl3, xtmpa           ;Lookup mul table of low nibble
 242         vpxor   xtmph3, xtmph3, xtmpl3          ;GF add high and low partials
 243         vpxor   xd3, xd3, xtmph3                ;Accumulate the product into dest3 data
 244
 245         ; dest4 (tables stay in xgft4_hi/xgft4_lo; xtmph2 and xtmpl3 are free by now and serve as scratch)
 246         vpshufb xtmph2, xgft4_hi, x0            ;Lookup mul table of high nibble
 247         vpshufb xtmpl3, xgft4_lo, xtmpa         ;Lookup mul table of low nibble
 248         vpxor   xtmph2, xtmph2, xtmpl3          ;GF add high and low partials
 249         vpxor   xd4, xd4, xtmph2                ;Accumulate the product into dest4 data
 250
 251         ; dest5 (high table stays in xgft5_hi; low table was reloaded into xtmpl5 above)
 252         vpshufb xtmph5, xgft5_hi, x0            ;Lookup mul table of high nibble
 253         vpshufb xtmpl5, xtmpl5, xtmpa           ;Lookup mul table of low nibble
 254         vpxor   xtmph5, xtmph5, xtmpl5  ;GF add high and low partials
 255         vpxor   xd5, xd5, xtmph5                ;Accumulate the product into dest5 data
 256
 257         XSTR    [dest1+pos], xd1        ;Store result into dest1
 258         XSTR    [dest2+pos], xd2        ;Store result into dest2
 259         XSTR    [dest3+pos], xd3        ;Store result into dest3
 260         XSTR    [dest4+pos], xd4        ;Store result into dest4
 261         XSTR    [dest5+pos], xd5        ;Store result into dest5
 262
 263         add     pos, 16                 ;Loop on 16 bytes at a time
 264         cmp     pos, len                ;len is (length - 16): continue while another full block fits
 265         jle     .loop16
 266
 267         lea     tmp, [len + 16]         ;tmp = original length
 268         cmp     pos, tmp
 269         je      .return_pass            ;Length was a multiple of 16: no tail to handle
 270
 271 .lessthan16:
 272         ;; Tail: between 1 and 15 bytes are left.
 273         ;; Do one more pass over the last 16 bytes of the buffers; it overlaps bytes that were already updated, so the products are masked below.
 274         mov     tmp, len        ;Overlapped offset length-16
 275         XLDR    x0, [src+tmp]           ;Get next source vector (NOTE(review): this offset is not a multiple of 16 here; presumably EC_ALIGNED_ADDR builds never reach the tail - confirm)
 276
 277         sub     len, pos                ;len = (length-16) - pos, a value in -15..-1; -len bytes of the window are already done
 278
 279         vmovdqa xtmph1, [constip16]     ;Load const: byte i holds -(i+1), i.e. 0xff, 0xfe, ..., 0xf0
 280         vpinsrb xtmph5, len.w, 15       ;Put the low byte of len into byte 15
 281         vpshufb xtmph5, xmask0f         ;Broadcast len to all bytes (every control byte 0x0f selects byte 15)
 282         vpcmpgtb        xtmph5, xtmph5, xtmph1  ;Signed compare len > -(i+1): 0xff for the bytes not yet processed, 0x00 for the overlap
 283
 284         vmovdqu xtmph1, [tmp3+16]       ;Load array Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
 285         vmovdqu xtmpl1, [tmp3]          ;Load array Ax{00}, Ax{01}, Ax{02}, ...
 286         vmovdqu xtmph2, [tmp3+vec+16]   ;Load array Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
 287         vmovdqu xtmpl2, [tmp3+vec]      ;Load array Bx{00}, Bx{01}, Bx{02}, ...
 288         vmovdqu xtmph3, [tmp3+2*vec+16] ;Load array Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
 289         vmovdqu xtmpl3, [tmp3+2*vec]    ;Load array Cx{00}, Cx{01}, Cx{02}, ...
 290         vmovdqu xtmpl5, [tmp3+4*vec]    ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
 291
 292         XLDR    xd1, [dest1+tmp]        ;Get next dest vector
 293         XLDR    xd2, [dest2+tmp]        ;Get next dest vector
 294
 295         vpand   xtmpa, x0, xmask0f      ;Mask low src nibble in bits 3-0
 296         vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 3-0
 297         vpand   x0, x0, xmask0f         ;Mask high src nibble in bits 3-0
 298
 299         ; dest1
 300         vpshufb xtmph1, xtmph1, x0              ;Lookup mul table of high nibble
 301         vpshufb xtmpl1, xtmpl1, xtmpa           ;Lookup mul table of low nibble
 302         vpxor   xtmph1, xtmph1, xtmpl1          ;GF add high and low partials
 303         vpand   xtmph1, xtmph1, xtmph5          ;Zero the product for bytes the main loop already updated
 304         vpxor   xd1, xd1, xtmph1                ;XOR with zero leaves those bytes unchanged
 305
 306         XLDR    xd3, [dest3+tmp]        ;Reuse xtmpl1, Get next dest vector
 307         XLDR    xd4, [dest4+tmp]        ;Reuse xtmph1, Get next dest vector
 308
 309         ; dest2
 310         vpshufb xtmph2, xtmph2, x0              ;Lookup mul table of high nibble
 311         vpshufb xtmpl2, xtmpl2, xtmpa           ;Lookup mul table of low nibble
 312         vpxor   xtmph2, xtmph2, xtmpl2          ;GF add high and low partials
 313         vpand   xtmph2, xtmph2, xtmph5          ;Keep only the not-yet-processed bytes
 314         vpxor   xd2, xd2, xtmph2
 315
 316         XLDR    xd5, [dest5+tmp]        ;Reuse xtmpl2. Get next dest vector
 317
 318         ; dest3
 319         vpshufb xtmph3, xtmph3, x0              ;Lookup mul table of high nibble
 320         vpshufb xtmpl3, xtmpl3, xtmpa           ;Lookup mul table of low nibble
 321         vpxor   xtmph3, xtmph3, xtmpl3          ;GF add high and low partials
 322         vpand   xtmph3, xtmph3, xtmph5          ;Keep only the not-yet-processed bytes
 323         vpxor   xd3, xd3, xtmph3
 324
 325         ; dest4 (last use of the D tables, so they are overwritten in place)
 326         vpshufb xgft4_hi, xgft4_hi, x0          ;Lookup mul table of high nibble
 327         vpshufb xgft4_lo, xgft4_lo, xtmpa               ;Lookup mul table of low nibble
 328         vpxor   xgft4_hi, xgft4_hi, xgft4_lo            ;GF add high and low partials
 329         vpand   xgft4_hi, xgft4_hi, xtmph5      ;Keep only the not-yet-processed bytes
 330         vpxor   xd4, xd4, xgft4_hi
 331
 332         ; dest5 (last use of the E high table, so it is overwritten in place)
 333         vpshufb xgft5_hi, xgft5_hi, x0          ;Lookup mul table of high nibble
 334         vpshufb xtmpl5, xtmpl5, xtmpa           ;Lookup mul table of low nibble
 335         vpxor   xgft5_hi, xgft5_hi, xtmpl5      ;GF add high and low partials
 336         vpand   xgft5_hi, xgft5_hi, xtmph5      ;Keep only the not-yet-processed bytes
 337         vpxor   xd5, xd5, xgft5_hi
 338
 339         XSTR    [dest1+tmp], xd1        ;Store result into dest1
 340         XSTR    [dest2+tmp], xd2        ;Store result into dest2
 341         XSTR    [dest3+tmp], xd3        ;Store result into dest3
 342         XSTR    [dest4+tmp], xd4        ;Store result into dest4
 343         XSTR    [dest5+tmp], xd5        ;Store result into dest5
 344
 345 .return_pass:
 346         FUNC_RESTORE
 347         mov     return, 0               ;Success
 348         ret
 349
 350 .return_fail:
 351         FUNC_RESTORE
 352         mov     return, 1               ;len was below 16; no buffer has been touched
 353         ret
 354
 355 endproc_frame                           ;Closes the proc_frame opened by func() on win64 (NOTE(review): on elf64 this name must be defined elsewhere, presumably reg_sizes.asm - confirm)
 356
 357 section .data
 358
 359 align 16
 360 mask0f: times 16 db 0x0f                ; low-nibble mask; also serves as the "select byte 15" shuffle control in the tail pass
 361 constip16:                              ; byte i holds -(i+1); compared against the broadcast tail count to build the tail byte mask
 362         db 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
 363
 364 ;;;       func             core, ver, snum
 365 slversion gf_5vect_mad_avx, 02,  01,  020d