ceph/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
  32 ;;;
  33
     ;;; reg_sizes.asm is a project include that is not visible from this
     ;;; file; it presumably supplies helper macros used further down (for
     ;;; example slversion) -- confirm.
  34 %include "reg_sizes.asm"
  35
     ;;; PS is the 8-byte step used both between stack argument slots (see
     ;;; arg(x)) and between the two pointers read from the dest array.
  36 %define PS 8
  37
     ;;; Windows x64: the first four arguments arrive in rcx, rdx, r8 and r9.
     ;;; arg4 and arg5 are passed on the stack; FUNC_SAVE loads them into
     ;;; r12 and r15 after saving those two registers.
  38 %ifidn __OUTPUT_FORMAT__, win64
  39  %define arg0  rcx
  40  %define arg0.w ecx
  41  %define arg1  rdx
  42  %define arg2  r8
  43  %define arg3  r9
  44  %define arg4  r12
  45  %define arg5  r15
     ;;; Scratch registers.
  46  %define tmp   r11
  47  %define tmp2   r10
  48  %define return rax
  49  %define return.w eax
     ;;; Frame layout used by FUNC_SAVE/FUNC_RESTORE: nine 16-byte slots for
     ;;; xmm6-xmm14 followed by three 8-byte slots, of which two hold r12 and
     ;;; r15.  NOTE(review): the third 8-byte slot looks like padding that
     ;;; makes rsp 16-byte aligned for the movdqa saves -- confirm.
  50  %define stack_size 16*9 + 3*8
     ;;; Stack-passed argument x, addressed after rsp has been lowered by
     ;;; stack_size.  The extra PS presumably skips the return address, and
     ;;; slots 0-3 would then be the caller's 32-byte shadow space -- confirm.
  51  %define arg(x)      [rsp + stack_size + PS + PS*x]
     ;;; proc_frame is not defined in this file; it is closed by the
     ;;; endproc_frame at the end of the function.
  52  %define func(x) proc_frame x
  53
     ;;; FUNC_SAVE (win64 only): reserve stack_size bytes, store xmm6-xmm14
     ;;; and r12/r15 in that area, then load the fifth and sixth arguments
     ;;; from the caller's stack into arg4/arg5 (r12/r15).
     ;;; NOTE(review): save_reg and end_prolog are not defined in this file.
     ;;; The stack adjustment and the xmm stores are plain instructions, so
     ;;; whether they are described in unwind data cannot be told from here.
  54 %macro FUNC_SAVE 0
  55         sub     rsp, stack_size
     ;;; movdqa requires a 16-byte aligned address, so rsp must be 16-byte
     ;;; aligned once stack_size has been subtracted.
  56         movdqa  [rsp+16*0],xmm6
  57         movdqa  [rsp+16*1],xmm7
  58         movdqa  [rsp+16*2],xmm8
  59         movdqa  [rsp+16*3],xmm9
  60         movdqa  [rsp+16*4],xmm10
  61         movdqa  [rsp+16*5],xmm11
  62         movdqa  [rsp+16*6],xmm12
  63         movdqa  [rsp+16*7],xmm13
  64         movdqa  [rsp+16*8],xmm14
     ;;; The offsets below match the ones FUNC_RESTORE reads r12/r15 from.
  65         save_reg        r12,  9*16 + 0*8
  66         save_reg        r15,  9*16 + 1*8
  67         end_prolog
     ;;; r12/r15 are overwritten only after their old values were stored.
  68         mov     arg4, arg(4)
  69         mov     arg5, arg(5)
  70 %endmacro
  71
     ;;; FUNC_RESTORE (win64 only): reload xmm6-xmm14 and r12/r15 from the
     ;;; slots written by FUNC_SAVE, then release the stack_size bytes.
     ;;; It is expanded on both exit paths of the function, before rax is
     ;;; set to the return code.
  72 %macro FUNC_RESTORE 0
  73         movdqa  xmm6, [rsp+16*0]
  74         movdqa  xmm7, [rsp+16*1]
  75         movdqa  xmm8, [rsp+16*2]
  76         movdqa  xmm9, [rsp+16*3]
  77         movdqa  xmm10, [rsp+16*4]
  78         movdqa  xmm11, [rsp+16*5]
  79         movdqa  xmm12, [rsp+16*6]
  80         movdqa  xmm13, [rsp+16*7]
  81         movdqa  xmm14, [rsp+16*8]
  82         mov     r12,  [rsp + 9*16 + 0*8]
  83         mov     r15,  [rsp + 9*16 + 1*8]
  84         add     rsp, stack_size
  85 %endmacro
  86
     ;;; ELF64 (System V AMD64): all six arguments arrive in registers
     ;;; (rdi, rsi, rdx, rcx, r8, r9), so nothing is read from the stack.
  87 %elifidn __OUTPUT_FORMAT__, elf64
  88  %define arg0  rdi
  89  %define arg0.w edi
  90  %define arg1  rsi
  91  %define arg2  rdx
  92  %define arg3  rcx
  93  %define arg4  r8
  94  %define arg5  r9
     ;;; Scratch registers.
  95  %define tmp   r11
  96  %define tmp2   r10
  97  %define return rax
  98  %define return.w eax
  99
     ;;; The function is a plain label here, and FUNC_SAVE/FUNC_RESTORE expand
     ;;; to nothing: no register is saved or restored on this ABI.
 100  %define func(x) x:
 101  %define FUNC_SAVE
 102  %define FUNC_RESTORE
     ;;; NOTE(review): there is no branch for any other output format, so
     ;;; arg0..arg5, tmp, tmp2, return and func() stay undefined there.
 103 %endif
 104
 105 ;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
     ;;; Symbolic names for the six arguments, in call order.
 106 %define len   arg0
 107 %define len.w arg0.w
 108 %define vec    arg1
 109 %define vec_i    arg2
 110 %define mul_array arg3
 111 %define src   arg4
     ;;; dest1 first holds the dest argument (address of two pointers) and is
     ;;; then overwritten with the first of those pointers.
 112 %define dest1  arg5
     ;;; pos is the running byte offset.  It lives in the return register,
     ;;; which is only set to the status code on exit.
 113 %define pos   return
 114 %define pos.w return.w
 115
     ;;; Second destination pointer, loaded from [dest + PS].
 116 %define dest2 tmp2
 117
     ;;; XLDR/XSTR are the 16-byte load/store instructions used on src and dest.
 118 %ifndef EC_ALIGNED_ADDR
 119 ;;; Use Un-aligned load/store
 120  %define XLDR movdqu
 121  %define XSTR movdqu
 122 %else
 123 ;;; Use aligned load/store: plain movdqa when NO_NT_LDST is defined, otherwise non-temporal movntdqa/movntdq
 124  %ifdef NO_NT_LDST
 125   %define XLDR movdqa
 126   %define XSTR movdqa
 127  %else
 128   %define XLDR movntdqa
 129   %define XSTR movntdq
 130  %endif
 131 %endif
 132
     ;;; Make memory operands without a base register (the mask0f load)
     ;;; RIP-relative.
 133 default rel
 134
 135 [bits 64]
 136 section .text
 137
     ;;; Loop-invariant registers: the 0x0f byte mask and the low/high nibble
     ;;; lookup tables for destination 1 and destination 2.
 138 %define xmask0f  xmm14
 139 %define xgft1_lo  xmm13
 140 %define xgft1_hi  xmm12
 141 %define xgft2_lo  xmm11
 142 %define xgft2_hi  xmm10
 143
     ;;; Per-iteration registers.
     ;;;   x0      - source block, later its high nibbles
     ;;;   xtmpa   - low nibbles of the source block
     ;;;   xtmph*/xtmpl* - working copies of the tables (pshufb overwrites
     ;;;             its destination operand)
     ;;;   xd1/xd2 - destination blocks being updated
     ;;;   xtmpd1/xtmpd2 - saved copy of the last 16 bytes of each
     ;;;             destination, used by the tail pass
     ;;; xmm6-xmm14 are the registers FUNC_SAVE preserves on win64.
 144 %define x0      xmm0
 145 %define xtmpa   xmm1
 146 %define xtmph1  xmm2
 147 %define xtmpl1  xmm3
 148 %define xtmph2  xmm4
 149 %define xtmpl2  xmm5
 150 %define xd1     xmm6
 151 %define xd2     xmm7
 152 %define xtmpd1  xmm8
 153 %define xtmpd2  xmm9
 154
 155
     ;;; ------------------------------------------------------------------
     ;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
     ;;;
     ;;; For every byte s of src, computes lo_tbl[s & 0x0f] ^ hi_tbl[s >> 4]
     ;;; with two different 32-byte table pairs and XORs the two results into
     ;;; the corresponding byte of two destination buffers.
     ;;;
     ;;; In:   len       - byte count (treated as signed); below 16 fails
     ;;;       vec       - multiplied by 32 to give the byte distance from
     ;;;                   the first table pair to the second
     ;;;       vec_i     - multiplied by 32 to give the offset of the first
     ;;;                   table pair inside mul_array
     ;;;       mul_array - base address of the lookup tables
     ;;;       src       - source buffer
     ;;;       dest      - address of two consecutive pointers, one per
     ;;;                   destination buffer
     ;;; Out:  rax = 0 on success, 1 if len < 16
     ;;; Uses: rax, r10, r11, xmm0-xmm14; the registers holding len, vec,
     ;;;       vec_i and dest are overwritten.
     ;;;
     ;;; Data is processed 16 bytes at a time.  If len is not a multiple of
     ;;; 16, one extra pass is made over the final 16 bytes; it starts from
     ;;; the saved original destination bytes, so bytes shared with the
     ;;; previous block are still updated exactly once.
     ;;; ------------------------------------------------------------------
 156 align 16
 157 global gf_2vect_mad_sse:function
 158 func(gf_2vect_mad_sse)
 159         FUNC_SAVE
 160         sub     len, 16                 ;len now = offset of the last full 16-byte block
 161         jl      .return_fail            ;Signed test: fewer than 16 bytes is rejected
 162
 163         xor     pos, pos
 164         movdqa  xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 165         sal     vec_i, 5                ;Multiply by 32
 166         sal     vec, 5                  ;Multiply by 32: distance between the two table pairs
 167         lea     tmp, [mul_array + vec_i]
 168         movdqu  xgft1_lo,[tmp]          ;Load array Ax{00}, Ax{01}, Ax{02}, ...
 169         movdqu  xgft1_hi, [tmp+16]      ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
 170         movdqu  xgft2_lo, [tmp+vec]     ;Load array Bx{00}, Bx{01}, Bx{02}, ...
 171         movdqu  xgft2_hi, [tmp+vec+16]  ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
 172         mov     dest2, [dest1+PS]       ;Second pointer; read before dest1 is overwritten
 173         mov     dest1, [dest1]          ;First pointer replaces the array address
 174
     ;;; NOTE(review): when EC_ALIGNED_ADDR is defined XLDR requires 16-byte
     ;;; alignment, and these loads address dest + (length - 16).  That build
     ;;; therefore appears to need a length that is a multiple of 16 --
     ;;; confirm with callers.
 175         XLDR    xtmpd1, [dest1+len]     ;backup the last 16 bytes in dest
 176         XLDR    xtmpd2, [dest2+len]     ;backup the last 16 bytes in dest
 177
 178 .loop16:
 179         XLDR    xd1, [dest1+pos]                ;Get next dest vector
 180         XLDR    xd2, [dest2+pos]                ;Get next dest vector
     ;;; Tail entry point: xd1/xd2 already hold the saved destination bytes.
 181 .loop16_overlap:
 182         XLDR    x0, [src+pos]           ;Get next source vector
 183         movdqa  xtmph1, xgft1_hi                ;Reload const array registers
 184         movdqa  xtmpl1, xgft1_lo
 185         movdqa  xtmph2, xgft2_hi                ;Reload const array registers
 186         movdqa  xtmpl2, xgft2_lo
 187         movdqa  xtmpa, x0               ;Keep unshifted copy of src
 188         psraw   x0, 4                   ;Word shift: high nibble of each byte moves to bits 3-0
 189         pand    x0, xmask0f             ;Keep high src nibble in bits 3-0; drops bits shifted in from above
 190         pand    xtmpa, xmask0f          ;Keep low src nibble in bits 3-0
 191
 192         pshufb  xtmph1, x0              ;Lookup mul table of high nibble
 193         pshufb  xtmpl1, xtmpa           ;Lookup mul table of low nibble
 194         pxor    xtmph1, xtmpl1          ;GF add high and low partials
 195         pxor    xd1, xtmph1             ;Accumulate into dest1 data
 196
 197         pshufb  xtmph2, x0              ;Lookup mul table of high nibble
 198         pshufb  xtmpl2, xtmpa           ;Lookup mul table of low nibble
 199         pxor    xtmph2, xtmpl2          ;GF add high and low partials
 200         pxor    xd2, xtmph2             ;Accumulate into dest2 data
 201
 202         XSTR    [dest1+pos], xd1        ;Store result
 203         XSTR    [dest2+pos], xd2        ;Store result
 204
 205         add     pos, 16                 ;Loop on 16 bytes at a time
 206         cmp     pos, len
 207         jle     .loop16                 ;Continue while a full block starts at pos (signed compare)
 208
 209         lea     tmp, [len + 16]         ;tmp = original length
 210         cmp     pos, tmp
 211         je      .return_pass            ;Every byte processed (also true after the tail pass)
 212
 213         ;; Tail len
 214         mov     pos, len        ;Overlapped offset length-16
 215         movdqa  xd1, xtmpd1     ;Restore xd1
 216         movdqa  xd2, xtmpd2     ;Restore xd2
 217         jmp     .loop16_overlap ;Do one more overlap pass
 218
 219 .return_pass:
 220         FUNC_RESTORE
 221         mov     return, 0               ;Status: success
 222         ret
 223
 224 .return_fail:
 225         FUNC_RESTORE
 226         mov     return, 1               ;Status: len < 16
 227         ret
 228
 229 endproc_frame
 230
 231 section .data
 232
     ;;; mask0f is loaded with movdqa, which requires 16-byte alignment.
 233 align 16
 234
     ;;; 16 bytes of 0x0f: selects the low nibble of every byte.
 235 mask0f:
 236         ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 237
     ;;; slversion is not defined in this file; it presumably records version
     ;;; information for the named function -- confirm in the include file.
 238 ;;;       func             core, ver, snum
 239 slversion gf_2vect_mad_sse, 00,  01,  0203