ceph/src/spdk/isa-l/erasure_code/gf_vect_mad_sse.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;
  31 ;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
  32 ;;;
  33
  34 %include "reg_sizes.asm"
  35
  36 %ifidn __OUTPUT_FORMAT__, win64
     ;;; win64 output format: register and stack mapping for the six arguments.
     ;;; arg0-arg3 are used in the registers they arrive in; arg4 and arg5 are
     ;;; read from the stack by FUNC_SAVE into r12 and r15, which FUNC_SAVE
     ;;; saves first and FUNC_RESTORE reloads.
  37  %define arg0  rcx
  38  %define arg0.w ecx
  39  %define arg1  rdx
  40  %define arg2  r8
  41  %define arg3  r9
  42  %define arg4  r12                 ; filled from arg(4) by FUNC_SAVE
  43  %define arg5  r15                 ; filled from arg(5) by FUNC_SAVE
  44  %define tmp   r11
  45  %define return rax
  46  %define return.w eax
  47  %define PS 8                      ; bytes per stack slot, used by arg(x)
  48  %define stack_size 16*3 + 3*8     ; 3 x 16 bytes for xmm6-xmm8 + 3 qwords (FUNC_SAVE writes two: r12, r15); third qword presumably keeps rsp 16-byte aligned - TODO confirm
  49  %define arg(x)      [rsp + stack_size + PS + PS*x]  ; stack slot of argument x, valid once rsp has been lowered by stack_size; the extra PS presumably skips the return address
  50  %define func(x) proc_frame x      ; proc_frame is not defined in this file (assembler/include provided)
  51
  52 %macro FUNC_SAVE 0
     ;;; win64 prologue: reserve stack_size bytes, save xmm6-xmm8 and r12/r15
     ;;; in that area, then load the two stack-passed arguments.
     ;;; save_reg and end_prolog are not defined in this file; presumably they
     ;;; are the assembler's unwind-aware prologue macros - TODO confirm.
  53         sub     rsp, stack_size
  54         movdqa  [rsp+16*0],xmm6         ; movdqa needs a 16-byte aligned address, so rsp must be aligned here
  55         movdqa  [rsp+16*1],xmm7
  56         movdqa  [rsp+16*2],xmm8
  57         save_reg        r12,  3*16 + 0*8        ; r12 goes right after the three xmm slots
  58         save_reg        r15,  3*16 + 1*8
  59         end_prolog
  60         mov     arg4, arg(4)            ; r12 = fifth argument (src), read from the stack
  61         mov     arg5, arg(5)            ; r15 = sixth argument (dest), read from the stack
  62 %endmacro
  63
  64 %macro FUNC_RESTORE 0
     ;;; win64 epilogue: reload every register stored by FUNC_SAVE from the
     ;;; save area at [rsp], then release the stack_size bytes.
  65         mov     r15,  [rsp + 3*16 + 1*8]        ; general-purpose registers first
  66         mov     r12,  [rsp + 3*16 + 0*8]
  67         movdqa  xmm8, [rsp+16*2]                ; then the three saved xmm registers
  68         movdqa  xmm7, [rsp+16*1]
  69         movdqa  xmm6, [rsp+16*0]
  70         add     rsp, stack_size                 ; drop the save area last
  71 %endmacro
  72
  73 %elifidn __OUTPUT_FORMAT__, elf64
     ;;; elf64 output format: all six arguments are used in the registers
     ;;; listed below, so no prologue or epilogue code is generated.
  74  %define arg0  rdi
  75  %define arg0.w edi
  76  %define arg1  rsi
  77  %define arg2  rdx
  78  %define arg3  rcx
  79  %define arg4  r8
  80  %define arg5  r9
  81  %define tmp   r11
  82  %define return rax
  83  %define return.w eax
  84 ;;; func(x) emits a plain label; FUNC_SAVE/FUNC_RESTORE are defined empty
  85  %define func(x) x:
  86  %define FUNC_SAVE
  87  %define FUNC_RESTORE
  88 %endif
  89
  90 ;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
     ;;; Symbolic names for the positional argument registers chosen above.
  91 %define len   arg0                 ; length in bytes; lowered by 16 at function entry
  92 %define len.w arg0.w               ; 32-bit form of len (not referenced in this file)
  93 %define vec    arg1                ; not referenced by the function body
  94 %define vec_i    arg2              ; table index; scaled by 32 in place to a byte offset
  95 %define mul_array arg3             ; base address of the multiplication tables
  96 %define src   arg4                 ; source buffer
  97 %define dest  arg5                 ; destination buffer, read and written
  98 %define pos   return               ; running byte offset; shares rax with the return value
  99 %define pos.w return.w             ; 32-bit form of pos (not referenced in this file)
 100
 101 %ifdef EC_ALIGNED_ADDR
 102 ;;; Aligned build: non-temporal load/store, or plain aligned moves when NO_NT_LDST is set
 103  %ifdef NO_NT_LDST
 104   %define XLDR movdqa
 105   %define XSTR movdqa
 106  %else
 107   %define XLDR movntdqa
 108   %define XSTR movntdq
 109  %endif
 110 %else
 111 ;;; Default build: unaligned load/store
 112  %define XLDR movdqu
 113  %define XSTR movdqu
 114 %endif
 115
 116 default rel                        ; memory operands without a base register (e.g. [mask0f]) are RIP-relative
 117
 118 [bits 64]                          ; assemble as 64-bit code
 119 section .text
 120
 121 %define xmask0f  xmm8              ; 0x0f in every byte, loaded from mask0f
 122 %define xgft_lo  xmm7              ; 16-byte lookup table indexed by the low nibble
 123 %define xgft_hi  xmm6              ; 16-byte lookup table indexed by the high nibble
 124 ;;; xmm6-xmm8 above are the registers FUNC_SAVE/FUNC_RESTORE preserve on win64
 125 %define x0     xmm0                ; current 16 source bytes, then their high nibbles
 126 %define xtmpa  xmm1                ; low nibbles of the current source bytes
 127 %define xtmph  xmm2                ; per-block copy of xgft_hi, receives the combined product
 128 %define xtmpl  xmm3                ; per-block copy of xgft_lo
 129 %define xd     xmm4                ; current 16 destination bytes
 130 %define xtmpd  xmm5                ; copy of the last 16 dest bytes taken before the loop
 131
 132
 133 align 16
     ;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
     ;;;
     ;;; Multiply-and-add over len bytes, 16 bytes per pass:
     ;;;   dest[i] ^= hi_tbl[src[i] >> 4] ^ lo_tbl[src[i] & 0x0f]
     ;;; lo_tbl is the 16 bytes at mul_array + 32*vec_i, hi_tbl the 16 bytes
     ;;; that follow it. When len is not a multiple of 16 the remainder is
     ;;; handled by one extra pass over the last 16 bytes (offset len-16),
     ;;; using a copy of those dest bytes taken before the loop modified them.
     ;;;
     ;;; Returns 0 in rax on success, or 1 when len < 16 (src and dest are
     ;;; not accessed in that case). vec is not referenced.
     ;;; Modifies the len and vec_i registers, rax, r11 and xmm0-xmm8;
     ;;; FUNC_SAVE/FUNC_RESTORE preserve xmm6-xmm8 and r12/r15 on win64 only.
     ;;; With EC_ALIGNED_ADDR the XLDR/XSTR instructions require 16-byte
     ;;; aligned addresses, including dest+len-16 - NOTE(review): confirm
     ;;; callers only use that build with suitable buffers and lengths.
 134 global gf_vect_mad_sse:function
 135 func(gf_vect_mad_sse)
 136         FUNC_SAVE
 137         sub     len, 16                 ;len now holds length-16, the offset of the last full block
 138         jl      .return_fail            ;Signed check: fewer than 16 bytes is an error
 139
 140         xor     pos, pos                ;Start at byte offset 0
 141         movdqa  xmask0f, [mask0f]       ;Load mask of lower nibble in each byte
 142         sal     vec_i, 5                ;Multiply by 32: each index selects a 32-byte table pair
 143         movdqu  xgft_lo, [vec_i+mul_array]      ;Load array Cx{00}, Cx{01}, Cx{02}, ...
 144         movdqu  xgft_hi, [vec_i+mul_array+16]   ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
 145
 146         XLDR    xtmpd, [dest+len]       ;Back up the last 16 bytes of dest before the loop overwrites them
 147
 148 .loop16:
 149         XLDR    xd, [dest+pos]          ;Get next dest vector
 150 .loop16_overlap:                        ;Tail pass enters here with xd already set from xtmpd
 151         XLDR    x0, [src+pos]           ;Get next source vector
 152         movdqa  xtmph, xgft_hi          ;Reload const array registers (pshufb overwrites its destination)
 153         movdqa  xtmpl, xgft_lo
 154         movdqa  xtmpa, x0               ;Keep unshifted copy of src
 155         psraw   x0, 4                   ;Shift to put high nibble into bits 3-0; stray bits are masked off next
 156         pand    x0, xmask0f             ;Mask high src nibble in bits 3-0
 157         pand    xtmpa, xmask0f          ;Mask low src nibble in bits 3-0
 158         pshufb  xtmph, x0               ;Lookup mul table of high nibble
 159         pshufb  xtmpl, xtmpa            ;Lookup mul table of low nibble
 160         pxor    xtmph, xtmpl            ;GF add high and low partials
 161
 162         pxor    xd, xtmph               ;Accumulate the product into the dest bytes
 163         XSTR    [dest+pos], xd          ;Store result
 164
 165         add     pos, 16                 ;Loop on 16 bytes at a time
 166         cmp     pos, len
 167         jle     .loop16                 ;Another full block fits while pos <= length-16
 168
 169         lea     tmp, [len + 16]         ;tmp = original length
 170         cmp     pos, tmp
 171         je      .return_pass            ;Every byte processed (also true after the tail pass)
 172
 173         ;; Tail: fewer than 16 bytes remain, so redo the final 16 bytes of the buffers
 174         mov     pos, len        ;Overlapped offset length-16
 175         movdqa  xd, xtmpd       ;Restore xd from the pre-loop copy of dest
 176         jmp     .loop16_overlap ;Do one more overlap pass
 177
 178 .return_pass:
 179         mov     return, 0               ;Success
 180         FUNC_RESTORE
 181         ret
 182
 183 .return_fail:
 184         mov     return, 1               ;Length below 16 bytes
 185         FUNC_RESTORE
 186         ret
 187         ;; endproc_frame pairs with the proc_frame used by the win64 func(); it is not defined in this file
 188 endproc_frame
 189
 190 section .data
 191 ;;; mask0f is loaded with movdqa, so it has to sit on a 16-byte boundary
 192 align 16
 193 ;;; 16 bytes of 0x0f: selects the low nibble of every byte
 194 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
 195
 196 ;;;       func            core, ver, snum
     ;;; slversion is not defined in this file; presumably a version-tag macro from reg_sizes.asm - TODO confirm
 197 slversion gf_vect_mad_sse, 00,  01,  0200