;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized xor of N source vectors using SSE
;;; int xor_gen_sse(int vects, int len, void **array)

;;; Generates xor parity vector from N = vects-1 sources in an array of
;;; pointers (**array); the last pointer is the dest (parity vector).
;;; Vectors must be aligned to 16 bytes. Length can be any value.

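;;; A minimal calling sketch in C (illustrative only: the buffer names and
;;; source count below are assumptions, not part of this file):
;;;
;;;	void *array[4] = { src0, src1, src2, parity };	/* dest is last */
;;;	int fail = xor_gen_sse(4, len, array);		/* 0 on success */
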
%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
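 ;; System V AMD64 ABI: first six integer args in rdi, rsi, rdx, rcx, r8, r9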
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   rax
 %define tmp2.b al
 %define tmp3   arg4
 %define return rax
 %define PS 8
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define return rax
 %define tmp2   rax
 %define tmp2.b al
 %define PS 8
 %define tmp    r11
 %define tmp3   r10
 %define stack_size 2*16 + 8	; must be an odd multiple of 8
 %define func(x) proc_frame x

 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	add	rsp, stack_size
 %endmacro
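
 ;; Win64 note: xmm6-xmm15 are callee-saved in the Windows x64 ABI, hence
 ;; the xmm6/xmm7 save/restore above; stack_size is an odd multiple of 8
 ;; so rsp is 16-byte aligned again after the call pushed the return address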


%elifidn __OUTPUT_FORMAT__, elf32
 %define arg0   arg(0)
 %define arg1   ecx
 %define tmp2   eax
 %define tmp2.b al
 %define tmp3   edx
 %define return eax
 %define PS 4
 %define func(x) x: endbranch
 %define arg(x) [ebp+8+PS*x]
 %define arg2   edi	; must save/restore
 %define arg3   esi
 %define tmp    ebx

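 ;; cdecl: args are read from the stack via arg(x); ebx, esi and edi are
 ;; callee-saved, hence the pushes in FUNC_SAVE below
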
 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
	mov	arg2, arg(2)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	mov	esp, ebp	;restore esp from the frame pointer
	pop	ebp
 %endmacro

%endif	; output formats


%define vec arg0
%define len arg1
%define ptr arg3
%define pos tmp3

%ifidn PS,8			; 64-bit code
 default rel
 [bits 64]
%endif

;;; Use non-temporal load/store by default; define NO_NT_LDST to fall
;;; back to regular aligned moves
%ifdef NO_NT_LDST
 %define XLDR movdqa
 %define XSTR movdqa
%else
 %define XLDR movntdqa
 %define XSTR movntdq
%endif
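;;; Note: movdqa, movntdqa and movntdq all require 16-byte-aligned
;;; addresses; ragged trailing lengths are handled by the scalar tail
;;; loops below, which have no alignment requirement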

section .text

align 16
mk_global xor_gen_sse, function
func(xor_gen_sse)
	FUNC_SAVE
%ifidn PS,8				;64-bit code
	sub	vec, 2			; Keep as offset to last source
%else					;32-bit code
	mov	tmp, arg(0)		; Update vec length arg to last source
	sub	tmp, 2
	mov	arg(0), tmp
%endif

	jng	return_fail		;Must have at least 2 sources
	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;Check alignment of length
	jnz	len_not_aligned


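;;; Main aligned loop: each pass covers 128 bytes in xmm0-xmm7. tmp walks
;;; the pointer array from the last source (index vec = vects-2) down to
;;; index 0; the last source seeds the registers and every remaining
;;; source is xor-ed in before the parity block is written out.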
len_aligned_128bytes:
	sub	len, 128
	mov	pos, 0
	mov	tmp, vec		;Preset to last vector

loop128:
	mov	tmp2, [arg2+tmp*PS]	;Fetch last pointer in array
	sub	tmp, 1			;Next vect
	XLDR	xmm0, [tmp2+pos]	;Start with end of array in last vector
	XLDR	xmm1, [tmp2+pos+16]	;Keep xor parity in xmm0-7
	XLDR	xmm2, [tmp2+pos+(2*16)]
	XLDR	xmm3, [tmp2+pos+(3*16)]
	XLDR	xmm4, [tmp2+pos+(4*16)]
	XLDR	xmm5, [tmp2+pos+(5*16)]
	XLDR	xmm6, [tmp2+pos+(6*16)]
	XLDR	xmm7, [tmp2+pos+(7*16)]

next_vect:
	mov	ptr, [arg2+tmp*PS]
	sub	tmp, 1
	xorpd	xmm0, [ptr+pos]		;Get next vector (source)
	xorpd	xmm1, [ptr+pos+16]
	xorpd	xmm2, [ptr+pos+(2*16)]
	xorpd	xmm3, [ptr+pos+(3*16)]
	xorpd	xmm4, [ptr+pos+(4*16)]
	xorpd	xmm5, [ptr+pos+(5*16)]
	xorpd	xmm6, [ptr+pos+(6*16)]
	xorpd	xmm7, [ptr+pos+(7*16)]
	;;; prefetch [ptr+pos+(8*16)]
	jge	next_vect		;Loop for each vect

	mov	tmp, vec		;Back to last vector
	mov	ptr, [arg2+PS+tmp*PS]	;Address of parity vector
	XSTR	[ptr+pos], xmm0		;Write parity xor vector
	XSTR	[ptr+pos+(1*16)], xmm1
	XSTR	[ptr+pos+(2*16)], xmm2
	XSTR	[ptr+pos+(3*16)], xmm3
	XSTR	[ptr+pos+(4*16)], xmm4
	XSTR	[ptr+pos+(5*16)], xmm5
	XSTR	[ptr+pos+(6*16)], xmm6
	XSTR	[ptr+pos+(7*16)], xmm7
	add	pos, 128
	cmp	pos, len
	jle	loop128

return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret


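;;; Tail strategy for lengths that are not multiples of 128: peel bytes
;;; off the end one at a time until len is 8-byte aligned, then PS bytes
;;; at a time (loopN_bytes) until the remainder is a multiple of 128 that
;;; the aligned loop can finish.
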
;;; Do one byte at a time while length is not 8-byte aligned

xor_gen_byte:
	mov	tmp, vec		;Preset to last vector

loop_1byte:
	mov	ptr, [arg2+tmp*PS]	;Fetch last pointer in array
	mov	tmp2.b, [ptr+len-1]	;Get byte from last source
	sub	tmp, 1
nextvect_1byte:
	mov	ptr, [arg2+tmp*PS]
	xor	tmp2.b, [ptr+len-1]
	sub	tmp, 1
	jge	nextvect_1byte

	mov	tmp, vec		;Back to last vector
	mov	ptr, [arg2+PS+tmp*PS]	;Address of parity (dest) vector
	mov	[ptr+len-1], tmp2.b	;Write parity byte
	sub	len, 1
	test	len, (8-1)
	jnz	loop_1byte

	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;If not 0 and 128-byte aligned
	jz	len_aligned_128bytes	; then do aligned case. len = y * 128

	;; else we are 8-byte aligned so fall through to recheck


;; Unaligned length cases
len_not_aligned:
	test	len, (PS-1)
	jne	xor_gen_byte
	mov	tmp3, len
	and	tmp3, (128-1)		;Do the unaligned bytes PS at a time
	mov	tmp, vec		;Preset to last vector

;; Run backwards PS bytes (8B, or 4B for 32-bit) at a time for (tmp3) bytes
loopN_bytes:
	mov	ptr, [arg2+tmp*PS]	;Fetch last pointer in array
	mov	tmp2, [ptr+len-PS]	;Get word from last source
	sub	tmp, 1
nextvect_Nbytes:
	mov	ptr, [arg2+tmp*PS]	;Get pointer to next vector
	xor	tmp2, [ptr+len-PS]
	sub	tmp, 1
	jge	nextvect_Nbytes		;Loop for each source

	mov	tmp, vec		;Back to last vector
	mov	ptr, [arg2+PS+tmp*PS]	;Address of parity (dest) vector
	mov	[ptr+len-PS], tmp2	;Write parity word
	sub	len, PS
	sub	tmp3, PS
	jg	loopN_bytes

	cmp	len, 128		;Now len is aligned to 128B
	jge	len_aligned_128bytes	;We can do the rest aligned

	cmp	len, 0
	je	return_pass

return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func         core, ver, snum
slversion xor_gen_sse, 00, 0c, 0030
284