ceph/src/isa-l/raid/xor_check_sse.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;; Optimized xor of N source vectors using SSE
  31 ;;; int xor_gen_sse(int vects, int len, void **array)
  32
  33 ;;; Generates xor parity vector from N (vects-1) sources in array of pointers
  34 ;;; (**array).  Last pointer is the dest.
  35 ;;; Vectors must be aligned to 16 bytes.  Length can be any value.
  36
  37 %include "reg_sizes.asm"
  38
  39 %ifidn __OUTPUT_FORMAT__, elf64
  40  %define arg0  rdi
  41  %define arg1  rsi
  42  %define arg2  rdx
  43  %define arg3  rcx
  44  %define arg4  r8
  45  %define arg5  r9
  46  %define tmp   r11
  47  %define tmp2  rax
  48  %define tmp2.b al
  49  %define tmp3  arg4
  50  %define return rax
  51  %define PS 8
  52  %define func(x) x: endbranch
  53  %define FUNC_SAVE
  54  %define FUNC_RESTORE
  55
  56 %elifidn __OUTPUT_FORMAT__, win64
  57  %define arg0  rcx
  58  %define arg1  rdx
  59  %define arg2  r8
  60  %define arg3  r9
  61  %define return rax
  62  %define tmp2  rax
  63  %define tmp2.b al
  64  %define PS 8
  65  %define tmp   r11
  66  %define tmp3  r10
  67  %define stack_size  2*16 + 8   ; must be an odd multiple of 8
  68  %define func(x) proc_frame x
  69
  70  %macro FUNC_SAVE 0
  71         alloc_stack     stack_size
  72         save_xmm128     xmm6, 0*16
  73         save_xmm128     xmm7, 1*16
  74         end_prolog
  75  %endmacro
  76  %macro FUNC_RESTORE 0
  77         movdqa  xmm6, [rsp + 0*16]
  78         movdqa  xmm7, [rsp + 1*16]
  79         add     rsp, stack_size
  80  %endmacro
  81
  82
  83 %elifidn __OUTPUT_FORMAT__, elf32
  84  %define arg0   arg(0)
  85  %define arg1   ecx
  86  %define tmp2   eax
  87  %define tmp2.b  al
  88  %define tmp3   edx
  89  %define return eax
  90  %define PS 4
  91  %define func(x) x: endbranch
  92  %define arg(x) [ebp+8+PS*x]
  93  %define arg2  edi      ; must sav/restore
  94  %define arg3  esi
  95  %define tmp   ebx
  96
  97  %macro FUNC_SAVE 0
  98         push    ebp
  99         mov     ebp, esp
 100         push    esi
 101         push    edi
 102         push    ebx
 103         mov     arg1, arg(1)
 104         mov     arg2, arg(2)
 105  %endmacro
 106
 107  %macro FUNC_RESTORE 0
 108         pop     ebx
 109         pop     edi
 110         pop     esi
 111         mov     esp, ebp        ;if has frame pointer
 112         pop     ebp
 113  %endmacro
 114
 115 %endif  ; output formats
 116
 117
 118 %define vec arg0
 119 %define len arg1
 120 %define ptr arg3
 121 %define pos tmp3
 122
 123 %ifidn PS,8                     ; 64-bit code
 124  default rel
 125  [bits 64]
 126 %endif
 127
 128 ;;; Use Non-temporal load/stor
 129 %ifdef NO_NT_LDST
 130  %define XLDR movdqa
 131  %define XSTR movdqa
 132 %else
 133  %define XLDR movntdqa
 134  %define XSTR movntdq
 135 %endif
 136
 137 section .text
 138
 139 align 16
 140 mk_global  xor_check_sse, function
 141 func(xor_check_sse)
 142         FUNC_SAVE
 143 %ifidn PS,8                             ;64-bit code
 144         sub     vec, 1                  ; Keep as offset to last source
 145 %else                                   ;32-bit code
 146         mov     tmp, arg(0)             ; Update vec length arg to last source
 147         sub     tmp, 1
 148         mov     arg(0), tmp
 149 %endif
 150
 151         jng     return_fail             ;Must have at least 2 sources
 152         cmp     len, 0
 153         je      return_pass
 154         test    len, (128-1)            ;Check alignment of length
 155         jnz     len_not_aligned
 156
 157
 158 len_aligned_128bytes:
 159         sub     len, 128
 160         mov     pos, 0
 161         mov     tmp, vec                ;Preset to last vector
 162
 163 loop128:
 164         mov     tmp2, [arg2+tmp*PS]     ;Fetch last pointer in array
 165         sub     tmp, 1                  ;Next vect
 166         XLDR    xmm0, [tmp2+pos]        ;Start with end of array in last vector
 167         XLDR    xmm1, [tmp2+pos+16]     ;Keep xor parity in xmm0-7
 168         XLDR    xmm2, [tmp2+pos+(2*16)]
 169         XLDR    xmm3, [tmp2+pos+(3*16)]
 170         XLDR    xmm4, [tmp2+pos+(4*16)]
 171         XLDR    xmm5, [tmp2+pos+(5*16)]
 172         XLDR    xmm6, [tmp2+pos+(6*16)]
 173         XLDR    xmm7, [tmp2+pos+(7*16)]
 174
 175 next_vect:
 176         mov     ptr, [arg2+tmp*PS]
 177         sub     tmp, 1
 178         xorpd   xmm0, [ptr+pos]         ;Get next vector (source)
 179         xorpd   xmm1, [ptr+pos+16]
 180         xorpd   xmm2, [ptr+pos+(2*16)]
 181         xorpd   xmm3, [ptr+pos+(3*16)]
 182         xorpd   xmm4, [ptr+pos+(4*16)]
 183         xorpd   xmm5, [ptr+pos+(5*16)]
 184         xorpd   xmm6, [ptr+pos+(6*16)]
 185         xorpd   xmm7, [ptr+pos+(7*16)]
 186 ;;;     prefetch [ptr+pos+(8*16)]
 187         jge     next_vect               ;Loop for each vect
 188
 189         ;; End of vects, chech that all parity regs = 0
 190         mov     tmp, vec                ;Back to last vector
 191         por     xmm0, xmm1
 192         por     xmm0, xmm2
 193         por     xmm0, xmm3
 194         por     xmm0, xmm4
 195         por     xmm0, xmm5
 196         por     xmm0, xmm6
 197         por     xmm0, xmm7
 198         ptest   xmm0, xmm0
 199         jnz     return_fail
 200
 201         add     pos, 128
 202         cmp     pos, len
 203         jle     loop128
 204
 205 return_pass:
 206         FUNC_RESTORE
 207         mov     return, 0
 208         ret
 209
 210
 211
 212 ;;; Do one byte at a time for no alignment case
 213
 214 xor_gen_byte:
 215         mov     tmp, vec                ;Preset to last vector
 216
 217 loop_1byte:
 218         mov     ptr, [arg2+tmp*PS]      ;Fetch last pointer in array
 219         mov     tmp2.b, [ptr+len-1]     ;Get array n
 220         sub     tmp, 1
 221 nextvect_1byte:
 222         mov     ptr, [arg2+tmp*PS]
 223         xor     tmp2.b, [ptr+len-1]
 224         sub     tmp, 1
 225         jge     nextvect_1byte
 226
 227         mov     tmp, vec                ;Back to last vector
 228         cmp     tmp2.b, 0
 229         jne     return_fail
 230         sub     len, 1
 231         test    len, (8-1)
 232         jnz     loop_1byte
 233
 234         cmp     len, 0
 235         je      return_pass
 236         test    len, (128-1)            ;If not 0 and 128bit aligned
 237         jz      len_aligned_128bytes    ; then do aligned case. len = y * 128
 238
 239         ;; else we are 8-byte aligned so fall through to recheck
 240
 241
 242         ;; Unaligned length cases
 243 len_not_aligned:
 244         test    len, (PS-1)
 245         jne     xor_gen_byte
 246         mov     tmp3, len
 247         and     tmp3, (128-1)           ;Do the unaligned bytes 4-8 at a time
 248         mov     tmp, vec                ;Preset to last vector
 249
 250         ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
 251 loopN_bytes:
 252         mov     ptr, [arg2+tmp*PS]      ;Fetch last pointer in array
 253         mov     tmp2, [ptr+len-PS]      ;Get array n
 254         sub     tmp, 1
 255 nextvect_Nbytes:
 256         mov     ptr, [arg2+tmp*PS]      ;Get pointer to next vector
 257         xor     tmp2, [ptr+len-PS]
 258         sub     tmp, 1
 259         jge     nextvect_Nbytes         ;Loop for each source
 260
 261         mov     tmp, vec                ;Back to last vector
 262         cmp     tmp2, 0
 263         jne     return_fail
 264         sub     len, PS
 265         sub     tmp3, PS
 266         jg      loopN_bytes
 267
 268         cmp     len, 128                ;Now len is aligned to 128B
 269         jge     len_aligned_128bytes    ;We can do the rest aligned
 270
 271         cmp     len, 0
 272         je      return_pass
 273
 274 return_fail:
 275         mov     return, 1
 276         FUNC_RESTORE
 277         ret
 278
 279 endproc_frame
 280
 281 section .data
 282
 283 ;;;       func           core, ver, snum
 284 slversion xor_check_sse, 00,   03,  0031
 285