;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized pq of N source vectors using SSE3
;;; int pq_gen_sse(int vects, int len, void **array)

;;; Generates P+Q parity vectors from N = vects-2 sources in the array of
;;; pointers (**array). The last two pointers are the P and Q destinations,
;;; respectively. All vectors must be 16-byte aligned, and len must be a
;;; multiple of 16.
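;;;
;;; A minimal caller sketch (illustrative only; the buffer names are
;;; hypothetical, and every buffer must be 16-byte aligned):
;;;
;;;   uint8_t *ptrs[5] = { d0, d1, d2, p, q };     // 3 sources, then P, then Q
;;;   int ret = pq_gen_sse(5, len, (void **)ptrs); // returns 0 on success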

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define return rax
 %define PS     8
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define return rax
 %define PS     8
 %define tmp    r10
 %define stack_size 2*16 + 8    ; must be an odd multiple of 8 to keep rsp 16-byte aligned
 %define func(x) proc_frame x

 ; xmm6 and xmm7 are callee-saved in the Windows x64 ABI
 %macro FUNC_SAVE 0
        alloc_stack stack_size
        save_xmm128 xmm6, 0*16
        save_xmm128 xmm7, 1*16
        end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        add     rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf32
 %define arg0   edx
 %define arg1   ecx
 %define return eax
 %define PS     4
 %define func(x) x: endbranch
 %define arg(x) [ebp+8+PS*x]
 %define arg2   edi             ; must save/restore
 %define arg3   esi
 %define tmp    ebx

 %macro FUNC_SAVE 0
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        push    ebx
        mov     arg0, arg(0)
        mov     arg1, arg(1)
        mov     arg2, arg(2)
 %endmacro

 %macro FUNC_RESTORE 0
        pop     ebx
        pop     edi
        pop     esi
        mov     esp, ebp        ; restore stack pointer from frame pointer
        pop     ebp
 %endmacro

%endif                          ; output formats

%define vec  arg0
%define len  arg1
%define ptr  arg3
%define pos  return

%define xp1   xmm0
%define xq1   xmm1
%define xtmp1 xmm2
%define xs1   xmm3

%define xp2   xmm4
%define xq2   xmm5
%define xtmp2 xmm6
%define xs2   xmm7
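; The second register set gives loop32 two independent 16-byte dependency
; chains per iteration, which helps hide SSE instruction latency.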

%ifidn PS,8                     ; 64-bit code
 default rel
 [bits 64]
 %define xpoly xmm15
%elifidn PS,4                   ; 32-bit code: only xmm0-7 available, so keep poly in memory
 %define xpoly [poly]
%endif

;;; Use non-temporal load/store
%ifdef NO_NT_LDST
 %define XLDR movdqa
 %define XSTR movdqa
%else
 %define XLDR movntdqa
 %define XSTR movntdq
%endif
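;;; Non-temporal stores bypass the cache on the assumption that freshly
;;; written parity will not be read again soon. Building with NO_NT_LDST
;;; defined (e.g. "nasm -DNO_NT_LDST ...") selects the cached movdqa forms
;;; instead; either way the buffers must be 16-byte aligned.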

section .text

align 16
mk_global pq_gen_sse, function
func(pq_gen_sse)
        FUNC_SAVE
        sub     vec, 3          ; Keep as offset to the last source
        jng     return_fail     ; Need at least 2 sources (vects >= 4 with P and Q)
        cmp     len, 0
        je      return_pass
        test    len, (16-1)     ; Check that length is a multiple of 16
        jnz     return_fail
        mov     pos, 0
%ifidn PS,8
        movdqa  xpoly, [poly]   ; For 64-bit, keep poly in a high xmm reg
%endif
        cmp     len, 32
        jl      loop16

len_aligned_32bytes:
        sub     len, 32         ; Bias len for the loop-end compare; sources are walked last to first

loop32:
        mov     ptr, [arg2+vec*PS]      ; Fetch last source pointer
        mov     tmp, vec                ; Set tmp to point back to last vector
        XLDR    xs1, [ptr+pos]          ; Preload last vector (source)
        XLDR    xs2, [ptr+pos+16]       ; Preload last vector (source)
        pxor    xp1, xp1                ; p1 = 0
        pxor    xq1, xq1                ; q1 = 0
        pxor    xp2, xp2                ; p2 = 0
        pxor    xq2, xq2                ; q2 = 0

next_vect:
        sub     tmp, 1                  ; Inner loop for each source vector
        mov     ptr, [arg2+tmp*PS]      ; Get pointer to next vect
        pxor    xq1, xs1                ; q1 ^= s1
        pxor    xq2, xs2                ; q2 ^= s2
        pxor    xp1, xs1                ; p1 ^= s1
        pxor    xp2, xs2                ; p2 ^= s2
        pxor    xtmp1, xtmp1            ; xtmp1 = 0 - for compare to 0
        pxor    xtmp2, xtmp2            ; xtmp2 = 0
        pcmpgtb xtmp1, xq1              ; xtmp1 = mask 0xff or 0x00 if bit7 set
        pcmpgtb xtmp2, xq2              ; xtmp2 = mask 0xff or 0x00 if bit7 set
        pand    xtmp1, xpoly            ; xtmp1 = poly or 0x00
        pand    xtmp2, xpoly            ; xtmp2 = poly or 0x00
        XLDR    xs1, [ptr+pos]          ; Get next vector (source data1)
        XLDR    xs2, [ptr+pos+16]       ; Get next vector (source data2)
        paddb   xq1, xq1                ; q1 = q1<<1
        paddb   xq2, xq2                ; q2 = q2<<1
        pxor    xq1, xtmp1              ; q1 = q1<<1 ^ poly_masked
        pxor    xq2, xtmp2              ; q2 = q2<<1 ^ poly_masked
        jg      next_vect               ; Loop while tmp > 0 (flags from "sub tmp, 1"; SSE ops leave EFLAGS intact)
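; The pcmpgtb/pand/paddb/pxor sequence above multiplies every byte of q
; by 2 in GF(2^8): paddb doubles each byte (a left shift by 1), and the
; masked polynomial is XORed in to reduce the bytes that overflowed.
; A scalar C sketch of one step of this Horner-style recurrence
; (illustrative only):
;
;     static uint8_t gf_mul2(uint8_t q)
;     {
;         uint8_t mask = (q & 0x80) ? 0x1d : 0x00;  // reduce on overflow
;         return (uint8_t)(q << 1) ^ mask;
;     }
;     // Per byte: q = gf_mul2(q ^ s[i]) for i = n-1 .. 1,
;     // then q ^= s[0] below, with no final doubling.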

        mov     ptr, [arg2+PS+vec*PS]   ; Get address of P parity vector
        mov     tmp, [arg2+(2*PS)+vec*PS] ; Get address of Q parity vector
        pxor    xp1, xs1                ; p1 ^= s1[0] - last source is already loaded
        pxor    xq1, xs1                ; q1 ^= 1 * s1[0]
        pxor    xp2, xs2                ; p2 ^= s2[0]
        pxor    xq2, xs2                ; q2 ^= 1 * s2[0]
        XSTR    [ptr+pos], xp1          ; Write parity P1 vector
        XSTR    [ptr+pos+16], xp2       ; Write parity P2 vector
        XSTR    [tmp+pos], xq1          ; Write parity Q1 vector
        XSTR    [tmp+pos+16], xq2       ; Write parity Q2 vector
        add     pos, 32
        cmp     pos, len
        jle     loop32

        ;; ------------------------------
        ;; Handle any remaining 16-byte block
        add     len, 32         ; Undo the earlier bias on len
        cmp     pos, len
        je      return_pass

loop16:
        mov     ptr, [arg2+vec*PS]      ; Fetch last source pointer
        mov     tmp, vec                ; Set tmp to point back to last vector
        XLDR    xs1, [ptr+pos]          ; Preload last vector (source)
        pxor    xp1, xp1                ; p = 0
        pxor    xq1, xq1                ; q = 0

next_vect16:
        sub     tmp, 1                  ; Inner loop for each source vector
        mov     ptr, [arg2+tmp*PS]      ; Get pointer to next vect
        pxor    xq1, xs1                ; q1 ^= s1
        pxor    xtmp1, xtmp1            ; xtmp = 0
        pcmpgtb xtmp1, xq1              ; xtmp = mask 0xff or 0x00 if bit7 set
        pand    xtmp1, xpoly            ; xtmp = poly or 0x00
        pxor    xp1, xs1                ; p ^= s
        paddb   xq1, xq1                ; q = q<<1
        pxor    xq1, xtmp1              ; q = q<<1 ^ poly_masked
        XLDR    xs1, [ptr+pos]          ; Get next vector (source data)
        jg      next_vect16             ; Loop while tmp > 0 (flags from "sub tmp, 1")

        mov     ptr, [arg2+PS+vec*PS]   ; Get address of P parity vector
        mov     tmp, [arg2+(2*PS)+vec*PS] ; Get address of Q parity vector
        pxor    xp1, xs1                ; p ^= s[0] - last source is already loaded
        pxor    xq1, xs1                ; q ^= 1 * s[0]
        XSTR    [ptr+pos], xp1          ; Write parity P vector
        XSTR    [tmp+pos], xq1          ; Write parity Q vector
        add     pos, 16
        cmp     pos, len
        jl      loop16


return_pass:
        mov     return, 0
        FUNC_RESTORE
        ret

return_fail:
        mov     return, 1
        FUNC_RESTORE
        ret

endproc_frame

section .data

align 16
poly:
dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d ; low byte of the RAID-6 GF(2^8) polynomial x^8+x^4+x^3+x^2+1 (0x11d), one copy per byte lane

;;; func        core, ver, snum
slversion pq_gen_sse, 00, 08, 0032