ceph/src/isa-l/igzip/adler32_sse.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
  31
  32 %define LIMIT 5552           ; max bytes accumulated per chunk before the sums are reduced mod BASE
  33 %define BASE  0xFFF1 ; 65521, the modulus applied to both running sums
  34
  35 %include "reg_sizes.asm"
  36
  37 default rel                  ; memory operands such as [A_SCALE] are RIP-relative
  38 [bits 64]
  39
  40 ; need to keep free: eax, ecx, edx (they are the operands of the div-based reductions)
  41
  42 %ifidn __OUTPUT_FORMAT__, elf64
  43  %define arg1   rdi                    ; System V AMD64 argument registers: arg1 = init
  44  %define arg2   rsi                    ; arg2 = buf
  45  %define arg3   rdx                    ; arg3 = len (copied to size before edx is used by div)
  46
  47  %define init_d edi                    ; low dword of arg1, used in place
  48  %define data   r9                     ; current read pointer
  49  %define size   r10                    ; bytes left, decremented once per completed chunk
  50  %define s      r11                    ; length of the current chunk
  51  %define a_d    r12d                   ; scalar sum a; r12 is callee-saved, hence FUNC_SAVE
  52  %define b_d    r8d                    ; scalar sum b
  53  %define end    r13                    ; end-of-chunk pointer; r13 is callee-saved, hence FUNC_SAVE
  54
  55  %define func(x) x:                    ; function entry is a plain label for this format
  56  %macro FUNC_SAVE 0                    ; prologue: save the callee-saved registers used (r12, r13)
  57         push    r12
  58         push    r13
  59  %endmacro
  60 %macro FUNC_RESTORE 0                  ; epilogue: pop in the reverse order of FUNC_SAVE
  61         pop     r13
  62         pop     r12
  63  %endmacro
  64 %endif
  65
  66
  67 %ifidn __OUTPUT_FORMAT__, win64
  68  %define arg1   rcx                    ; Microsoft x64 argument registers: arg1 = init
  69  %define arg2   rdx                    ; arg2 = buf (copied to data before edx is used by div)
  70  %define arg3   r8                     ; arg3 = len
  71
  72  %define init_d r12d                   ; copy of arg1 made in FUNC_SAVE so ecx stays free for div
  73  %define data   r9                     ; current read pointer
  74  %define size   r10                    ; bytes left, decremented once per completed chunk
  75  %define s      r11                    ; length of the current chunk
  76  %define a_d    esi                    ; scalar sum a; rsi is callee-saved in this ABI, hence FUNC_SAVE
  77  %define b_d    edi                    ; scalar sum b; rdi is callee-saved in this ABI, hence FUNC_SAVE
  78  %define end    r13                    ; end-of-chunk pointer
  79
  80  %define stack_size  5*8                ; must be an odd multiple of 8 (rsp is 8 mod 16 at entry); only four of the five slots are used
  81  %define func(x) proc_frame x           ; proc_frame/alloc_stack/save_reg/end_prolog are not defined in this file - presumably unwind helpers from the assembler or reg_sizes.asm; confirm
  82  %macro FUNC_SAVE 0                     ; prologue: reserve the frame and spill rdi, rsi, r12, r13
  83         alloc_stack     stack_size
  84         save_reg        rdi,  0*8
  85         save_reg        rsi,  1*8
  86         save_reg        r12,  2*8
  87         save_reg        r13,  3*8
  88         end_prolog
  89         mov     init_d, ecx     ; initialize init_d from arg1 to keep ecx free
  90  %endmacro
  91
  92  %macro FUNC_RESTORE 0                  ; epilogue: reload the four spilled registers from the same slots, then release the frame
  93         mov     rdi,  [rsp + 0*8]
  94         mov     rsi,  [rsp + 1*8]
  95         mov     r12,  [rsp + 2*8]
  96         mov     r13,  [rsp + 3*8]
  97         add     rsp, stack_size
  98  %endmacro
  99 %endif
 100
 101 %define xa      xmm0        ; four 32-bit partial sums of a, one per lane
 102 %define xb      xmm1        ; four 32-bit partial sums of b, one per lane
 103 %define xdata0  xmm2        ; first 4 input bytes of an iteration, widened to dwords
 104 %define xdata1  xmm3        ; next 4 input bytes of an iteration, widened to dwords
 105 %define xsa     xmm4        ; copy of xa with lane j multiplied by j (see A_SCALE)
 106
 107 global adler32_sse:ISAL_SYM_TYPE_FUNCTION  ; exported entry point: Adler-32 of buf[0..len) continued from init
 108 func(adler32_sse)                          ; in: arg1 = init (b in bits 31..16, a in bits 15..0), arg2 = buf, arg3 = len; out: eax = (b << 16) | a
 109         FUNC_SAVE                          ; preserve the callee-saved registers used below
 110 ; a_d/b_d are the scalar sums a and b; xa/xb hold four interleaved partial sums (lane j sees bytes j, j+4, j+8, ...)
 111         mov     data, arg2
 112         mov     size, arg3
 113
 114         mov     b_d, init_d
 115         shr     b_d, 16                    ; b = init >> 16
 116         and     init_d, 0xFFFF             ; a = init & 0xFFFF (kept in init_d for now)
 117         cmp     size, 32
 118         jb      .lt64                      ; fewer than 32 bytes: scalar byte loop only (the label says 64, the threshold is 32)
 119         movd    xa, init_d                 ; xa = {a, 0, 0, 0}
 120         pxor    xb, xb                     ; xb = 0; the incoming b stays in b_d and is added back when the lanes are folded
 121 .sloop1:                                   ; outer loop: one chunk of at most LIMIT bytes per pass
 122         mov     s, LIMIT
 123         cmp     s, size
 124         cmova   s, size         ; s = min(size, LIMIT)
 125         lea     end, [data + s - 7]        ; the vector loop runs while at least 8 bytes of the chunk remain
 126         cmp     data, end
 127         jae     .skip_loop_1a              ; chunk shorter than 8 bytes: nothing for the vector loop
 128 align 32
 129 .sloop1a:
 130         ; do 8 adds: consume 8 input bytes, one per lane in each of two steps
 131         pmovzxbd xdata0, [data]            ; bytes 0..3 zero-extended to four dwords
 132         pmovzxbd xdata1, [data + 4]        ; bytes 4..7 zero-extended to four dwords
 133         add     data, 8
 134         paddd   xa, xdata0                 ; per lane: a += byte
 135         paddd   xb, xa                     ; per lane: b += a
 136         paddd   xa, xdata1
 137         paddd   xb, xa
 138         cmp     data, end
 139         jb      .sloop1a
 140
 141 .skip_loop_1a:
 142         add     end, 7                     ; end = one past the last byte of this chunk
 143
 144         test    s, 7
 145         jnz     .do_final                  ; chunk length is not a multiple of 8: 1..7 tail bytes are left for the byte loop
 146
 147         ; either we're done, or we just did LIMIT
 148         sub     size, s
 149
 150         ; reduce: fold the lanes into scalars as b = b_in + 4*sum(xb[j]) - sum(j * xa[j]), a = sum(xa[j]), both mod BASE
 151         pslld   xb, 2   ; b is scaled by 4 (each lane step stands for 4 input bytes)
 152         movdqa  xsa, xa ; scaled a
 153         pmulld  xsa, [A_SCALE]             ; lane j of a multiplied by j (A_SCALE = {0, 1, 2, 3})
 154 ; two phaddd passes leave the sum of all four lanes in lane 0
 155         phaddd  xa, xa
 156         phaddd  xb, xb
 157         phaddd  xsa, xsa
 158         phaddd  xa, xa
 159         phaddd  xb, xb
 160         phaddd  xsa, xsa
 161
 162         movd    eax, xa
 163         xor     edx, edx
 164         mov     ecx, BASE
 165         div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
 166         mov     a_d, edx                   ; a = sum(xa[j]) mod BASE
 167
 168         psubd   xb, xsa                    ; subtract the per-lane position correction sum(j * xa[j])
 169         movd    eax, xb
 170         add     eax, b_d                   ; add the b carried in from before this chunk
 171         xor     edx, edx
 172         mov     ecx, BASE
 173         div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
 174         mov     b_d, edx                   ; b reduced mod BASE
 175
 176         test    size, size
 177         jz      .finish
 178
 179         ; continue loop: reseed the vector state from the reduced scalars
 180         movd    xa, a_d
 181         pxor    xb, xb
 182         jmp     .sloop1
 183
 184 .finish:                                   ; a_d and b_d are already reduced here: pack and return
 185         mov     eax, b_d
 186         shl     eax, 16
 187         or      eax, a_d
 188         jmp     .end
 189
 190 .lt64:                                     ; short input (size < 32): no vector state has been set up
 191         mov     a_d, init_d
 192         lea     end, [data + size]
 193         test    size, size
 194         jnz     .final_loop
 195         jmp     .zero_size                 ; empty input: only reduce and repack the initial sums
 196
 197         ; handle remaining 1...7 bytes of the last chunk (reached only when s is not a multiple of 8)
 198 .do_final:
 199         ; reduce: same lane folding as above but without the modulo, which is done once at .zero_size
 200         pslld   xb, 2   ; b is scaled by 4
 201         movdqa  xsa, xa ; scaled a
 202         pmulld  xsa, [A_SCALE]
 203
 204         phaddd  xa, xa
 205         phaddd  xb, xb
 206         phaddd  xsa, xsa
 207         phaddd  xa, xa
 208         phaddd  xb, xb
 209         phaddd  xsa, xsa
 210         psubd   xb, xsa
 211
 212         movd    a_d, xa                    ; a = sum of the lanes, not yet reduced
 213         movd    eax, xb
 214         add     b_d, eax                   ; b = carried-in b + folded lane contribution, not yet reduced
 215
 216 align 32
 217 .final_loop:                               ; scalar loop: one byte per iteration until data reaches end
 218         movzx   eax, byte[data]
 219         add     a_d, eax                   ; a += byte
 220         inc     data
 221         add     b_d, a_d                   ; b += a
 222         cmp     data, end
 223         jb      .final_loop
 224
 225 .zero_size:                                ; common exit for the scalar paths: reduce both sums mod BASE and pack
 226         mov     eax, a_d
 227         xor     edx, edx
 228         mov     ecx, BASE
 229         div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
 230         mov     a_d, edx
 231
 232         mov     eax, b_d
 233         xor     edx, edx
 234         mov     ecx, BASE
 235         div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
 236         shl     edx, 16                    ; edx = (b mod BASE) << 16
 237         or      edx, a_d
 238         mov     eax, edx                   ; return (b << 16) | a
 239
 240 .end:
 241         FUNC_RESTORE
 242         ret
 243 ; NOTE(review): func() opens a proc_frame only in the win64 branch; on elf64 the next line presumably relies on a definition from reg_sizes.asm - confirm
 244 endproc_frame
 245
 246 section .data
 247 align 32
 248 A_SCALE:        ; per-lane weights for xa: dwords {0, 1, 2, 3}, low lane first; used as the memory operand of pmulld
 249         dq      0x0000000100000000, 0x0000000300000002  ; NOTE(review): only read, never written, by the code in this file