1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; uint64_t crc64_iso_refl_by8(
33 ; uint64_t init_crc, //initial CRC value, 64 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; uint64_t len //buffer length in bytes (64-bit data)
; NOTE(review): every line of this listing carries a stale line-number prefix
; from the original file, and the numbering gaps show that many lines are
; elided (including the %else/%endif partners of the %ifidn blocks below).
38 %include "reg_sizes.asm"            ; shared ISA-L macro definitions (presumably incl. arg register aliases — verify in full source)
40 %define fetch_dist 1024             ; prefetchnta look-ahead distance in bytes for the fold loop
48 %ifidn __OUTPUT_FORMAT__, win64     ; ABI-specific setup; alternative branch on elided lines
59 %ifidn __OUTPUT_FORMAT__, win64
61 %define VARIABLE_OFFSET 16*10+8     ; Win64 frame: room for the 8 xmm6-xmm13 save slots (+ pad) and 16B alignment
63 %define VARIABLE_OFFSET 16*2+8      ; non-Win64 frame: small scratch area only (no xmm saves needed on SysV)
;-----------------------------------------------------------------------
; uint64_t crc64_iso_refl_by8(uint64_t init_crc, const unsigned char *buf,
;                             uint64_t len)
; Bit-reflected CRC64-ISO computed with PCLMULQDQ carry-less multiply,
; folding 128 bytes (8 xmm registers) per main-loop iteration.
; In:   arg1 = initial CRC, arg2 = buffer pointer, arg3 = length in bytes
; Out:  rax = CRC64 of the buffer (final move/ret are on elided lines)
; NOTE(review): this listing is non-contiguous — labels referenced below
; (_fold_128_B_loop, _16B_reduction_loop, _get_last_two_xmms,
; _less_than_16_left, the rkN constant labels) are defined on elided lines,
; as are the xor/copy instructions interleaved with the pclmulqdq pairs.
;-----------------------------------------------------------------------
68 global crc64_iso_refl_by8:function
70 ; uint64_t c = crc ^ 0xFFFFFFFFFFFFFFFFULL;  (CRC64-ISO inverts the CRC on entry; re-inverted before return)
72 sub rsp, VARIABLE_OFFSET               ; allocate frame; size chosen per ABI so the movdqa saves below stay 16B-aligned
74 %ifidn __OUTPUT_FORMAT__, win64
75 ; save callee-saved xmm registers (xmm6-xmm15 are non-volatile in the Microsoft x64 ABI)
76 movdqa [rsp + XMM_SAVE + 16*0], xmm6
77 movdqa [rsp + XMM_SAVE + 16*1], xmm7
78 movdqa [rsp + XMM_SAVE + 16*2], xmm8
79 movdqa [rsp + XMM_SAVE + 16*3], xmm9
80 movdqa [rsp + XMM_SAVE + 16*4], xmm10
81 movdqa [rsp + XMM_SAVE + 16*5], xmm11
82 movdqa [rsp + XMM_SAVE + 16*6], xmm12
83 movdqa [rsp + XMM_SAVE + 16*7], xmm13
86 ; check if smaller than 256B
89 ; for sizes less than 256, we can't fold 128B at a time...
93 ; load the initial crc value
94 movq xmm10, arg1 ; initial crc
95 ; receive the initial 128B data, xor the initial crc value
96 movdqu xmm0, [arg2+16*0]               ; unaligned loads: caller's buffer has no alignment guarantee
97 movdqu xmm1, [arg2+16*1]
98 movdqu xmm2, [arg2+16*2]
99 movdqu xmm3, [arg2+16*3]
100 movdqu xmm4, [arg2+16*4]
101 movdqu xmm5, [arg2+16*5]
102 movdqu xmm6, [arg2+16*6]
103 movdqu xmm7, [arg2+16*7]
105 ; XOR the initial_crc value
107 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 (the 128B-distance folding constants)
108 ;imm value of pclmulqdq instruction will determine which constant to use
109 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
110 ; we subtract 256 instead of 128 to save one instruction from the loop
113 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
114 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
117 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
120 ; update the buffer pointer
; In each 32B group below, xmm9/xmm12 receive fresh data while xmm8/xmm13
; presumably hold copies of the fold registers made on elided lines; the
; 0x10 / 0x1 immediates select which 64-bit halves (rk3 vs rk4) multiply.
123 prefetchnta [arg2+fetch_dist+0]        ; stream ahead; fetch_dist = 1024 bytes
124 movdqu xmm9, [arg2+16*0]
125 movdqu xmm12, [arg2+16*1]
128 pclmulqdq xmm0, xmm10, 0x10
129 pclmulqdq xmm8, xmm10 , 0x1
130 pclmulqdq xmm1, xmm10, 0x10
131 pclmulqdq xmm13, xmm10 , 0x1
137 prefetchnta [arg2+fetch_dist+32]
138 movdqu xmm9, [arg2+16*2]
139 movdqu xmm12, [arg2+16*3]
142 pclmulqdq xmm2, xmm10, 0x10
143 pclmulqdq xmm8, xmm10 , 0x1
144 pclmulqdq xmm3, xmm10, 0x10
145 pclmulqdq xmm13, xmm10 , 0x1
151 prefetchnta [arg2+fetch_dist+64]
152 movdqu xmm9, [arg2+16*4]
153 movdqu xmm12, [arg2+16*5]
156 pclmulqdq xmm4, xmm10, 0x10
157 pclmulqdq xmm8, xmm10 , 0x1
158 pclmulqdq xmm5, xmm10, 0x10
159 pclmulqdq xmm13, xmm10 , 0x1
165 prefetchnta [arg2+fetch_dist+96]
166 movdqu xmm9, [arg2+16*6]
167 movdqu xmm12, [arg2+16*7]
170 pclmulqdq xmm6, xmm10, 0x10
171 pclmulqdq xmm8, xmm10 , 0x1
172 pclmulqdq xmm7, xmm10, 0x10
173 pclmulqdq xmm13, xmm10 , 0x1
181 ; check if there is another 128B in the buffer to be able to fold
183 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
186 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
187 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
190 ; fold the 8 xmm registers to 1 xmm register with different constants
; Each pclmulqdq pair below folds one of xmm0..xmm6 toward the xmm7
; accumulator; xmm10 is presumably reloaded with a different rk pair before
; each step on elided lines.
194 pclmulqdq xmm0, xmm10, 0x1
195 pclmulqdq xmm8, xmm10, 0x10
201 pclmulqdq xmm1, xmm10, 0x1
202 pclmulqdq xmm8, xmm10, 0x10
208 pclmulqdq xmm2, xmm10, 0x1
209 pclmulqdq xmm8, xmm10, 0x10
215 pclmulqdq xmm3, xmm10, 0x1
216 pclmulqdq xmm8, xmm10, 0x10
222 pclmulqdq xmm4, xmm10, 0x1
223 pclmulqdq xmm8, xmm10, 0x10
229 pclmulqdq xmm5, xmm10, 0x1
230 pclmulqdq xmm8, xmm10, 0x10
236 pclmulqdq xmm6, xmm10, 0x1
237 pclmulqdq xmm8, xmm10, 0x10
242 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
243 ; instead of a cmp instruction, we use the negative flag with the jl instruction
245 jl _final_reduction_for_128            ; fewer than 16 whole bytes remain in memory
247 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
248 ; we can fold 16 bytes at a time if y>=16
249 ; continue folding 16B at a time
253 pclmulqdq xmm7, xmm10, 0x1             ; body of _16B_reduction_loop (loop label on an elided line)
254 pclmulqdq xmm8, xmm10, 0x10
260 ; instead of a cmp instruction, we utilize the flags with the jge instruction
261 ; equivalent of: cmp arg3, 16-16
262 ; check if there is any more 16B in the buffer to be able to fold
263 jge _16B_reduction_loop
265 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
266 ;first, we reduce the data in the xmm7 register
269 _final_reduction_for_128:
272 ; here we are getting data that is less than 16 bytes.
273 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
274 ; after that the registers need to be adjusted.
279 movdqu xmm1, [arg2 - 16 + arg3]        ; last 16 bytes of the buffer (overlaps bytes already folded; fixed up below)
281 ; get rid of the extra data that was loaded before
282 ; load the shift constant
283 lea rax, [pshufb_shf_table]            ; NOTE(review): absolute address; [rel pshufb_shf_table] would be PIE-safe
292 pblendvb xmm2, xmm1 ;xmm0 is the implicit blend mask operand
295 pclmulqdq xmm7, xmm10, 0x1
297 pclmulqdq xmm8, xmm10, 0x10
302 ; compute crc of a 128-bit value
307 pclmulqdq xmm7, xmm10, 0               ; fold 128 bits toward 64 (constants loaded on elided lines)
316 pclmulqdq xmm7, xmm10, 0               ; Barrett reduction step (cf. rk7 = floor(2^128/Q) in the constant table)
318 pclmulqdq xmm7, xmm10, 0x10            ; ...then multiply by Q; the remainder is the CRC
325 ; return c ^ 0xFFFFFFFFFFFFFFFFULL;  (final inversion and mov to rax on elided lines)
329 %ifidn __OUTPUT_FORMAT__, win64
330 movdqa xmm6, [rsp + XMM_SAVE + 16*0]   ; restore the Win64 callee-saved xmm registers
331 movdqa xmm7, [rsp + XMM_SAVE + 16*1]
332 movdqa xmm8, [rsp + XMM_SAVE + 16*2]
333 movdqa xmm9, [rsp + XMM_SAVE + 16*3]
334 movdqa xmm10, [rsp + XMM_SAVE + 16*4]
335 movdqa xmm11, [rsp + XMM_SAVE + 16*5]
336 movdqa xmm12, [rsp + XMM_SAVE + 16*6]
337 movdqa xmm13, [rsp + XMM_SAVE + 16*7]
339 add rsp, VARIABLE_OFFSET               ; tear down frame (ret is on an elided line)
342 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
343 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
344 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
345 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; --- small-buffer path (entry label on an elided line): fold 16B at a time ---
350 ; check if there is enough buffer to be able to fold 16B at a time
354 ; if there is, load the constants
355 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
357 movq xmm0, arg1 ; get the initial crc value
358 movdqu xmm7, [arg2] ; load the plaintext
361 ; update the buffer pointer
364 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
367 jmp _16B_reduction_loop
371 ; mov initial crc to the return value. this is necessary for zero-length buffers.
376 movq xmm0, arg1 ; get the initial crc value
380 jl _less_than_16_left                  ; fewer than 16 bytes in total
382 movdqu xmm7, [arg2] ; load the plaintext
383 pxor xmm7, xmm0 ; xor the initial crc value
386 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
387 jmp _get_last_two_xmms
392 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
398 ; backup the counter value
441 pxor xmm7, xmm0 ; xor the initial crc value
443 lea rax,[pshufb_shf_table]             ; NOTE(review): absolute address; [rel pshufb_shf_table] would be PIE-safe
449 movdqu xmm0, [rax + r9]                ; pshufb control word selected by the leftover length in r9
454 ; Left shift (8-length) bytes in XMM
455 movdqu xmm0, [rax + r9 + 8]
463 pxor xmm7, xmm0 ; xor the initial crc value
469 ; precomputed constants
471 ; rk7 = floor(2^128/Q)                  (Barrett-reduction quotient; the other rkN derivation comments are elided)
; NOTE(review): the rkN: label lines are elided from this listing; the 20 DQ
; values below presumably correspond to rk1..rk20 in order — verify against
; the full source before editing any of them.
474 DQ 0xf500000000000001
476 DQ 0x6b70000000000001
478 DQ 0xb001000000010000
480 DQ 0xf501b0000001b000
482 DQ 0xf500000000000001
484 DQ 0x0000000000000000
486 DQ 0xb000000000000001
488 DQ 0xb000000000000000
490 DQ 0xe014514514501501
492 DQ 0x771db6db6db71c71
494 DQ 0xa101101101110001
496 DQ 0x1ab1ab1ab1aab001
498 DQ 0xf445014445000001
500 DQ 0x6aab71daab700001
502 DQ 0xb100010100000001
504 DQ 0x01b001b1b0000001
506 DQ 0xe145150000000001
508 DQ 0x76db6c7000000001
510 DQ 0xa011000000000001
512 DQ 0x1b1ab00000000001
; --- pshufb_shf_table (label on an elided line) ---
515 ; use these values for shift constants for the pshufb instruction
516 ; different alignments result in values as shown:
517 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
518 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
519 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
520 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
521 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
522 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
523 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
524 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
525 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
526 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
527 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
528 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
529 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
530 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
531 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
532 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
533 dq 0x0706050403020100, 0x000e0d0c0b0a0908
; masks below (labels elided): presumably all-ones/zero, low-dword-clear, and
; the 0x80-byte pattern (pshufb emits 0 for any control byte with bit 7 set)
537 dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
539 dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
541 dq 0x8080808080808080, 0x8080808080808080
543 ;;; func core, ver, snum
544 slversion crc64_iso_refl_by8, 01, 00, 0023