1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_copy_by4(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; unsigned char *dst, //buffer pointer destination for copy
34 ; const unsigned char *src, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://download.intel.com/design/intarch/papers/323102.pdf
; Build-time configuration for this file.
; NOTE(review): this view shows the %ifidn below but no matching %else/%endif,
; and no definitions for arg2/arg3/arg4 even though the code uses them -
; confirm against the complete source before changing anything here.
47 %include "reg_sizes.asm"
; fetch_dist: byte offset added to arg3 in the prefetchnta hints of the
; 64-byte fold loop.
49 %define fetch_dist 1024
; Register names are selected by output format.
55 %ifidn __OUTPUT_FORMAT__, win64
; arg1_low32 is the 32-bit register the initial crc is read from (via movd).
61 %xdefine arg1_low32 ecx
; Second definition of arg1_low32 - presumably the non-win64 branch; the
; %else that would separate the two definitions is not visible here.
68 %xdefine arg1_low32 edi
;-----------------------------------------------------------------------
; crc16_t10dif_copy_by4
; Per the prototype comment in this file's header, the routine takes
; (init_crc, dst, src, len) and returns a UINT16.
; As used by the code below:
;   arg1_low32 - initial crc, loaded with movd
;   arg2       - base address of the movdqu stores (the copy)
;   arg3       - base address of the movdqu loads (the data being folded)
;   arg4       - used as a byte count when addressing the tail
; xmm6/xmm7 are stored to [rsp+16*2] / [rsp+16*3] here and reloaded from
; the same slots further down in this routine.
; NOTE(review): several instructions that the comments describe (scaling
; of the initial crc, the stack adjustment, the length compares, the
; register copies into xmm4/xmm5, the pxor steps and the loop labels) are
; not visible in this view - confirm against the complete source.
;-----------------------------------------------------------------------
72 global crc16_t10dif_copy_by4:ISAL_SYM_TYPE_FUNCTION
73 crc16_t10dif_copy_by4:
75 ; adjust the 16-bit initial_crc value, scale it to 32 bits
78 ; After this point, code flow is exactly same as a 32-bit CRC.
79 ; The only difference is before returning eax, we will shift
80 ; it right 16 bits, to scale back to 16 bits.
84 ; save xmm6 and xmm7 on the stack so they can be restored before returning
; (presumably because xmm6-xmm15 are callee-saved in the Windows x64 ABI -
; any format guard around this is not visible here).
; movdqa is an aligned store, so rsp must be 16-byte aligned at this point.
85 movdqa [rsp+16*2],xmm6
86 movdqa [rsp+16*3],xmm7
88 ; check if smaller than 128B
91 ; for sizes less than 128, we can't fold 64B at a time...
95 ; load the initial crc value
96 movd xmm6, arg1_low32 ; initial crc
98 ; crc value does not need to be byte-reflected, but it needs to
99 ; be moved to the high part of the register.
100 ; because data will be byte-reflected and will align with
101 ; initial crc at correct place.
104 movdqa xmm7, [SHUF_MASK] ; xmm7 = pshufb control used to byte-reflect each 16B block
105 ; receive the initial 64B data, xor the initial crc value
; Unaligned loads from the source at offsets 16/32/48 ...
107 movdqu xmm1, [arg3+16]
108 movdqu xmm2, [arg3+32]
109 movdqu xmm3, [arg3+48]
; ... and the matching unaligned stores of the same registers to the copy.
113 movdqu [arg2+16], xmm1
114 movdqu [arg2+32], xmm2
115 movdqu [arg2+48], xmm3
118 ; XOR the initial_crc value
124 movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
125 ;imm value of pclmulqdq instruction
126 ;will determine which constant to use
127 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
128 ; we subtract 128 instead of 64 to save one instruction from the loop
131 ; at this section of the code, there is 64*x+y (0<=y<64) bytes of
132 ; buffer. The _fold_64_B_loop
133 ; loop will fold 64B at a time until we have 64+y Bytes of buffer
136 ; fold 64B at a time. This section of the code folds 4 xmm
137 ; registers in parallel
140 ; update the buffer pointer
141 add arg3, 64 ; buf += 64;
144 prefetchnta [arg3+fetch_dist+0] ; non-temporal prefetch hint, fetch_dist bytes ahead of arg3
; pclmulqdq selector: imm 0x11 multiplies the high qwords of both operands,
; imm 0x0 multiplies the low qwords of both operands.
148 pclmulqdq xmm0, xmm6 , 0x11
149 pclmulqdq xmm1, xmm6 , 0x11
151 pclmulqdq xmm4, xmm6, 0x0
152 pclmulqdq xmm5, xmm6, 0x0
157 prefetchnta [arg3+fetch_dist+32]
161 pclmulqdq xmm2, xmm6, 0x11
162 pclmulqdq xmm3, xmm6, 0x11
164 pclmulqdq xmm4, xmm6, 0x0
165 pclmulqdq xmm5, xmm6, 0x0
; Load the next source blocks and store them unchanged to the copy; xmm4/xmm5
; are reused as temporaries for the freshly loaded data.
171 movdqu xmm5, [arg3+16]
173 movdqu [arg2+16], xmm5
179 movdqu xmm4, [arg3+32]
180 movdqu xmm5, [arg3+48]
181 movdqu [arg2+32], xmm4
182 movdqu [arg2+48], xmm5
191 ; check if there is another 64B in the buffer to be able to fold
193 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
198 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
199 ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
202 ; fold the 4 xmm registers to 1 xmm register with different constants
204 movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
205 ;imm value of pclmulqdq instruction will
206 ;determine which constant to use
; Three fold steps, one each for xmm0, xmm1 and xmm2; each pairs a
; high-qword multiply of the accumulator with a low-qword multiply of xmm4.
209 pclmulqdq xmm0, xmm6, 0x11
210 pclmulqdq xmm4, xmm6, 0x0
215 pclmulqdq xmm1, xmm6, 0x11
216 pclmulqdq xmm4, xmm6, 0x0
221 pclmulqdq xmm2, xmm6, 0x11
222 pclmulqdq xmm4, xmm6, 0x0
227 ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
228 ; instead of a cmp instruction, we use the negative flag with the jl instruction
; NOTE(review): the add that sets the flags tested here is not visible in this view.
230 jl _final_reduction_for_128
232 ; now we have 16+y bytes left to reduce. 16 Bytes
233 ; is in register xmm3 and the rest is in memory
234 ; we can fold 16 bytes at a time if y>=16
235 ; continue folding 16B at a time
239 pclmulqdq xmm3, xmm6, 0x11
240 pclmulqdq xmm4, xmm6, 0x0
249 ; instead of a cmp instruction, we utilize the flags with the jge instruction
250 ; equivalent of: cmp arg4, 16-16
251 ; check if there is any more 16B in the buffer to be able to fold
; NOTE(review): the _16B_reduction_loop label itself is not visible in this view.
252 jge _16B_reduction_loop
254 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
255 ;first, we reduce the data in the xmm3 register
;-----------------------------------------------------------------------
; Tail handling, final 128-bit reduction and the small-buffer entry paths.
; NOTE(review): many instructions and the labels that separate these paths
; (targets such as _16B_reduction_loop, _get_last_two_xmms and
; _less_than_16_left are jumped to but not defined in this view) are not
; visible here - confirm against the complete source before editing.
;-----------------------------------------------------------------------
258 _final_reduction_for_128:
259 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
263 ; here we are getting data that is less than 16 bytes.
264 ; since we know that there was data before the pointer,
265 ; we can offset the input pointer before the actual point,
266 ; to receive exactly 16 bytes.
267 ; after that the registers need to be adjusted.
; Load the 16 bytes that end arg4 bytes past arg3, and store them at the
; same offset relative to arg2 so the copy covers the tail as well.
271 movdqu xmm1, [arg3 - 16 + arg4]
272 movdqu [arg2 - 16 + arg4], xmm1
275 ; get rid of the extra data that was loaded before
276 ; load the shift constant
277 lea rax, [pshufb_shf_table + 16] ; rax = address 16 bytes into the shuffle-control table
281 ; shift xmm2 to the left by arg4 bytes
284 ; shift xmm3 to the right by 16-arg4 bytes
287 pblendvb xmm1, xmm2 ;xmm0 is implicit
; Fold step: high-qword multiply of xmm3 and low-qword multiply of xmm4 by xmm6.
292 pclmulqdq xmm3, xmm6, 0x11
293 pclmulqdq xmm4, xmm6, 0x0
298 ; compute crc of a 128-bit value
299 movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
; imm 0x1: high qword of xmm3 times low qword of xmm6.
303 pclmulqdq xmm3, xmm6, 0x1
; imm 0x10: low qword of xmm3 times high qword of xmm6.
313 pclmulqdq xmm3, xmm6, 0x10
318 movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
; imm 0x01: high qword of xmm3 times low qword of xmm6;
; imm 0x11: high qword of xmm3 times high qword of xmm6.
320 pclmulqdq xmm3, xmm6, 0x01
322 pclmulqdq xmm3, xmm6, 0x11
329 ; scale the result back to 16 bits
; Reload xmm6/xmm7 from the stack slots they were stored to at function entry.
331 movdqa xmm6, [rsp+16*2]
332 movdqa xmm7, [rsp+16*3]
337 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
338 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
339 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
340 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Small-buffer path (presumably the target of the "smaller than 128B" check
; near the function entry; its label is not visible in this view).
345 ; check if there is enough buffer to be able to fold 16B at a time
348 movdqa xmm7, [SHUF_MASK] ; xmm7 = byte-reflection shuffle control
350 ; if there is, load the constants
351 movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
353 movd xmm0, arg1_low32 ; get the initial crc value
; Shift left by 12 bytes: the 32-bit value ends up in the top dword of xmm0.
354 pslldq xmm0, 12 ; align it to its correct place
355 movdqu xmm3, [arg3] ; load the plaintext
356 movdqu [arg2], xmm3 ; store copy
357 pshufb xmm3, xmm7 ; byte-reflect the plaintext
361 ; update the buffer pointer
365 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
368 jmp _16B_reduction_loop
373 ; mov initial crc to the return value. this is necessary for zero-length buffers.
378 movdqa xmm7, [SHUF_MASK]
380 movd xmm0, arg1_low32 ; get the initial crc value
381 pslldq xmm0, 12 ; align it to its correct place
; NOTE(review): the compare that sets the flags for this jl is not visible in this view.
385 jl _less_than_16_left
387 movdqu xmm3, [arg3] ; load the plaintext
388 movdqu [arg2], xmm3 ; store the copy
389 pshufb xmm3, xmm7 ; byte-reflect the plaintext
390 pxor xmm3, xmm0 ; xor the initial crc value
394 movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
395 jmp _get_last_two_xmms
400 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
409 ; backup the counter value
; The lines below each xor the initial crc (xmm0) into the loaded data (xmm3);
; they presumably belong to separate short-length branches whose labels and
; remaining instructions are not visible in this view.
459 pxor xmm3, xmm0 ; xor the initial crc value
462 lea rax, [pshufb_shf_table + 16]
475 pxor xmm3, xmm0 ; xor the initial crc value
498 pxor xmm3, xmm0 ; xor the initial crc value
518 pxor xmm3, xmm0 ; xor the initial crc value
532 pxor xmm3, xmm0 ; xor the initial crc value
540 ; precomputed constants
541 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
; NOTE(review): the labels for the values below (rk1..rk8, SHUF_MASK,
; pshufb_shf_table, ...) are not visible in this view. The per-value names
; given here are inferred from the order of the formula list and from the
; code loading the constants in 16-byte pairs ([rk1], [rk3], [rk5], [rk7]) -
; confirm against the complete source.
544 ; rk1 = 2^(32*3) mod Q << 32
545 ; rk2 = 2^(32*5) mod Q << 32
546 ; rk3 = 2^(32*15) mod Q << 32
547 ; rk4 = 2^(32*17) mod Q << 32
548 ; rk5 = 2^(32*3) mod Q << 32
549 ; rk6 = 2^(32*2) mod Q << 32
550 ; rk7 = floor(2^64/Q)
553 DQ 0x2d56000000000000 ; presumably rk1
555 DQ 0x06df000000000000 ; presumably rk2
557 DQ 0x044c000000000000 ; presumably rk3
559 DQ 0xe658000000000000 ; presumably rk4
561 DQ 0x2d56000000000000 ; presumably rk5 (same formula and same value as rk1)
563 DQ 0x1368000000000000 ; presumably rk6
565 DQ 0x00000001f65a57f8 ; presumably rk7
567 DQ 0x000000018bb70000 ; presumably rk8: the scaled poly 0x8bb70000 with bit 32 set
; 16-byte constant with every byte equal to 0x80 (label and use not visible in this view).
569 dq 0x8080808080808080, 0x8080808080808080
; 16-byte constant with the low 96 bits set and the top 32 bits clear
; (label and use not visible in this view).
571 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte indices 15 down to 0: as a pshufb control this reverses the 16 bytes of
; a register. Presumably SHUF_MASK, which the code uses to byte-reflect the data.
574 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
577 ; use these values for shift constants for the pshufb instruction
578 ; different alignments result in values as shown:
579 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
580 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
581 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
582 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
583 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
584 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
585 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
586 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
587 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
588 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
589 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
590 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
591 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
592 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
593 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The two rows below are presumably the pshufb_shf_table data that the code
; addresses with `lea rax, [pshufb_shf_table + 16]`.
594 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
595 dq 0x0706050403020100, 0x000e0d0c0b0a0908
; Version record for crc16_t10dif_copy_by4. slversion is a macro that is not
; defined in this view (presumably provided by the included reg_sizes.asm -
; confirm); the operands follow the order given in the comment below.
597 ;;; func core, ver, snum
598 slversion crc16_t10dif_copy_by4, 05, 02, 0000