1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_01(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
; Build-time setup: project include, a per-ABI alias for the low 32 bits of the first
; argument (the 16-bit initial CRC), the stack frame size, and the exported symbol.
45 %include "reg_sizes.asm"
52 %ifidn __OUTPUT_FORMAT__, win64
; Win64 output: the first integer argument arrives in rcx, so its low 32 bits are ecx.
57 %xdefine arg1_low32 ecx
; Second definition of the same alias: edi is the low half of rdi, the first System V
; AMD64 argument register. Presumably this is the non-Win64 branch of the conditional.
63 %xdefine arg1_low32 edi
66 %ifidn __OUTPUT_FORMAT__, win64
; Win64 frame: the slots [rsp+16*2] .. [rsp+16*9] hold the saved xmm6-xmm13.
; The extra 8 bytes make rsp 16-byte aligned after the sub (rsp % 16 == 8 on entry,
; because of the pushed return address), which the movdqa frame accesses require.
68 %define VARIABLE_OFFSET 16*10+8
; Smaller frame without the xmm save area -- presumably the non-Win64 branch.
70 %define VARIABLE_OFFSET 16*2+8
; Export the routine as a function symbol.
74 global crc16_t10dif_01:function
; Function prologue: reserve the stack frame and, on Win64, preserve xmm6-xmm13.
77 ; adjust the 16-bit initial_crc value, scale it to 32 bits
80 ; After this point, the code flow is exactly the same as for a 32-bit CRC.
81 ; The only difference is that, before returning, eax is shifted right by 16 bits to scale the result back to 16 bits.
83 sub rsp, VARIABLE_OFFSET ; reserve the local frame; released by the matching "add rsp, VARIABLE_OFFSET" before returning
84 %ifidn __OUTPUT_FORMAT__, win64
85 ; save xmm6-xmm13 on the stack: they are callee-saved in the Microsoft x64 ABI and are reloaded from these same slots before returning
86 movdqa [rsp+16*2],xmm6
87 movdqa [rsp+16*3],xmm7
88 movdqa [rsp+16*4],xmm8
89 movdqa [rsp+16*5],xmm9
90 movdqa [rsp+16*6],xmm10
91 movdqa [rsp+16*7],xmm11
92 movdqa [rsp+16*8],xmm12
93 movdqa [rsp+16*9],xmm13
; Large-buffer entry path: fetch the initial CRC and the first 128 bytes of data,
; then load the folding constants used by the 128-byte loop.
96 ; check if the length is smaller than 256
99 ; for sizes less than 256, we can't fold 128B at a time...
103 ; load the initial crc value
104 movd xmm10, arg1_low32 ; xmm10 = initial crc in the low dword; movd zeroes the remaining bits
106 ; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
107 ; because the data will be byte-reflected and will then align with the initial crc at the correct place.
110 movdqa xmm11, [SHUF_MASK] ; xmm11 = pshufb control used to byte-reflect each 16-byte block
111 ; receive the initial 128B data, xor the initial crc value
112 movdqu xmm0, [arg2+16*0] ; unaligned loads: xmm0..xmm7 = buf[0..127], 16 bytes per register
113 movdqu xmm1, [arg2+16*1]
114 movdqu xmm2, [arg2+16*2]
115 movdqu xmm3, [arg2+16*3]
116 movdqu xmm4, [arg2+16*4]
117 movdqu xmm5, [arg2+16*5]
118 movdqu xmm6, [arg2+16*6]
119 movdqu xmm7, [arg2+16*7]
122 ; XOR the initial_crc value
132 movdqa xmm10, [rk3] ; xmm10 is reused: the initial crc it held is overwritten with rk3 and rk4
133 ; the imm value of each pclmulqdq instruction determines which of the two constants is used
134 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
135 ; we subtract 256 instead of 128 to save one instruction from the loop
; 128-byte folding loop. The eight accumulators xmm0..xmm7 are processed in four pairs
; with the same pattern each time: load the next two 16-byte blocks into xmm9/xmm12,
; then carry-less multiply by the constants held in xmm10.
; pclmulqdq imm8 = 0x0 multiplies the low quadwords of both operands,
; imm8 = 0x11 multiplies the high quadwords of both operands.
138 ; at this section of the code, there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
139 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
142 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
145 ; update the buffer pointer
146 add arg2, 128 ; buf += 128;
148 movdqu xmm9, [arg2+16*0] ; next data for the xmm0/xmm1 pair
149 movdqu xmm12, [arg2+16*1]
154 pclmulqdq xmm0, xmm10, 0x0 ; low quadword of xmm0 times low quadword of xmm10
155 pclmulqdq xmm8, xmm10 , 0x11 ; high quadword of xmm8 times high quadword of xmm10
156 pclmulqdq xmm1, xmm10, 0x0
157 pclmulqdq xmm13, xmm10 , 0x11
163 movdqu xmm9, [arg2+16*2] ; next data for the xmm2/xmm3 pair
164 movdqu xmm12, [arg2+16*3]
169 pclmulqdq xmm2, xmm10, 0x0
170 pclmulqdq xmm8, xmm10 , 0x11
171 pclmulqdq xmm3, xmm10, 0x0
172 pclmulqdq xmm13, xmm10 , 0x11
178 movdqu xmm9, [arg2+16*4] ; next data for the xmm4/xmm5 pair
179 movdqu xmm12, [arg2+16*5]
184 pclmulqdq xmm4, xmm10, 0x0
185 pclmulqdq xmm8, xmm10 , 0x11
186 pclmulqdq xmm5, xmm10, 0x0
187 pclmulqdq xmm13, xmm10 , 0x11
193 movdqu xmm9, [arg2+16*6] ; next data for the xmm6/xmm7 pair
194 movdqu xmm12, [arg2+16*7]
199 pclmulqdq xmm6, xmm10, 0x0
200 pclmulqdq xmm8, xmm10 , 0x11
201 pclmulqdq xmm7, xmm10, 0x0
202 pclmulqdq xmm13, xmm10 , 0x11
210 ; check if there is another 128B in the buffer to be able to fold
212 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Collapse the eight accumulators into one. xmm0..xmm6 are each multiplied in turn:
; the named accumulator uses imm8 = 0x11 (high quadwords) and xmm8 uses imm8 = 0x0
; (low quadwords) -- the immediates are the other way round compared with the 128-byte loop.
216 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
217 ; fold the 8 xmm registers to 1 xmm register with different constants
221 pclmulqdq xmm0, xmm10, 0x11
222 pclmulqdq xmm8, xmm10, 0x0
228 pclmulqdq xmm1, xmm10, 0x11
229 pclmulqdq xmm8, xmm10, 0x0
235 pclmulqdq xmm2, xmm10, 0x11
236 pclmulqdq xmm8, xmm10, 0x0
242 pclmulqdq xmm3, xmm10, 0x11
243 pclmulqdq xmm8, xmm10, 0x0
249 pclmulqdq xmm4, xmm10, 0x11
250 pclmulqdq xmm8, xmm10, 0x0
256 pclmulqdq xmm5, xmm10, 0x11
257 pclmulqdq xmm8, xmm10, 0x0
261 movdqa xmm10, [rk1] ; xmm10 has rk1 and rk2; it stays loaded for the 16-byte folding that follows
262 ; the imm value of each pclmulqdq instruction determines which of the two constants is used
264 pclmulqdq xmm6, xmm10, 0x11
265 pclmulqdq xmm8, xmm10, 0x0
270 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
271 ; instead of a cmp instruction, we use the negative flag with the jl instruction
273 jl _final_reduction_for_128 ; counter went negative: fewer than 16 bytes remain, skip the 16-byte loop
; 16-byte folding loop: xmm7 is the single running accumulator and is folded with the
; constants in xmm10 once per remaining full 16-byte block.
275 ; now we have 16+y bytes left to reduce. 16 Bytes are in register xmm7 and the rest is in memory
276 ; we can fold 16 bytes at a time if y>=16
277 ; continue folding 16B at a time
281 pclmulqdq xmm7, xmm10, 0x11 ; high quadword of xmm7 times high quadword of xmm10
282 pclmulqdq xmm8, xmm10, 0x0 ; low quadword of xmm8 times low quadword of xmm10
289 ; instead of a cmp instruction, we utilize the flags with the jge instruction
290 ; equivalent of: cmp arg3, 16-16
291 ; check if there is any more 16B in the buffer to be able to fold
292 jge _16B_reduction_loop ; signed test: keep looping while the counter is not negative
294 ; now we have 16+z bytes left to reduce, where 0<= z < 16.
295 ; first, we reduce the data in the xmm7 register
; Tail handling: fewer than 16 bytes of input remain (arg3 holds that count).
; The last 16 bytes of the buffer are reloaded with an overlapping read and merged
; with the accumulator in xmm7 before one more fold.
298 _final_reduction_for_128:
299 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
303 ; here we are getting data that is less than 16 bytes.
304 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
305 ; after that the registers need to be adjusted.
309 movdqu xmm1, [arg2 - 16 + arg3] ; 16 bytes ending at arg2+arg3: the leftover bytes plus bytes that precede arg2
312 ; get rid of the extra data that was loaded before
313 ; load the shift constant
314 lea rax, [pshufb_shf_table + 16] ; rax = address 16 bytes into pshufb_shf_table
318 ; shift xmm2 to the left by arg3 bytes
321 ; shift xmm7 to the right by 16-arg3 bytes
324 pblendvb xmm1, xmm2 ; xmm0 is the implicit mask: bytes whose mask byte has bit 7 set come from xmm2, the others stay from xmm1
329 pclmulqdq xmm7, xmm10, 0x11 ; high quadword of xmm7 times high quadword of xmm10
330 pclmulqdq xmm8, xmm10, 0x0 ; low quadword of xmm8 times low quadword of xmm10
; Final stage: reduce the 128-bit accumulator in xmm7 using rk5/rk6 and then rk7/rk8,
; restore the registers saved in the prologue and release the stack frame.
; pclmulqdq imm8 bit 0 selects the quadword of the first operand and bit 4 the quadword
; of the second operand (0 = low, 1 = high).
335 ; compute crc of a 128-bit value
336 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
340 pclmulqdq xmm7, xmm10, 0x1 ; high quadword of xmm7 times low quadword of xmm10
350 pclmulqdq xmm7, xmm10, 0x10 ; low quadword of xmm7 times high quadword of xmm10
355 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
357 pclmulqdq xmm7, xmm10, 0x01 ; high quadword of xmm7 times low quadword of xmm10
359 pclmulqdq xmm7, xmm10, 0x11 ; high quadword of xmm7 times high quadword of xmm10
366 ; scale the result back to 16 bits
368 %ifidn __OUTPUT_FORMAT__, win64
; Win64 only: reload xmm6-xmm13 from the slots they were stored to in the prologue.
369 movdqa xmm6, [rsp+16*2]
370 movdqa xmm7, [rsp+16*3]
371 movdqa xmm8, [rsp+16*4]
372 movdqa xmm9, [rsp+16*5]
373 movdqa xmm10, [rsp+16*6]
374 movdqa xmm11, [rsp+16*7]
375 movdqa xmm12, [rsp+16*8]
376 movdqa xmm13, [rsp+16*9]
378 add rsp, VARIABLE_OFFSET ; release the frame reserved by "sub rsp, VARIABLE_OFFSET"
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Short-buffer paths. Each one places the initial crc in the top dword of xmm0,
; loads and byte-reflects the data into xmm7, xors the crc into it, and then joins
; the common reduction code through a jump.
390 ; check if there is enough buffer to be able to fold 16B at a time
393 movdqa xmm11, [SHUF_MASK] ; xmm11 = pshufb control used for the byte reflection below
395 ; if there is, load the constants
396 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
398 movd xmm0, arg1_low32 ; get the initial crc value (low dword, rest of xmm0 zeroed)
399 pslldq xmm0, 12 ; shift left by 12 bytes: the crc moves into the top dword of xmm0
400 movdqu xmm7, [arg2] ; load the plaintext (first 16 bytes, unaligned)
401 pshufb xmm7, xmm11 ; byte-reflect the plaintext
405 ; update the buffer pointer
408 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
411 jmp _16B_reduction_loop ; continue with the shared 16-byte folding loop
416 ; move the initial crc to the return value. this is necessary for zero-length buffers.
421 movdqa xmm11, [SHUF_MASK] ; xmm11 = pshufb control used for the byte reflection below
423 movd xmm0, arg1_low32 ; get the initial crc value (low dword, rest of xmm0 zeroed)
424 pslldq xmm0, 12 ; shift left by 12 bytes: the crc moves into the top dword of xmm0
428 jl _less_than_16_left ; signed "less than" on the preceding flag-setting instruction: fewer than 16 bytes available
430 movdqu xmm7, [arg2] ; load the plaintext (16 bytes, unaligned)
431 pshufb xmm7, xmm11 ; byte-reflect the plaintext
432 pxor xmm7, xmm0 ; xor the initial crc value
435 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
436 jmp _get_last_two_xmms ; join the common tail handling
441 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
450 ; backup the counter value
; The remaining lines belong to the individual sub-16-byte cases; every case xors
; the initial crc (xmm0) into the byte-reflected data (xmm7).
493 pxor xmm7, xmm0 ; xor the initial crc value
495 lea rax, [pshufb_shf_table + 16] ; rax = address 16 bytes into pshufb_shf_table
507 pxor xmm7, xmm0 ; xor the initial crc value
527 pxor xmm7, xmm0 ; xor the initial crc value
545 pxor xmm7, xmm0 ; xor the initial crc value
558 pxor xmm7, xmm0 ; xor the initial crc value
566 ; precomputed constants
567 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
570 ; rk1 = 2^(32*3) mod Q << 32
571 ; rk2 = 2^(32*5) mod Q << 32
572 ; rk3 = 2^(32*15) mod Q << 32
573 ; rk4 = 2^(32*17) mod Q << 32
574 ; rk5 = 2^(32*3) mod Q << 32
575 ; rk6 = 2^(32*2) mod Q << 32
576 ; rk7 = floor(2^64/Q)
; Each DQ below emits one 64-bit constant. The code loads them in 16-byte pairs with
; movdqa ([rk1], [rk3], [rk5], [rk7]), so each pair has to stay adjacent and 16-byte aligned.
; NOTE(review): the values presumably follow the rk numbering in order (rk1, rk2, ...);
; confirm against the labels. Under that assumption the first and fifth values are equal,
; which matches the identical formulas listed for rk1 and rk5.
579 DQ 0x2d56000000000000
581 DQ 0x06df000000000000
583 DQ 0x9d9d000000000000
585 DQ 0x7cf5000000000000
587 DQ 0x2d56000000000000
589 DQ 0x1368000000000000
591 DQ 0x00000001f65a57f8
593 DQ 0x000000018bb70000 ; 0x8bb70000 (the scaled poly) with bit 32 set -- presumably Q itself
; Twelve further constants; presumably the additional rk values used while folding
; the eight accumulators down to one -- TODO confirm against the labels.
596 DQ 0xceae000000000000
598 DQ 0xbfd6000000000000
600 DQ 0x1e16000000000000
602 DQ 0x713c000000000000
604 DQ 0xf7f9000000000000
606 DQ 0x80a6000000000000
608 DQ 0x044c000000000000
610 DQ 0xe658000000000000
612 DQ 0xad18000000000000
614 DQ 0xa497000000000000
616 DQ 0x6ee3000000000000
618 DQ 0xe7b5000000000000
; Three 16-byte constants (low quadword first).
; Every byte is 0x80, i.e. bit 7 is set in each of the 16 bytes.
629 dq 0x8080808080808080, 0x8080808080808080
; All ones except the top 32 bits, which are zero.
631 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte indices 15 down to 0 in memory order: used as a pshufb control this reverses the
; 16 bytes of a register. Presumably this is the SHUF_MASK loaded into xmm11 -- confirm against the label.
634 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
637 ; use these values for shift constants for the pshufb instruction
638 ; different alignments result in values as shown:
639 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
640 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
641 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
642 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
643 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
644 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
645 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
646 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
647 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
648 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
649 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
650 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
651 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
652 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
653 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; 32 bytes of pshufb control data. In memory order the first 16 bytes are
; 0x00, 0x81, 0x82, ..., 0x8f and the second 16 bytes are 0x00, 0x01, ..., 0x0e, 0x00.
; A 16-byte load at byte offset n (1..15) from the start therefore yields the n-th row of
; the commented table above. Control bytes with bit 7 set make pshufb write a zero byte.
654 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
655 dq 0x0706050403020100, 0x000e0d0c0b0a0908
657 ;;; func core, ver, snum
; slversion is a macro defined outside this file; presumably it records the
; core/version/serial values listed above for this function -- TODO confirm.
658 slversion crc16_t10dif_01, 01, 06, 0010