1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; UINT32 crc16_t10dif_by16_10(
33 ; UINT16 init_crc, //initial CRC value, 16 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
48 %include "reg_sizes.asm"
; Exported symbol name for this CRC implementation.
51 %define FUNCTION_NAME crc16_t10dif_by16_10
; Only assemble the AVX-512 body when the assembler feature level supports
; the required opcodes (zmm vpclmulqdq / vpternlogq); see the %else stub
; at the end of the file for older assemblers.
54 %if (AS_FEATURE_LEVEL) >= 10
; First integer argument register differs per ABI:
; win64 passes arg1 in rcx, SysV in rdi. Only the low 32 bits are used
; because the initial CRC is at most 16 bits wide.
62 %ifidn __OUTPUT_FORMAT__, win64
67 %xdefine arg1_low32 ecx
73 %xdefine arg1_low32 edi
; Stack frame size: win64 needs 16*10 bytes to spill callee-saved
; xmm6-xmm15 (plus scratch); SysV needs only scratch space.
77 %ifidn __OUTPUT_FORMAT__, win64
79 %define VARIABLE_OFFSET 16*12+8
81 %define VARIABLE_OFFSET 16*2+8
85 mk_global FUNCTION_NAME, function
89 ; adjust the 16-bit initial_crc value, scale it to 32 bits
92 ; After this point, code flow is exactly same as a 32-bit CRC.
93 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; Allocate the stack frame (on win64 this includes the xmm spill area).
95 sub rsp, VARIABLE_OFFSET
97 %ifidn __OUTPUT_FORMAT__, win64
98 ; push the xmm registers into the stack to maintain
; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI, so they must be
; preserved here and restored in the epilogue (XMM_SAVE = spill offset).
99 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
100 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
101 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
102 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
103 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
104 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
105 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
106 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
107 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
108 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; Replicate the byte-reflection shuffle mask into all four 128-bit lanes
; of zmm18; it is applied to every loaded data block below.
111 vbroadcasti32x4 zmm18, [SHUF_MASK]
115 ; load the initial crc value
116 vmovd xmm10, arg1_low32 ; initial crc
118 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
119 ; because data will be byte-reflected and will align with initial crc at correct place.
122 ; receive the initial 64B data, xor the initial crc value
; Load the first 256 bytes of input in four 64B chunks (zmm0/zmm4/zmm7/zmm8)
; and byte-reflect each through zmm18.
123 vmovdqu8 zmm0, [arg2+16*0]
124 vmovdqu8 zmm4, [arg2+16*4]
125 vpshufb zmm0, zmm0, zmm18
126 vpshufb zmm4, zmm4, zmm18
128 vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
129 ;imm value of pclmulqdq instruction will determine which constant to use
135 vmovdqu8 zmm7, [arg2+16*8]
136 vmovdqu8 zmm8, [arg2+16*12]
137 vpshufb zmm7, zmm7, zmm18
138 vpshufb zmm8, zmm8, zmm18
; rk_1/rk_2 are the 256-byte fold constants; broadcast to all lanes so
; four 128-bit folds happen per instruction.
139 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; Standard carry-less fold step, repeated for each 64B accumulator:
; multiply accumulator low/high halves by the fold constants (imm 0x00 and
; 0x11 select the two 64-bit constant halves), xor the products together,
; then xor in the next 64B of byte-reflected input data.
144 vmovdqu8 zmm3, [arg2+16*0]
145 vpshufb zmm3, zmm3, zmm18
146 vpclmulqdq zmm1, zmm0, zmm16, 0x00
147 vpclmulqdq zmm2, zmm0, zmm16, 0x11
148 vpxorq zmm0, zmm1, zmm2
149 vpxorq zmm0, zmm0, zmm3
151 vmovdqu8 zmm9, [arg2+16*4]
152 vpshufb zmm9, zmm9, zmm18
153 vpclmulqdq zmm5, zmm4, zmm16, 0x00
154 vpclmulqdq zmm6, zmm4, zmm16, 0x11
155 vpxorq zmm4, zmm5, zmm6
156 vpxorq zmm4, zmm4, zmm9
158 vmovdqu8 zmm11, [arg2+16*8]
159 vpshufb zmm11, zmm11, zmm18
160 vpclmulqdq zmm12, zmm7, zmm16, 0x00
161 vpclmulqdq zmm13, zmm7, zmm16, 0x11
162 vpxorq zmm7, zmm12, zmm13
163 vpxorq zmm7, zmm7, zmm11
165 vmovdqu8 zmm17, [arg2+16*12]
166 vpshufb zmm17, zmm17, zmm18
167 vpclmulqdq zmm14, zmm8, zmm16, 0x00
168 vpclmulqdq zmm15, zmm8, zmm16, 0x11
169 vpxorq zmm8, zmm14, zmm15
170 vpxorq zmm8, zmm8, zmm17
; Collapse four 64B accumulators down to two using rk3/rk4 (in zmm10);
; vpternlogq imm 0x96 computes the three-way xor dst = A ^ B ^ C.
177 vpclmulqdq zmm1, zmm0, zmm10, 0x00
178 vpclmulqdq zmm2, zmm0, zmm10, 0x11
179 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
181 vpclmulqdq zmm5, zmm4, zmm10, 0x00
182 vpclmulqdq zmm6, zmm4, zmm10, 0x11
183 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; Skip straight to the 8-lane combine; NOTE(review): intervening source
; lines are elided in this excerpt.
189 jmp .fold_128_B_register
193 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
194 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
196 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; Main fold loop body: same fold step as above (clmul low/high by the
; constants in zmm10, xor products, xor in fresh data) applied to the two
; 64B accumulators zmm0 and zmm4.
199 vmovdqu8 zmm8, [arg2+16*0]
200 vpshufb zmm8, zmm8, zmm18
201 vpclmulqdq zmm2, zmm0, zmm10, 0x00
202 vpclmulqdq zmm1, zmm0, zmm10, 0x11
203 vpxorq zmm0, zmm2, zmm1
204 vpxorq zmm0, zmm0, zmm8
206 vmovdqu8 zmm9, [arg2+16*4]
207 vpshufb zmm9, zmm9, zmm18
208 vpclmulqdq zmm5, zmm4, zmm10, 0x00
209 vpclmulqdq zmm6, zmm4, zmm10, 0x11
210 vpxorq zmm4, zmm5, zmm6
211 vpxorq zmm4, zmm4, zmm9
215 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
218 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
219 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
221 .fold_128_B_register:
222 ; fold the 8 128b parts into 1 xmm register with different constants
; Each 128-bit lane gets its own pair of fold constants (rk9..rk20 plus
; rk1/rk2), so all eight lanes can be collapsed toward a single xmm value.
223 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
224 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
225 vpclmulqdq zmm1, zmm0, zmm16, 0x00
226 vpclmulqdq zmm2, zmm0, zmm16, 0x11
227 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
229 vpclmulqdq zmm5, zmm4, zmm11, 0x00
230 vpclmulqdq zmm6, zmm4, zmm11, 0x11
231 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
232 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
233 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal xor of the four 128-bit lanes of zmm1 down to xmm7:
; swap the 256-bit halves, xor, then xor the two remaining 128-bit lanes.
235 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
236 vpxorq ymm8, ymm8, ymm1
237 vextracti64x2 xmm5, ymm8, 1
238 vpxorq xmm7, xmm5, xmm8
240 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
241 ; instead of a cmp instruction, we use the negative flag with the jl instruction
243 jl .final_reduction_for_128
245 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
246 ; we can fold 16 bytes at a time if y>=16
247 ; continue folding 16B at a time
; 16B reduction step: fold xmm7 by rk1/rk2 (held in xmm10) and absorb the
; next byte-reflected 16B of input (loaded/shuffled via xmm0).
250 vpclmulqdq xmm8, xmm7, xmm10, 0x11
251 vpclmulqdq xmm7, xmm7, xmm10, 0x00
254 vpshufb xmm0, xmm0, xmm18
258 ; instead of a cmp instruction, we utilize the flags with the jge instruction
259 ; equivalent of: cmp arg3, 16-16
260 ; check if there is any more 16B in the buffer to be able to fold
261 jge .16B_reduction_loop
263 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
264 ;first, we reduce the data in the xmm7 register
267 .final_reduction_for_128:
271 ; here we are getting data that is less than 16 bytes.
272 ; since we know that there was data before the pointer, we can offset
273 ; the input pointer before the actual point, to receive exactly 16 bytes.
274 ; after that the registers need to be adjusted.
; Backward-overlapping load: grab the last 16 bytes of the buffer even
; though fewer than 16 remain unprocessed (the overlap is discarded below).
278 vmovdqu xmm1, [arg2 - 16 + arg3]
281 ; get rid of the extra data that was loaded before
282 ; load the shift constant
; pshufb_shf_table is indexed by the remainder so the shuffle masks shift
; out exactly the bytes that were already folded.
283 lea rax, [pshufb_shf_table + 16]
; Blend the kept bytes of the tail with the shifted fold state
; (xmm0 holds the shuffle/blend mask at this point).
290 vpblendvb xmm1, xmm1, xmm2, xmm0
; One more rk1/rk2 fold to merge the tail into the 128-bit CRC state.
292 vpclmulqdq xmm8, xmm7, xmm10, 0x11
293 vpclmulqdq xmm7, xmm7, xmm10, 0x00
298 ; compute crc of a 128-bit value
; Final reduction: fold 128 bits -> 64 bits -> 32 bits using rk5..rk8,
; then a Barrett-style reduction by the polynomial constants in rk7/rk8.
; (Three-operand vpclmulqdq: destination also serves as first source.)
303 vpclmulqdq xmm7, xmm10, 0x01 ; H*L
311 vpclmulqdq xmm7, xmm10, 0x10
316 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
318 vpclmulqdq xmm7, xmm10, 0x01
320 vpclmulqdq xmm7, xmm10, 0x11
327 ; scale the result back to 16 bits
; Epilogue: restore win64 callee-saved xmm6-xmm15 and release the frame.
330 %ifidn __OUTPUT_FORMAT__, win64
331 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
332 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
333 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
334 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
335 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
336 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
337 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
338 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
339 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
340 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
342 add rsp, VARIABLE_OFFSET
346 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Short-buffer entry: buffers under 256 bytes skip the zmm fold machinery
; and go straight to the 16B-at-a-time reduction loop.
354 ; check if there is enough buffer to be able to fold 16B at a time
358 ; if there is, load the constants
359 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
361 vmovd xmm0, arg1_low32 ; get the initial crc value
362 vpslldq xmm0, 12 ; align it to its correct place
363 vmovdqu xmm7, [arg2] ; load the plaintext
364 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
367 ; update the buffer pointer
370 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
373 jmp .16B_reduction_loop
378 ; mov initial crc to the return value. this is necessary for zero-length buffers.
; Path for buffers of 16..31 bytes: seed the CRC, fold the one full 16B
; block, then handle the remainder via .get_last_two_xmms.
383 vmovd xmm0, arg1_low32 ; get the initial crc value
384 vpslldq xmm0, 12 ; align it to its correct place
388 jl .less_than_16_left
390 vmovdqu xmm7, [arg2] ; load the plaintext
392 vpxor xmm7, xmm0 ; xor the initial crc value
395 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
396 jmp .get_last_two_xmms
400 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
409 ; backup the counter value
; NOTE(review): the per-size tail handlers below are mostly elided in this
; excerpt; each surviving line seeds its partial block with the initial CRC.
453 vpxor xmm7, xmm0 ; xor the initial crc value
455 lea rax, [pshufb_shf_table + 16]
467 vpxor xmm7, xmm0 ; xor the initial crc value
486 vpxor xmm7, xmm0 ; xor the initial crc value
504 vpxor xmm7, xmm0 ; xor the initial crc value
516 vpxor xmm7, xmm0 ; xor the initial crc value
525 ; precomputed constants
; Fold/reduction constants for the carry-less-multiply CRC algorithm.
; rk8 contains 0x18bb7, i.e. the T10-DIF polynomial 0x8bb7 scaled to
; 32 bits; the remaining rk values are the corresponding precomputed
; x^N mod P folding multipliers (see the referenced Intel white paper).
; rk_1/rk_2: 256B-distance fold; rk1/rk2: 16B fold (also used in the
; reduction loop); rk3/rk4: 64B fold; rk5-rk8: final 128->32 bit
; reduction and Barrett constants; rk9-rk20: per-lane combine constants.
527 rk_1: dq 0xdccf000000000000
528 rk_2: dq 0x4b0b000000000000
529 rk1: dq 0x2d56000000000000
530 rk2: dq 0x06df000000000000
531 rk3: dq 0x9d9d000000000000
532 rk4: dq 0x7cf5000000000000
533 rk5: dq 0x2d56000000000000
534 rk6: dq 0x1368000000000000
535 rk7: dq 0x00000001f65a57f8
536 rk8: dq 0x000000018bb70000
537 rk9: dq 0xceae000000000000
538 rk10: dq 0xbfd6000000000000
539 rk11: dq 0x1e16000000000000
540 rk12: dq 0x713c000000000000
541 rk13: dq 0xf7f9000000000000
542 rk14: dq 0x80a6000000000000
543 rk15: dq 0x044c000000000000
544 rk16: dq 0xe658000000000000
545 rk17: dq 0xad18000000000000
546 rk18: dq 0xa497000000000000
547 rk19: dq 0x6ee3000000000000
548 rk20: dq 0xe7b5000000000000
; Duplicate of rk1/rk2 padded with zeros (tail of the zmm11 combine vector).
550 rk_1b: dq 0x2d56000000000000
551 rk_2b: dq 0x06df000000000000
552 dq 0x0000000000000000
553 dq 0x0000000000000000
; mask1: pshufb indices with the high bit set produce zero bytes.
558 mask1: dq 0x8080808080808080, 0x8080808080808080
559 mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte-reversal mask used with vpshufb to byte-reflect each 16B lane.
561 SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
564 ; use these values for shift constants for the pshufb instruction
565 ; different alignments result in values as shown:
566 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
567 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
568 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
569 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
570 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
571 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
572 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
573 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
574 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
575 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
576 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
577 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
578 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
579 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
580 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; NOTE(review): table data below is partially elided in this excerpt;
; the full table in the original source covers every 1..15 byte shift.
581 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
582 dq 0x0706050403020100, 0x000e0d0c0b0a0908
583 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
584 dq 0x8080808080808080, 0x8080808080808080
586 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
; Fallback when AS_FEATURE_LEVEL < 10: emit a dummy "no_<name>" symbol on
; win64 so the object file is not empty and the build can detect the
; missing implementation.
587 %ifidn __OUTPUT_FORMAT__, win64
588 global no_ %+ FUNCTION_NAME
589 no_ %+ FUNCTION_NAME %+ :
591 %endif ; (AS_FEATURE_LEVEL) >= 10