;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Function API:
;     UINT16 crc16_t10dif_01(
;             UINT16 init_crc,          // initial CRC value, 16 bits
;             const unsigned char *buf, // buffer pointer to calculate CRC on
;             UINT64 len                // buffer length in bytes (64-bit data)
;     );
;
; Authors:
;     Erdinc Ozturk
;     Vinodh Gopal
;     James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
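;
; A minimal C caller sketch (illustrative only, not part of this file;
; assumes isa-l's crc.h header, whose multibinary dispatcher crc16_t10dif()
; normally resolves to this _01 variant on PCLMULQDQ-capable CPUs):
;
;     #include <stdint.h>
;     #include "crc.h"                       /* isa-l public header */
;
;     uint16_t checksum(const unsigned char *buf, uint64_t len)
;     {
;             return crc16_t10dif(0, buf, len);   /* seed 0 per T10-DIF */
;     }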

%include "reg_sizes.asm"

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global crc16_t10dif_01:function
crc16_t10dif_01:

        ; adjust the 16-bit initial_crc value: scale it up to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it right by
        ; 16 bits to scale the result back down to 16 bits.
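        ;
        ; Sketch of why the scaling works: let P(x) be the 16-bit T10-DIF poly
        ; and Q(x) = P(x)*x^16 the scaled 32-bit poly used by the constants at
        ; the bottom of this file. If M = q*P + r, then M*x^16 = q*Q + r*x^16,
        ; so
        ;     M(x)*x^16 mod Q(x) = (M(x) mod P(x)) * x^16
        ; i.e. running the 32-bit algorithm with the CRC state kept in the
        ; upper 16 bits yields the 16-bit CRC in bits [31:16], recovered by
        ; the final 'shr eax, 16' in _cleanup.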

        sub     rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack: xmm6-xmm13 are clobbered below
        ; and are callee-saved in the Windows x64 ABI
        movdqa  [rsp + 16*2], xmm6
        movdqa  [rsp + 16*3], xmm7
        movdqa  [rsp + 16*4], xmm8
        movdqa  [rsp + 16*5], xmm9
        movdqa  [rsp + 16*6], xmm10
        movdqa  [rsp + 16*7], xmm11
        movdqa  [rsp + 16*8], xmm12
        movdqa  [rsp + 16*9], xmm13
%endif

        ; check if smaller than 256
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        movd    xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to
        ; be moved to the high part of the register, because the data will be
        ; byte-reflected and will then line up with the initial crc in the
        ; correct place
        pslldq  xmm10, 12

        movdqa  xmm11, [SHUF_MASK]
        ; receive the initial 128B data, xor the initial crc value
        movdqu  xmm0, [arg2 + 16*0]
        movdqu  xmm1, [arg2 + 16*1]
        movdqu  xmm2, [arg2 + 16*2]
        movdqu  xmm3, [arg2 + 16*3]
        movdqu  xmm4, [arg2 + 16*4]
        movdqu  xmm5, [arg2 + 16*5]
        movdqu  xmm6, [arg2 + 16*6]
        movdqu  xmm7, [arg2 + 16*7]

        pshufb  xmm0, xmm11
        ; XOR the initial_crc value
        pxor    xmm0, xmm10
        pshufb  xmm1, xmm11
        pshufb  xmm2, xmm11
        pshufb  xmm3, xmm11
        pshufb  xmm4, xmm11
        pshufb  xmm5, xmm11
        pshufb  xmm6, xmm11
        pshufb  xmm7, xmm11

        movdqa  xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; imm value of pclmulqdq instruction will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; the _fold_128_B_loop will fold 128B at a time until 128+y bytes of
        ; buffer remain


        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
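        ;
        ; per-lane folding identity (see the PCLMULQDQ paper referenced above):
        ; with an accumulator acc = hi*x^64 + lo and fold distance d bits,
        ;     acc * x^d mod Q = hi*(x^(d+64) mod Q) xor lo*(x^d mod Q)
        ; so each lane needs one pclmulqdq per 64-bit half against a
        ; precomputed constant pair; the two products stay unreduced and are
        ; simply xor-ed with the next block of data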
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128               ; buf += 128;

        movdqu  xmm9, [arg2 + 16*0]
        movdqu  xmm12, [arg2 + 16*1]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq       xmm0, xmm10, 0x0
        pclmulqdq       xmm8, xmm10, 0x11
        pclmulqdq       xmm1, xmm10, 0x0
        pclmulqdq       xmm13, xmm10, 0x11
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        movdqu  xmm9, [arg2 + 16*2]
        movdqu  xmm12, [arg2 + 16*3]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq       xmm2, xmm10, 0x0
        pclmulqdq       xmm8, xmm10, 0x11
        pclmulqdq       xmm3, xmm10, 0x0
        pclmulqdq       xmm13, xmm10, 0x11
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        movdqu  xmm9, [arg2 + 16*4]
        movdqu  xmm12, [arg2 + 16*5]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq       xmm4, xmm10, 0x0
        pclmulqdq       xmm8, xmm10, 0x11
        pclmulqdq       xmm5, xmm10, 0x0
        pclmulqdq       xmm13, xmm10, 0x11
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        movdqu  xmm9, [arg2 + 16*6]
        movdqu  xmm12, [arg2 + 16*7]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq       xmm6, xmm10, 0x0
        pclmulqdq       xmm8, xmm10, 0x11
        pclmulqdq       xmm7, xmm10, 0x0
        pclmulqdq       xmm13, xmm10, 0x11
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; fold the 8 xmm registers to 1 xmm register with different constants
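        ; (each register is a different distance from the end of the data, so
        ; each gets its own constant pair: rk9/rk10 for xmm0 down to rk19/rk20
        ; for xmm5; xmm6, only 128 bits ahead of xmm7, reuses the rk1/rk2 pair)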

        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq       xmm0, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0

        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq       xmm1, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq       xmm2, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq       xmm3, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq       xmm4, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq       xmm5, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5

        movdqa  xmm10, [rk1]    ; xmm10 has rk1 and rk2
                                ; imm value of pclmulqdq instruction will determine which constant to use
        movdqa  xmm8, xmm6
        pclmulqdq       xmm6, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7
        ; and the rest is in memory; we can fold 16 bytes at a time if y >= 16.
        ; continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm11
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we process a remainder of less than 16 bytes.
        ; since we know there was data before the pointer, we can offset the
        ; input pointer backwards to read exactly 16 bytes; the registers then
        ; need to be adjusted.
_get_last_two_xmms:
        movdqa  xmm2, xmm7

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm11

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm7, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is implicit
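        ; worked example for arg3 = 3: the mask loaded from the table is
        ; 8d 8e 8f 00 01 .. 0c, so "pshufb xmm2, xmm0" shifts xmm2 left by
        ; 3 bytes; xor-ing the mask with [mask1] gives 0d 0e 0f 80 .. 8c,
        ; which shifts xmm7 right by 13 bytes; pblendvb then keeps bytes 0-2
        ; of xmm1 and takes bytes 3-15 from xmm2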

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x11
        pclmulqdq       xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
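        ; the 128-bit remainder in xmm7 is reduced in two steps: the high
        ; 64 bits are folded down with rk5 (64b fold), then the high 32 bits
        ; of that result are folded with rk6 (32b fold), leaving 64 bits for
        ; the Barrett step below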
        movdqa  xmm10, [rk5]            ; rk5 and rk6 in xmm10
        movdqa  xmm0, xmm7

        ; 64b fold
        pclmulqdq       xmm7, xmm10, 0x1
        pslldq  xmm0, 8
        pxor    xmm7, xmm0

        ; 32b fold
        movdqa  xmm0, xmm7

        pand    xmm0, [mask2]

        psrldq  xmm7, 12
        pclmulqdq       xmm7, xmm10, 0x10
        pxor    xmm7, xmm0

        ; barrett reduction
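        ; Barrett reduction computes the final remainder without a divide:
        ; multiply the high part of the value by the precomputed floor(2^64/Q)
        ; (rk7) to estimate the quotient, multiply that estimate by Q (rk8),
        ; and xor to cancel the high bits; the 32-bit (scaled) result is then
        ; extracted from xmm7 with pextrd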
_barrett:
        movdqa  xmm10, [rk7]            ; rk7 and rk8 in xmm10
        movdqa  xmm0, xmm7
        pclmulqdq       xmm7, xmm10, 0x01
        pslldq  xmm7, 4
        pclmulqdq       xmm7, xmm10, 0x11

        pslldq  xmm7, 4
        pxor    xmm7, xmm0
        pextrd  eax, xmm7, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + 16*2]
        movdqa  xmm7, [rsp + 16*3]
        movdqa  xmm8, [rsp + 16*4]
        movdqa  xmm9, [rsp + 16*5]
        movdqa  xmm10, [rsp + 16*6]
        movdqa  xmm11, [rsp + 16*7]
        movdqa  xmm12, [rsp + 16*8]
        movdqa  xmm13, [rsp + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm11, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm11, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm7, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2 + 1]
        mov     [r11 + 1], al

        mov     al, [arg2 + 2]
        mov     [r11 + 2], al

        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        psrldq  xmm7, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2 + 1]
        mov     [r11 + 1], al

        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        psrldq  xmm7, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        psrldq  xmm7, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
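;
; The rk values below follow the recipe given in the comments, rk = (2^e mod Q) << 32.
; A minimal generator sketch (illustrative C, not part of the build; the helper
; name is hypothetical) that evaluates x^e mod Q by shift-and-reduce over GF(2):
;
;     #include <stdint.h>
;     #include <stdio.h>
;
;     /* x^e mod Q over GF(2), with Q = 0x18BB70000 (degree 32) */
;     static uint64_t xpow_mod_q(unsigned e)
;     {
;             uint64_t r = 1;
;             while (e--) {
;                     r <<= 1;                  /* multiply by x */
;                     if (r & (1ULL << 32))     /* degree reached 32: reduce */
;                             r ^= 0x18BB70000ULL;
;             }
;             return r;                         /* 32-bit residue */
;     }
;
;     int main(void)
;     {
;             /* e.g. rk1 = 2^(32*3) mod Q << 32 */
;             printf("rk1: 0x%016llx\n",
;                    (unsigned long long)(xpow_mod_q(32 * 3) << 32));
;             return 0;
;     }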
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x9d9d000000000000
rk4:
        DQ 0x7cf5000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000

rk9:
        DQ 0xceae000000000000
rk10:
        DQ 0xbfd6000000000000
rk11:
        DQ 0x1e16000000000000
rk12:
        DQ 0x713c000000000000
rk13:
        DQ 0xf7f9000000000000
rk14:
        DQ 0x80a6000000000000
rk15:
        DQ 0x044c000000000000
rk16:
        DQ 0xe658000000000000
rk17:
        DQ 0xad18000000000000
rk18:
        DQ 0xa497000000000000
rk19:
        DQ 0x6ee3000000000000
rk20:
        DQ 0xe7b5000000000000


mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607
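; pshufb with SHUF_MASK reverses all 16 bytes of a register, turning the
; little-endian loads into the most-significant-byte-first order that this
; non-reflected CRC expects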

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;; func        core, ver, snum
slversion crc16_t10dif_01, 01, 06, 0010