#[cfg(target_arch = "x86")]
use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as arch;

pub struct State {
    state: u32,
}

impl State {
    #[cfg(not(feature = "std"))]
    pub fn new(state: u32) -> Option<Self> {
        if cfg!(target_feature = "pclmulqdq")
            && cfg!(target_feature = "sse2")
            && cfg!(target_feature = "sse4.1")
        {
            // SAFETY: The conditions above ensure that all
            // required instructions are supported by the CPU.
            Some(Self { state })
        } else {
            None
        }
    }
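
    // With "std" enabled we can detect CPU features at runtime instead of
    // relying on compile-time `cfg!(target_feature = ...)` above, which is
    // only true when the binary itself is built with those features enabled.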
    #[cfg(feature = "std")]
    pub fn new(state: u32) -> Option<Self> {
        if is_x86_feature_detected!("pclmulqdq")
            && is_x86_feature_detected!("sse2")
            && is_x86_feature_detected!("sse4.1")
        {
            // SAFETY: The conditions above ensure that all
            // required instructions are supported by the CPU.
            Some(Self { state })
        } else {
            None
        }
    }

    pub fn update(&mut self, buf: &[u8]) {
        // SAFETY: The `State::new` constructor ensures that all
        // required instructions are supported by the CPU.
        self.state = unsafe { calculate(self.state, buf) }
    }

    pub fn finalize(self) -> u32 {
        self.state
    }

    pub fn reset(&mut self) {
        self.state = 0;
    }
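
    // Combine `self` with the CRC of a second, logically following stream
    // (`other`), given that stream's length in bytes (`amount`).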
    pub fn combine(&mut self, other: u32, amount: u64) {
        self.state = ::combine::combine(self.state, other, amount);
    }
}
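
// Folding constants for the (bit-reflected) CRC-32 polynomial. These are
// presumably the precomputed `x^t mod P(x)` values from Intel's "Fast CRC
// Computation Using PCLMULQDQ" white paper: K1/K2 drive the fold-by-4 loop
// (64-byte stride), K3/K4 the fold-by-1 loop (16-byte stride), K5 the final
// 96 -> 64 bit reduction, and P_X/U_PRIME are P(x) and the Barrett constant
// μ used for the closing 64 -> 32 bit reduction.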
const K1: i64 = 0x154442bd4;
const K2: i64 = 0x1c6e41596;
const K3: i64 = 0x1751997d0;
const K4: i64 = 0x0ccaa009e;
const K5: i64 = 0x163cd6124;
const K6: i64 = 0x1db710640;

const P_X: i64 = 0x1DB710641;
const U_PRIME: i64 = 0x1F7011641;

#[cfg(feature = "std")]
unsafe fn debug(s: &str, a: arch::__m128i) -> arch::__m128i {
    // Dump the label and the 16 bytes of the vector, then pass it through.
    let mut bytes = [0u8; 16];
    arch::_mm_storeu_si128(bytes.as_mut_ptr() as *mut arch::__m128i, a);
    print!(" {:20} | ", s);
    for b in &bytes {
        print!("{:02x} ", b);
    }
    println!();
    a
}

#[cfg(not(feature = "std"))]
unsafe fn debug(_s: &str, a: arch::__m128i) -> arch::__m128i {
    a
}

#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn calculate(crc: u32, mut data: &[u8]) -> u32 {
    // In theory we can accelerate smaller chunks too, but for now just rely
    // on the fallback implementation as it's too much hassle and doesn't
    // seem too beneficial.
    if data.len() < 128 {
        return ::baseline::update_fast_16(crc, data);
    }

    // Step 1: fold by 4 loop
    let mut x3 = get(&mut data);
    let mut x2 = get(&mut data);
    let mut x1 = get(&mut data);
    let mut x0 = get(&mut data);

    // fold in our initial value, part of the incremental crc checksum
    x3 = arch::_mm_xor_si128(x3, arch::_mm_cvtsi32_si128(!crc as i32));
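    // (The `!crc` here and the matching `!c` at the end convert between the
    // crate's un-inverted state and the inverted register the folding math
    // operates on, i.e. the usual CRC-32 init/final XOR.)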

    let k1k2 = arch::_mm_set_epi64x(K2, K1);
    while data.len() >= 64 {
        x3 = reduce128(x3, get(&mut data), k1k2);
        x2 = reduce128(x2, get(&mut data), k1k2);
        x1 = reduce128(x1, get(&mut data), k1k2);
        x0 = reduce128(x0, get(&mut data), k1k2);
    }
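
    // The loop above keeps four independent 128-bit remainders, folding each
    // forward by 64 bytes per iteration, so the long-latency carry-less
    // multiplies in reduce128 can overlap across lanes.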

    let k3k4 = arch::_mm_set_epi64x(K4, K3);
    let mut x = reduce128(x3, x2, k3k4);
    x = reduce128(x, x1, k3k4);
    x = reduce128(x, x0, k3k4);
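    // The three folds above merge the four lanes into a single 128-bit
    // remainder using the 16-byte-stride keys K3/K4.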

    // Step 2: fold by 1 loop
    while data.len() >= 16 {
        x = reduce128(x, get(&mut data), k3k4);
    }

    debug("128 > 64 init", x);

    // Perform step 3, the reduction from 128 bits to 64 bits. This diverges
    // significantly from the paper and basically doesn't follow it at all.
    // It's not clear why, but the implementations in Chrome and Linux
    // diverge in the same way, so perhaps the paper has errata.
    //
    // Algebraically, what's happening is:
    //
    //     x = (x[0:63] • K4) ^ x[64:127]           // 96-bit result
    //     x = ((x[0:31] as u64) • K5) ^ x[32:95]   // 64-bit result
    //
    // The paper is vague on this part and definitely uses different
    // constants; where its xor operations happen and why things shift
    // around is unclear, but this implementation appears to work!

    let x = arch::_mm_xor_si128(
        arch::_mm_clmulepi64_si128(x, k3k4, 0x10),
        arch::_mm_srli_si128(x, 8),
    );
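    // (The imm8 on _mm_clmulepi64_si128 picks which 64-bit halves multiply:
    // bit 0 selects the half of the first operand, bit 4 the second. So 0x10
    // above is x's low half times k3k4's high half, i.e. K4, and the shift
    // right by 8 bytes xors in x[64:127].)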
    let x = arch::_mm_xor_si128(
        arch::_mm_clmulepi64_si128(
            arch::_mm_and_si128(x, arch::_mm_set_epi32(0, 0, 0, !0)),
            arch::_mm_set_epi64x(0, K5),
            0x00,
        ),
        arch::_mm_srli_si128(x, 4),
    );
    debug("128 > 64 xx", x);

    // Perform a Barrett reduction from our now 64 bits to 32 bits. The
    // algorithm for this is described at the end of the paper, and note that
    // this also implements the "bit reflected input" variant.
    let pu = arch::_mm_set_epi64x(U_PRIME, P_X);

    // T1(x) = ⌊(R(x) % x^32)⌋ • μ
    let t1 = arch::_mm_clmulepi64_si128(
        arch::_mm_and_si128(x, arch::_mm_set_epi32(0, 0, 0, !0)),
        pu,
        0x10,
    );
    // T2(x) = ⌊(T1(x) % x^32)⌋ • P(x)
    let t2 = arch::_mm_clmulepi64_si128(
        arch::_mm_and_si128(t1, arch::_mm_set_epi32(0, 0, 0, !0)),
        pu,
        0x00,
    );
    // We're doing the bit-reflected variant, so get the upper 32-bits of the
    // 64-bit result instead of the lower 32-bits.
    //
    // C(x) = R(x) ^ T2(x) / x^32
    let c = arch::_mm_extract_epi32(arch::_mm_xor_si128(x, t2), 1) as u32;
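    // (T1 estimates the Barrett quotient from R's low 32 bits and μ, T2
    // multiplies that quotient back by P(x), and xoring T2 into R cancels
    // the low bits so the 32-bit remainder lands in lane 1.)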

    if !data.is_empty() {
        ::baseline::update_fast_16(!c, data)
    } else {
        !c
    }
}
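
// Folds the 128-bit accumulator `a` forward over the next block `b`: each
// 64-bit half of `a` is carry-less multiplied by the matching half of
// `keys`, and both products are xored into `b`.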
unsafe fn reduce128(a: arch::__m128i, b: arch::__m128i, keys: arch::__m128i) -> arch::__m128i {
    let t1 = arch::_mm_clmulepi64_si128(a, keys, 0x00);
    let t2 = arch::_mm_clmulepi64_si128(a, keys, 0x11);
    arch::_mm_xor_si128(arch::_mm_xor_si128(b, t1), t2)
}

unsafe fn get(a: &mut &[u8]) -> arch::__m128i {
    debug_assert!(a.len() >= 16);
    // Unaligned 16-byte load; advance the slice past the consumed bytes.
    let r = arch::_mm_loadu_si128(a.as_ptr() as *const arch::__m128i);
    *a = &a[16..];
    r
}

#[cfg(feature = "std")]
quickcheck! {
    fn check_against_baseline(init: u32, chunks: Vec<(Vec<u8>, usize)>) -> bool {
        let mut baseline = super::super::super::baseline::State::new(init);
        let mut pclmulqdq = super::State::new(init).expect("not supported");
        for (chunk, mut offset) in chunks {
            // simulate random alignments by offsetting the slice by up to 15 bytes
            offset &= 0xF;
            if chunk.len() <= offset {
                baseline.update(&chunk);
                pclmulqdq.update(&chunk);
            } else {
                baseline.update(&chunk[offset..]);
                pclmulqdq.update(&chunk[offset..]);
            }
        }
        pclmulqdq.finalize() == baseline.finalize()
    }
}