New upstream version 1.52.1+dfsg1
diff --git a/vendor/blake2b_simd/src/guts.rs b/vendor/blake2b_simd/src/guts.rs
new file mode 100644 (file)
index 0000000..9fcacf3
--- /dev/null
@@ -0,0 +1,565 @@
+use crate::*;
+use arrayref::array_ref;
+use core::cmp;
+
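+// The widest parallelism degree of any implementation on this target. The
+// AVX2 code below compresses four states at once, so x86 targets use 4.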
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub const MAX_DEGREE: usize = 4;
+
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+pub const MAX_DEGREE: usize = 1;
+
+// Variants other than Portable are unreachable in no_std, unless CPU features
+// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=avx2".
+// This might change in the future if is_x86_feature_detected moves into libcore.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Platform {
+    Portable,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    SSE41,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    AVX2,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Implementation(Platform);
+
+impl Implementation {
+    pub fn detect() -> Self {
+        // Try the different implementations in order of how fast/modern they
+        // are. Currently on non-x86, everything just uses portable.
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if let Some(avx2_impl) = Self::avx2_if_supported() {
+                return avx2_impl;
+            }
+        }
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if let Some(sse41_impl) = Self::sse41_if_supported() {
+                return sse41_impl;
+            }
+        }
+        Self::portable()
+    }
+
+    pub fn portable() -> Self {
+        Implementation(Platform::Portable)
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[allow(unreachable_code)]
+    pub fn sse41_if_supported() -> Option<Self> {
+        // Check whether SSE4.1 support is assumed by the build.
+        #[cfg(target_feature = "sse4.1")]
+        {
+            return Some(Implementation(Platform::SSE41));
+        }
+        // Otherwise dynamically check for support if we can.
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("sse4.1") {
+                return Some(Implementation(Platform::SSE41));
+            }
+        }
+        None
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[allow(unreachable_code)]
+    pub fn avx2_if_supported() -> Option<Self> {
+        // Check whether AVX2 support is assumed by the build.
+        #[cfg(target_feature = "avx2")]
+        {
+            return Some(Implementation(Platform::AVX2));
+        }
+        // Otherwise dynamically check for support if we can.
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("avx2") {
+                return Some(Implementation(Platform::AVX2));
+            }
+        }
+        None
+    }
+
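+    // How many inputs this implementation can compress in parallel, i.e. the
+    // widest compressN_loop it supports.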
+    pub fn degree(&self) -> usize {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => avx2::DEGREE,
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::SSE41 => sse41::DEGREE,
+            Platform::Portable => 1,
+        }
+    }
+
+    pub fn compress1_loop(
+        &self,
+        input: &[u8],
+        words: &mut [Word; 8],
+        count: Count,
+        last_node: LastNode,
+        finalize: Finalize,
+        stride: Stride,
+    ) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => unsafe {
+                avx2::compress1_loop(input, words, count, last_node, finalize, stride);
+            },
+            // Note that there's an SSE version of compress1 in the official C
+            // implementation, but I haven't ported it yet.
+            _ => {
+                portable::compress1_loop(input, words, count, last_node, finalize, stride);
+            }
+        }
+    }
+
+    pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 | Platform::SSE41 => unsafe {
+                sse41::compress2_loop(jobs, finalize, stride)
+            },
+            _ => panic!("unsupported"),
+        }
+    }
+
+    pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
+            _ => panic!("unsupported"),
+        }
+    }
+}
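+
+// Illustrative usage sketch (an assumption about callers, not a fixed API
+// contract): code in this crate typically does something like
+//
+//     let imp = Implementation::detect();
+//     imp.compress1_loop(input, &mut words, 0, LastNode::No, Finalize::Yes, Stride::Serial);
+//
+// where `words` starts from the parameter block (compare initial_test_words
+// in the tests below, which uses Params::to_words).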
+
+pub struct Job<'a, 'b> {
+    pub input: &'a [u8],
+    pub words: &'b mut [Word; 8],
+    pub count: Count,
+    pub last_node: LastNode,
+}
+
+impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // NB: Don't print the words. Leaking them would allow length extension.
+        write!(
+            f,
+            "Job {{ input_len: {}, count: {}, last_node: {} }}",
+            self.input.len(),
+            self.count,
+            self.last_node.yes(),
+        )
+    }
+}
+
+// Finalize could just be a bool, but this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum Finalize {
+    Yes,
+    No,
+}
+
+impl Finalize {
+    pub fn yes(&self) -> bool {
+        match self {
+            Finalize::Yes => true,
+            Finalize::No => false,
+        }
+    }
+}
+
+// Like Finalize, this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum LastNode {
+    Yes,
+    No,
+}
+
+impl LastNode {
+    pub fn yes(&self) -> bool {
+        match self {
+            LastNode::Yes => true,
+            LastNode::No => false,
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Stride {
+    Serial,   // BLAKE2b/BLAKE2s
+    Parallel, // BLAKE2bp/BLAKE2sp
+}
+
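+// Note: padded_blockbytes is the distance between consecutive blocks that
+// belong to the same instance. In the parallel (BLAKE2bp-style) layout each
+// instance owns every DEGREE-th block of the shared input, so its blocks sit
+// blake2bp::DEGREE * BLOCKBYTES apart.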
+impl Stride {
+    pub fn padded_blockbytes(&self) -> usize {
+        match self {
+            Stride::Serial => BLOCKBYTES,
+            Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
+        }
+    }
+}
+
+pub(crate) fn count_low(count: Count) -> Word {
+    count as Word
+}
+
+pub(crate) fn count_high(count: Count) -> Word {
+    (count >> (8 * size_of::<Word>())) as Word
+}
+
+pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
+    low as Count + ((high as Count) << (8 * size_of::<Word>()))
+}
+
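+// When a finalization flag is set, BLAKE2 uses an all-ones word; it gets
+// XORed into one of the IV words when the working vector is initialized.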
+pub(crate) fn flag_word(flag: bool) -> Word {
+    if flag {
+        !0
+    } else {
+        0
+    }
+}
+
+// Pull an array reference at the given offset straight from the input, if
+// there's a full block of input available. If there's only a partial block,
+// copy it into the provided buffer, and return an array reference to the
+// buffer. Along
+// with the array, return the number of bytes of real input, and whether the
+// input can be finalized (i.e. whether there aren't any more bytes after this
+// block). Note that this is written so that the optimizer can elide bounds
+// checks, see: https://godbolt.org/z/0hH2bC
+pub fn final_block<'a>(
+    input: &'a [u8],
+    offset: usize,
+    buffer: &'a mut [u8; BLOCKBYTES],
+    stride: Stride,
+) -> (&'a [u8; BLOCKBYTES], usize, bool) {
+    let capped_offset = cmp::min(offset, input.len());
+    let offset_slice = &input[capped_offset..];
+    if offset_slice.len() >= BLOCKBYTES {
+        let block = array_ref!(offset_slice, 0, BLOCKBYTES);
+        let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
+        (block, BLOCKBYTES, should_finalize)
+    } else {
+        // Copy the final block to the front of the block buffer. The rest of
+        // the buffer is assumed to be initialized to zero.
+        buffer[..offset_slice.len()].copy_from_slice(offset_slice);
+        (buffer, offset_slice.len(), true)
+    }
+}
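+
+// Illustrative example (BLAKE2b's BLOCKBYTES is 128): for a 200-byte input
+// at offset 128 with Stride::Serial, the trailing 72 bytes are copied into
+// `buffer` and final_block returns (buffer, 72, true).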
+
+pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
+    // If we're not finalizing, the input must not be empty, and it must be an
+    // even multiple of the block size.
+    if !finalize.yes() {
+        debug_assert!(!input.is_empty());
+        debug_assert_eq!(0, input.len() % BLOCKBYTES);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrayvec::ArrayVec;
+    use core::mem::size_of;
+
+    #[test]
+    fn test_detection() {
+        assert_eq!(Platform::Portable, Implementation::portable().0);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("avx2") {
+                assert_eq!(Platform::AVX2, Implementation::detect().0);
+                assert_eq!(
+                    Platform::AVX2,
+                    Implementation::avx2_if_supported().unwrap().0
+                );
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else if is_x86_feature_detected!("sse4.1") {
+                assert_eq!(Platform::SSE41, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else {
+                assert_eq!(Platform::Portable, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert!(Implementation::sse41_if_supported().is_none());
+            }
+        }
+    }
+
+    // TODO: Move all of these case tests into the implementation files.
+    fn exercise_cases<F>(mut f: F)
+    where
+        F: FnMut(Stride, usize, LastNode, Finalize, Count),
+    {
+        // Choose counts to hit the relevant overflow cases.
+        let counts = &[
+            (0 as Count),
+            ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
+            (0 as Count).wrapping_sub(BLOCKBYTES as Count),
+        ];
+        for &stride in &[Stride::Serial, Stride::Parallel] {
+            let lengths = [
+                0,
+                1,
+                BLOCKBYTES - 1,
+                BLOCKBYTES,
+                BLOCKBYTES + 1,
+                2 * BLOCKBYTES - 1,
+                2 * BLOCKBYTES,
+                2 * BLOCKBYTES + 1,
+                stride.padded_blockbytes() - 1,
+                stride.padded_blockbytes(),
+                stride.padded_blockbytes() + 1,
+                2 * stride.padded_blockbytes() - 1,
+                2 * stride.padded_blockbytes(),
+                2 * stride.padded_blockbytes() + 1,
+            ];
+            for &length in &lengths {
+                for &last_node in &[LastNode::No, LastNode::Yes] {
+                    for &finalize in &[Finalize::No, Finalize::Yes] {
+                        if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
+                            // Skip these cases, they're invalid.
+                            continue;
+                        }
+                        for &count in counts {
+                            // eprintln!("\ncase -----");
+                            // dbg!(stride);
+                            // dbg!(length);
+                            // dbg!(last_node);
+                            // dbg!(finalize);
+                            // dbg!(count);
+
+                            f(stride, length, last_node, finalize, count);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    fn initial_test_words(input_index: usize) -> [Word; 8] {
+        crate::Params::new()
+            .node_offset(input_index as u64)
+            .to_words()
+    }
+
+    // Use the portable implementation, one block at a time, to compute the
+    // final state words expected for a given test case.
+    fn reference_compression(
+        input: &[u8],
+        stride: Stride,
+        last_node: LastNode,
+        finalize: Finalize,
+        mut count: Count,
+        input_index: usize,
+    ) -> [Word; 8] {
+        let mut words = initial_test_words(input_index);
+        let mut offset = 0;
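+        // The `offset == 0` clause forces at least one iteration, so an
+        // empty input is still compressed as a single empty final block.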
+        while offset == 0 || offset < input.len() {
+            let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
+            let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
+                Finalize::No
+            } else {
+                finalize
+            };
+            portable::compress1_loop(
+                &input[offset..][..block_size],
+                &mut words,
+                count,
+                last_node,
+                maybe_finalize,
+                Stride::Serial,
+            );
+            offset += stride.padded_blockbytes();
+            count = count.wrapping_add(BLOCKBYTES as Count);
+        }
+        words
+    }
+
+    // For various loop lengths and finalization parameters, make sure that the
+    // implementation gives the same answer as the portable implementation does
+    // when invoked one block at a time. (So even the portable implementation
+    // itself is being tested here, to make sure its loop is correct.) Note
+    // that this doesn't include any fixed test vectors; those are taken from
+    // the blake2-kat.json file (copied from upstream) and tested elsewhere.
+    fn exercise_compress1_loop(implementation: Implementation) {
+        let mut input = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input);
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let reference_words =
+                reference_compression(&input[..length], stride, last_node, finalize, count, 0);
+
+            let mut test_words = initial_test_words(0);
+            implementation.compress1_loop(
+                &input[..length],
+                &mut test_words,
+                count,
+                last_node,
+                finalize,
+                stride,
+            );
+            assert_eq!(reference_words, test_words);
+        });
+    }
+
+    #[test]
+    fn test_compress1_loop_portable() {
+        exercise_compress1_loop(Implementation::portable());
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_sse41() {
+        // Currently this just falls back to portable, but we test it anyway.
+        if let Some(imp) = Implementation::sse41_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_avx2() {
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    // I use ArrayVec everywhere in here because currently these tests pass
+    // under no_std. I might decide that's not worth maintaining at some point,
+    // since really all we care about with no_std is that the library builds,
+    // but for now it's here. Everything is keyed off of this N constant so
+    // that it's easy to copy the code to exercise_compress4_loop.
+    fn exercise_compress2_loop(implementation: Implementation) {
+        const N: usize = 2;
+
+        let mut input_buffer = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input_buffer);
+        let mut inputs = ArrayVec::<[_; N]>::new();
+        for i in 0..N {
+            inputs.push(&input_buffer[i..]);
+        }
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let mut reference_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                let words = reference_compression(
+                    &inputs[i][..length],
+                    stride,
+                    last_node,
+                    finalize,
+                    count.wrapping_add((i * BLOCKBYTES) as Count),
+                    i,
+                );
+                reference_words.push(words);
+            }
+
+            let mut test_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                test_words.push(initial_test_words(i));
+            }
+            let mut jobs = ArrayVec::<[_; N]>::new();
+            for (i, words) in test_words.iter_mut().enumerate() {
+                jobs.push(Job {
+                    input: &inputs[i][..length],
+                    words,
+                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
+                    last_node,
+                });
+            }
+            let mut jobs = jobs.into_inner().expect("full");
+            implementation.compress2_loop(&mut jobs, finalize, stride);
+
+            for i in 0..N {
+                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+            }
+        });
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress2_loop_sse41() {
+        if let Some(imp) = Implementation::sse41_if_supported() {
+            exercise_compress2_loop(imp);
+        }
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress2_loop_avx2() {
+        // Currently this just falls back to SSE4.1, but we test it anyway.
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress2_loop(imp);
+        }
+    }
+
+    // Copied from exercise_compress2_loop, with a different value of N and an
+    // interior call to compress4_loop.
+    fn exercise_compress4_loop(implementation: Implementation) {
+        const N: usize = 4;
+
+        let mut input_buffer = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input_buffer);
+        let mut inputs = ArrayVec::<[_; N]>::new();
+        for i in 0..N {
+            inputs.push(&input_buffer[i..]);
+        }
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let mut reference_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                let words = reference_compression(
+                    &inputs[i][..length],
+                    stride,
+                    last_node,
+                    finalize,
+                    count.wrapping_add((i * BLOCKBYTES) as Count),
+                    i,
+                );
+                reference_words.push(words);
+            }
+
+            let mut test_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                test_words.push(initial_test_words(i));
+            }
+            let mut jobs = ArrayVec::<[_; N]>::new();
+            for (i, words) in test_words.iter_mut().enumerate() {
+                jobs.push(Job {
+                    input: &inputs[i][..length],
+                    words,
+                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
+                    last_node,
+                });
+            }
+            let mut jobs = jobs.into_inner().expect("full");
+            implementation.compress4_loop(&mut jobs, finalize, stride);
+
+            for i in 0..N {
+                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+            }
+        });
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress4_loop_avx2() {
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress4_loop(imp);
+        }
+    }
+
+    #[test]
+    fn sanity_check_count_size() {
+        assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
+    }
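+
+    // Not from upstream: a minimal round-trip check that count_low/count_high
+    // invert assemble_count, including around the low-word boundary.
+    #[test]
+    fn test_count_split_round_trip() {
+        let boundary = (1 as Count) << (8 * size_of::<Word>());
+        let cases = [0, 1, boundary - 1, boundary, boundary + 1, !(0 as Count)];
+        for &count in &cases {
+            assert_eq!(count, assemble_count(count_low(count), count_high(count)));
+        }
+    }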
+}