//! Vectorized Carry-less Multiplication (VCLMUL)
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
//!
//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
8 use crate::core_arch
::x86
::__m256i
;
9 use crate::core_arch
::x86
::__m512i
;
12 use crate::stdarch_test
::assert_instr
;
14 #[allow(improper_ctypes)]
16 #[link_name = "llvm.x86.pclmulqdq.256"]
17 fn pclmulqdq_256(a
: __m256i
, round_key
: __m256i
, imm8
: u8) -> __m256i
;
18 #[link_name = "llvm.x86.pclmulqdq.512"]
19 fn pclmulqdq_512(a
: __m512i
, round_key
: __m512i
, imm8
: u8) -> __m512i
;
22 // for some odd reason on x86_64 we generate the correct long name instructions
23 // but on i686 we generate the short name + imm8
24 // so we need to special-case on that...
26 /// Performs a carry-less multiplication of two 64-bit polynomials over the
27 /// finite field GF(2^k) - in each of the 4 128-bit lanes.
29 /// The immediate byte is used for determining which halves of each lane `a` and `b`
30 /// should be used. Immediate bits other than 0 and 4 are ignored.
31 /// All lanes share immediate byte.
33 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
35 #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
36 // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
37 #[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
38 #[rustc_args_required_const(2)]
39 pub unsafe fn _mm512_clmulepi64_epi128(a
: __m512i
, b
: __m512i
, imm8
: i32) -> __m512i
{
42 pclmulqdq_512(a
, b
, $imm8
)
45 constify_imm8
!(imm8
, call
)
48 /// Performs a carry-less multiplication of two 64-bit polynomials over the
49 /// finite field GF(2^k) - in each of the 2 128-bit lanes.
51 /// The immediate byte is used for determining which halves of each lane `a` and `b`
52 /// should be used. Immediate bits other than 0 and 4 are ignored.
53 /// All lanes share immediate byte.
55 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
57 #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
58 #[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
59 #[rustc_args_required_const(2)]
60 pub unsafe fn _mm256_clmulepi64_epi128(a
: __m256i
, b
: __m256i
, imm8
: i32) -> __m256i
{
63 pclmulqdq_256(a
, b
, $imm8
)
66 constify_imm8
!(imm8
, call
)
71 // The constants in the tests below are just bit patterns. They should not
72 // be interpreted as integers; signedness does not make sense for them, but
73 // __mXXXi happens to be defined in terms of signed integers.
74 #![allow(overflowing_literals)]
76 use stdarch_test
::simd_test
;
78 use crate::core_arch
::x86
::*;
// Known-answer test shared by the 256- and 512-bit intrinsics: broadcasts a
// 128-bit test vector to every lane with `$broadcast`, runs `$clmul` for each
// of the four immediates, and compares against reference results with
// `$assert`. (Fix: restored the macro braces and arm terminator dropped from
// the mangled source.)
macro_rules! verify_kat_pclmul {
    ($broadcast:ident, $clmul:ident, $assert:ident) => {
        // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
        let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
        let a = $broadcast(a);
        let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
        let b = $broadcast(b);
        let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
        let r00 = $broadcast(r00);
        let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
        let r01 = $broadcast(r01);
        let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
        let r10 = $broadcast(r10);
        let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
        let r11 = $broadcast(r11);

        // One check per immediate: low/high 64-bit half selection of each lane.
        $assert($clmul(a, b, 0x00), r00);
        $assert($clmul(a, b, 0x10), r01);
        $assert($clmul(a, b, 0x01), r10);
        $assert($clmul(a, b, 0x11), r11);

        // Minimal sanity case: 0x8000...0000 clmul itself carries into the
        // high half (0x4000...0000 : 0x0000...0000).
        let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
        let a0 = $broadcast(a0);
        let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
        let r = $broadcast(r);
        $assert($clmul(a0, a0, 0x00), r);
    };
}
// Manually-unrolled helper: expands `$target[i] = $op($source, i);` (or the
// `assert_eq_m128i` comparison form) for i = N-1 down to 0. The 4-element arms
// emit indices 3 and 2 and then recurse into the matching 2-element arm.
// (Fix: restored the `};` arm terminators and macro braces dropped from the
// mangled source.)
macro_rules! unroll {
    ($target:ident[4] = $op:ident($source:ident,4);) => {
        $target[3] = $op($source, 3);
        $target[2] = $op($source, 2);
        unroll! {$target[2] = $op($source,2);}
    };
    ($target:ident[2] = $op:ident($source:ident,2);) => {
        $target[1] = $op($source, 1);
        $target[0] = $op($source, 0);
    };
    (assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
        assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
        assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
        unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
    };
    (assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
        assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
        assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
    };
}
130 // this function tests one of the possible 4 instances
131 // with different inputs across lanes
// NOTE(review): this fragment is whitespace-mangled and several original lines
// are missing (the `_mm512_set_epi64` constant arguments, the `for` loop
// header, and closing braces). Left byte-identical rather than reconstructed,
// since the test constants cannot be recovered from what is visible here.
132 #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
// Takes the 128-bit reference implementation (`linear`) and the 512-bit
// implementation under test (`vectorized`) and compares them lane by lane.
133 unsafe fn verify_512_helper(
134 linear
: unsafe fn(__m128i
, __m128i
) -> __m128i
,
135 vectorized
: unsafe fn(__m512i
, __m512i
) -> __m512i
,
// Build one 512-bit input per operand; the eight 64-bit constant arguments
// were lost in the mangling — TODO restore from upstream.
137 let a
= _mm512_set_epi64(
147 let b
= _mm512_set_epi64(
// Decompose each operand into its four 128-bit lanes via `unroll!`.
158 let mut a_decomp
= [_mm_setzero_si128(); 4];
159 unroll
! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
160 let mut b_decomp
= [_mm_setzero_si128(); 4];
161 unroll
! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}
// Single full-width call under test.
163 let r
= vectorized(a
, b
);
// Per-lane expected results from the 128-bit reference implementation
// (the enclosing `for` header over lane index `i` is missing here).
164 let mut e_decomp
= [_mm_setzero_si128(); 4];
166 e_decomp
[i
] = linear(a_decomp
[i
], b_decomp
[i
]);
// Every extracted lane of the vectorized result must match its reference.
168 unroll
! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
171 // this function tests one of the possible 4 instances
172 // with different inputs across lanes for the VL version
// NOTE(review): mangled fragment with missing lines (the `_mm512_set_epi64`
// constants, the `let r = vectorized(...)` header, the `for` loop header, and
// closing braces). Left byte-identical; constants are not recoverable here.
173 #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
// Compares the 128-bit reference (`linear`) against the 256-bit VL
// implementation under test (`vectorized`), lane by lane.
174 unsafe fn verify_256_helper(
175 linear
: unsafe fn(__m128i
, __m128i
) -> __m128i
,
176 vectorized
: unsafe fn(__m256i
, __m256i
) -> __m256i
,
// 512-bit staging values; only the low 256 bits appear to be fed to
// `vectorized` below via `_mm512_extracti64x4_epi64(_, 0)`.
178 let a
= _mm512_set_epi64(
188 let b
= _mm512_set_epi64(
// Decompose the two 128-bit lanes of each operand via `unroll!`.
199 let mut a_decomp
= [_mm_setzero_si128(); 2];
200 unroll
! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
201 let mut b_decomp
= [_mm_setzero_si128(); 2];
202 unroll
! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}
// Arguments to the (missing) `vectorized(...)` call: the low 256 bits of
// each 512-bit staging value.
205 _mm512_extracti64x4_epi64(a
, 0),
206 _mm512_extracti64x4_epi64(b
, 0),
// Per-lane expected results (the `for` header over `i` is missing here).
208 let mut e_decomp
= [_mm_setzero_si128(); 2];
210 e_decomp
[i
] = linear(a_decomp
[i
], b_decomp
[i
]);
// Both extracted lanes of the vectorized result must match their reference.
212 unroll
! {assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
215 #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
216 unsafe fn test_mm512_clmulepi64_epi128() {
218 _mm512_broadcast_i32x4
,
219 _mm512_clmulepi64_epi128
,
224 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x00),
225 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x00),
228 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x01),
229 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x01),
232 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x10),
233 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x10),
236 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x11),
237 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x11),
// NOTE(review): mangled fragment whose closing tokens run past the visible
// end of this chunk; left byte-identical rather than reconstructed.
241 #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
242 unsafe fn test_mm256_clmulepi64_epi128() {
// Known-answer test arguments — presumably wrapped in a (missing)
// `verify_kat_pclmul!(...)` invocation, per the macro's
// ($broadcast, $clmul, $assert) signature; the $assert argument is not
// visible here — TODO confirm against upstream.
244 _mm256_broadcastsi128_si256
,
245 _mm256_clmulepi64_epi128
,
// Cross-lane checks: closure pairs (reference, under-test), one per
// immediate — presumably each pair is an argument list to a (missing)
// `verify_256_helper(...)` call.
250 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x00),
251 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x00),
254 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x01),
255 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x01),
258 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x10),
259 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x10),
262 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x11),
263 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x11),