explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3,
                   uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7)
   {
   // _mm256_set_epi32 orders its arguments from the most significant
   // lane down, so reversing B7..B0 places B0 in lane 0
   m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0);
   }
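// Hedged standalone sketch (the helper name is hypothetical, not from the
// original source; assumes <immintrin.h> is in scope, as the code above
// already requires): _mm256_set_epi32 with reversed arguments is equivalent
// to _mm256_setr_epi32 in lane order, so SIMD_8x32(0, 1, ..., 7) puts 0 in
// lane 0.
inline bool lane_order_check()
   {
   const __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   const __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
   // All 32 byte-comparison bits are set iff every lane matches
   return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == -1;
   }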
template<size_t ROT>
SIMD_8x32 rotl() const
   {
   static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
#if defined(__AVX512VL__)
   // AVX-512VL provides a native per-lane 32-bit rotate
   return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
#else
   if constexpr(ROT == 8)
      {
      // Rotation by 8 permutes whole bytes, so a single shuffle suffices
      const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
                                                  14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
      }
   else if constexpr(ROT == 16)
      {
      // Likewise for rotation by 16
      const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
                                                   13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
      }
   else
      {
      // Generic case: combine a left shift with the complementary right shift
      return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
                                       _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
      }
#endif
   }
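// Hedged standalone sketch (the helper name is hypothetical, not from the
// original source; assumes <immintrin.h> and <cstdint> are in scope): on
// little-endian lanes, rotl by 8 moves each lane's top byte to the bottom,
// which is exactly what the shuf_rotl_8 mask above encodes. Check the mask
// against a plain scalar rotate for one lane value:
inline bool rotl8_shuffle_check()
   {
   const uint32_t x = 0x01234567;
   const __m256i v = _mm256_set1_epi32(static_cast<int>(x));
   const __m256i m = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
                                     14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
   uint32_t out[8];
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), _mm256_shuffle_epi8(v, m));
   return out[0] == ((x << 8) | (x >> 24)); // both sides are 0x23456701
   }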
static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, SIMD_8x32& B2, SIMD_8x32& B3)
   {
   // Interleave at 32-bit and then 64-bit granularity; since the unpack
   // intrinsics stay within 128-bit halves, this transposes two
   // independent 4x4 blocks, one per half of each register
   const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
   const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
   const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
   const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);

   B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
   B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
   B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
   B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
   }
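// Hedged standalone sketch (the helper name is hypothetical, not from the
// original source; assumes <immintrin.h> and <cstdint> are in scope):
// replaying the unpack sequence above on rows {r*10+0, ..., r*10+7} shows
// the two-4x4-block behavior. The first output register ends up holding
// column 0 of each half, i.e. {0, 10, 20, 30, 4, 14, 24, 34}:
inline bool transpose_4x4_check()
   {
   __m256i B0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
   __m256i B1 = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
   __m256i B2 = _mm256_setr_epi32(20, 21, 22, 23, 24, 25, 26, 27);
   __m256i B3 = _mm256_setr_epi32(30, 31, 32, 33, 34, 35, 36, 37);

   const __m256i T0 = _mm256_unpacklo_epi32(B0, B1); // {0,10,1,11 | 4,14,5,15}
   const __m256i T1 = _mm256_unpacklo_epi32(B2, B3); // {20,30,21,31 | 24,34,25,35}
   B0 = _mm256_unpacklo_epi64(T0, T1);               // {0,10,20,30 | 4,14,24,34}

   uint32_t out[8];
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), B0);
   return out[0] == 0 && out[1] == 10 && out[2] == 20 && out[3] == 30 &&
          out[4] == 4 && out[5] == 14 && out[6] == 24 && out[7] == 34;
   }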