#include <botan/ghash.h>
#include <botan/internal/simd_32.h>

#if defined(BOTAN_SIMD_USE_SSE2)
   #include <immintrin.h>
   #include <wmmintrin.h>
#endif

namespace Botan {

namespace {
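// GHASH views each 16 byte block as a big-endian polynomial over GF(2);
// the little-endian SIMD loads are byte-reversed so the carryless
// arithmetic sees a consistent bit ordering.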
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) reverse_vector(const SIMD_4x32& in)
   {
#if defined(BOTAN_SIMD_USE_SSE2)
   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
#elif defined(BOTAN_SIMD_USE_NEON)
   const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
   const uint8x16_t mask = vld1q_u8(maskb);
   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
   return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask));
#endif
   }
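// One 64x64 -> 128 bit carryless multiply. The template parameter M mirrors
// the immediate of PCLMULQDQ: bit 0 selects which 64-bit half of x is used,
// bit 4 which half of H.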
template<int M>
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
   {
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
#if defined(BOTAN_SIMD_USE_SSE2)
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
#elif defined(BOTAN_SIMD_USE_NEON)
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
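   // vpmsumd computes the XOR of two 64x64 carryless products, so the
   // inputs are masked or shifted until the product term not selected
   // by M vanishes.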
   SIMD_4x32 i1 = x;
   SIMD_4x32 i2 = H;

   if(M == 0x11)
      {
      i1 &= mask_lo;
      i2 &= mask_lo;
      }
   else if(M == 0x10)
      {
      i1 = i1.shift_elems_left<2>();
      }
   else if(M == 0x01)
      {
      i2 = i2.shift_elems_left<2>();
      }
   else if(M == 0x00)
      {
      i1 = mask_lo.andc(i1);
      i2 = mask_lo.andc(i2);
      }
   auto i1v = reinterpret_cast<__vector unsigned long long>(i1.raw());
   auto i2v = reinterpret_cast<__vector unsigned long long>(i2.raw());
#if defined(__clang__)
   auto rv = __builtin_altivec_crypto_vpmsumd(i1v, i2v);
#else
   auto rv = __builtin_crypto_vpmsumd(i1v, i2v);
#endif
   return SIMD_4x32(reinterpret_cast<__vector unsigned int>(rv));
#endif
   }
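// Reduce the 256-bit carryless product (B0 = high 128 bits, B1 = low
// 128 bits) modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1.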
inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1)
   {
   SIMD_4x32 X0 = B1.shr<31>();
   SIMD_4x32 X1 = B1.shl<1>();
   SIMD_4x32 X2 = B0.shr<31>();
   SIMD_4x32 X3 = B0.shl<1>();

   X3 |= X0.shift_elems_right<3>();
   X3 |= X2.shift_elems_left<1>();
   X1 |= X0.shift_elems_left<1>();

   X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();

   X1 ^= X0.shift_elems_left<3>();
   X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
   X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();

   return X0;
   }
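// Multiply H and x in GF(2^128): four 64x64 carryless products form the
// 256-bit schoolbook product, the cross terms are folded together, then
// the result is reduced.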
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x)
   {
   SIMD_4x32 T0 = clmul<0x11>(H, x);
   SIMD_4x32 T1 = clmul<0x10>(H, x);
   SIMD_4x32 T2 = clmul<0x01>(H, x);
   SIMD_4x32 T3 = clmul<0x00>(H, x);
   T1 ^= T2;
   T0 ^= T1.shift_elems_right<2>();
   T3 ^= T1.shift_elems_left<2>();
   return gcm_reduce(T0, T3);
   }
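// Multiply four blocks by four powers of H with a single deferred
// reduction, aggregating the Karatsuba middle terms in T (a technique
// due to Krzysztof Jankowski and Pierre Laurent of Intel).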
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
   gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2,
                   const SIMD_4x32& H3, const SIMD_4x32& H4,
                   const SIMD_4x32& X1, const SIMD_4x32& X2,
                   const SIMD_4x32& X3, const SIMD_4x32& X4)
   {
   const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^
                        (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));

   const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^
                        (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
   SIMD_4x32 T;

   T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
   T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
   T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
   T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
   T ^= lo;
   T ^= hi;
   return gcm_reduce(hi ^ T.shift_elems_right<2>(),
                     lo ^ T.shift_elems_left<2>());
   }

}
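// Precompute H, H^2, H^3 and H^4 for the 4-way aggregated multiply; each
// power is stored as a pair of 64-bit words in H_pow.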
BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
void GHASH::ghash_precompute_cpu(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
   {
   const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
   const SIMD_4x32 H2 = gcm_multiply(H1, H1);
   const SIMD_4x32 H3 = gcm_multiply(H1, H2);
   const SIMD_4x32 H4 = gcm_multiply(H2, H2);
   H1.store_le(H_pow);
   H2.store_le(H_pow + 2);
   H3.store_le(H_pow + 4);
   H4.store_le(H_pow + 6);
   }
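// Absorb the input blocks into the GHASH state x: groups of four blocks
// go through the aggregated multiply, the tail is handled one block at
// a time.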
BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
void GHASH::ghash_multiply_cpu(uint8_t x[16],
                               const uint64_t H_pow[8],
                               const uint8_t input[], size_t blocks)
   {
   const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);

   SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));

   if(blocks >= 4)
      {
      const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
      const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
      const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);

      while(blocks >= 4)
         {
         const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input));
         const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1));
         const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2));
         const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3));

         a ^= m0;
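         // a (which has already folded in m0, the oldest block) pairs
         // with H^4, while the newest block m3 pairs with H^1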
         a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);

         input += 4*16;
         blocks -= 4;
         }
      }
   for(size_t i = 0; i != blocks; ++i)
      {
      const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i));

      a ^= m;
      a = gcm_multiply(H1, a);
      }
   a = reverse_vector(a);

   a.store_le(x);
   }

}