9#include <botan/internal/ct_utils.h>
/*
* Lane-wise IDEA multiplication.
*
* Multiplies each unsigned 16-bit lane of X by the scalar K_16 modulo
* 65537 (2^16 + 1), using the convention that the value 0 (in a lane or
* in K_16) stands for 2^16.
*
* @param X    eight 16-bit lanes to be multiplied
* @param K_16 multiplier, broadcast to every lane
* @return     the eight lane-wise products, with 2^16 again encoded as 0
*
* Every selection below is done with bit masks, so the instruction
* sequence does not branch on the values of X or K_16.
*
* NOTE(review): the two `T = _mm_or_si128(` openers and the final
* `return T;` were not present in the listing this block was recovered
* from; they are reconstructed from the surrounding select expressions.
* Confirm against the upstream file.
*/
inline __m128i mul(__m128i X, uint16_t K_16)
{
   const __m128i zeros = _mm_set1_epi16(0);
   const __m128i ones = _mm_set1_epi16(1);

   const __m128i K = _mm_set1_epi16(K_16);

   // All-ones masks for the lanes where an operand is the special value 0.
   const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
   const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);

   // 32-bit product split into its low and high 16-bit halves.
   const __m128i mul_lo = _mm_mullo_epi16(X, K);
   const __m128i mul_hi = _mm_mulhi_epu16(X, K);

   // hi * 2^16 + lo == lo - hi (mod 2^16 + 1)
   __m128i T = _mm_sub_epi16(mul_lo, mul_hi);

   // Unsigned compare: cmp = 1 where mul_lo < mul_hi, else 0.
   // The saturating difference is non-zero exactly in those lanes; folding
   // the high byte into the low byte and clamping bytewise against
   // 0x0001 turns "non-zero" into the 16-bit value 1.
   const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
   const __m128i cmp = _mm_min_epu8(
      _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);

   // Where the subtraction wrapped, add 65537, which is +1 modulo 2^16.
   T = _mm_add_epi16(T, cmp);

   // Where X is zero (i.e. 2^16 == -1 mod 65537) the product is 1 - K.
   T = _mm_or_si128(
      _mm_andnot_si128(X_is_zero, T),
      _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));

   // Where K is zero the product is 1 - X.
   T = _mm_or_si128(
      _mm_andnot_si128(K_is_zero, T),
      _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));

   return T;
}
/*
* Regroup 32 16-bit words so that each register holds one word position.
*
* Viewing B0,B1,B2,B3 on entry as one sequence w[0..31] of 16-bit words
* (B0 = w[0..7], B1 = w[8..15], ...), on return lane j of Bk holds
* w[4*j + k] for k in 0..3 and j in 0..7. In other words, register Bk
* collects the k-th word of each of the eight 4-word groups.
*
* @param B0,B1,B2,B3 registers updated in place
*/
void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
{
   // Two rounds of 32-bit interleaving bring the word pairs that belong
   // to the same output register into a common register.
   __m128i T0 = _mm_unpackhi_epi32(B0, B1);
   __m128i T1 = _mm_unpacklo_epi32(B0, B1);
   __m128i T2 = _mm_unpackhi_epi32(B2, B3);
   __m128i T3 = _mm_unpacklo_epi32(B2, B3);

   __m128i T4 = _mm_unpacklo_epi32(T0, T1);
   __m128i T5 = _mm_unpackhi_epi32(T0, T1);
   __m128i T6 = _mm_unpacklo_epi32(T2, T3);
   __m128i T7 = _mm_unpackhi_epi32(T2, T3);

   // Reorder the 16-bit words inside the upper 64 bits ...
   T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
   T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
   T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
   T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));

   // ... and the same permutation inside the lower 64 bits.
   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));

   // Swap the two middle 32-bit lanes so each 64-bit half belongs to a
   // single output register.
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // Stitch the 64-bit halves together into the final registers.
   B0 = _mm_unpacklo_epi64(T0, T2);
   B1 = _mm_unpackhi_epi64(T0, T2);
   B2 = _mm_unpacklo_epi64(T1, T3);
   B3 = _mm_unpackhi_epi64(T1, T3);
}
/*
* Interleave four word-position registers back into 4-word groups.
*
* Viewing B0,B1,B2,B3 on return as one sequence o[0..31] of 16-bit words,
* o[4*j + k] equals lane j of the register passed as Bk on entry
* (k in 0..3, j in 0..7). This is the inverse of the rearrangement done
* by transpose_in.
*
* @param B0,B1,B2,B3 registers updated in place
*/
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
{
   // Pair up matching 64-bit halves of the inputs.
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);

   // Swap the two middle 32-bit lanes ...
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // ... then the two middle 16-bit words of the upper half ...
   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // ... and of the lower half, leaving words from the first two inputs
   // alternating in T0/T2 and words from the last two in T1/T3.
   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   // A final 32-bit interleave yields complete 4-word groups.
   B0 = _mm_unpacklo_epi32(T0, T1);
   B1 = _mm_unpackhi_epi32(T0, T1);
   B2 = _mm_unpacklo_epi32(T2, T3);
   B3 = _mm_unpackhi_epi32(T2, T3);
}
137void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52])
const
143 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
145 __m128i B0 = _mm_loadu_si128(in_mm + 0);
146 __m128i B1 = _mm_loadu_si128(in_mm + 1);
147 __m128i B2 = _mm_loadu_si128(in_mm + 2);
148 __m128i B3 = _mm_loadu_si128(in_mm + 3);
150 transpose_in(B0, B1, B2, B3);
153 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
154 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
155 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
156 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
158 for(
size_t i = 0; i != 8; ++i)
160 B0 = mul(B0, EK[6*i+0]);
161 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
162 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
163 B3 = mul(B3, EK[6*i+3]);
166 B2 = _mm_xor_si128(B2, B0);
167 B2 = mul(B2, EK[6*i+4]);
171 B1 = _mm_xor_si128(B1, B3);
172 B1 = _mm_add_epi16(B1, B2);
173 B1 = mul(B1, EK[6*i+5]);
175 B2 = _mm_add_epi16(B2, B1);
177 B0 = _mm_xor_si128(B0, B1);
178 B1 = _mm_xor_si128(B1, T0);
179 B3 = _mm_xor_si128(B3, B2);
180 B2 = _mm_xor_si128(B2, T1);
183 B0 = mul(B0, EK[48]);
184 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
185 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
186 B3 = mul(B3, EK[51]);
189 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
190 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
191 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
192 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
194 transpose_out(B0, B2, B1, B3);
196 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
198 _mm_storeu_si128(out_mm + 0, B0);
199 _mm_storeu_si128(out_mm + 1, B2);
200 _mm_storeu_si128(out_mm + 2, B1);
201 _mm_storeu_si128(out_mm + 3, B3);
/* Function-like macro taking one argument (an ISA name); as defined here
   it expands to nothing, so any use of it has no effect on the code. */
#define BOTAN_FUNC_ISA(isa)
void poison(const T *p, size_t n)
void unpoison(const T *p, size_t n)