Botan 2.19.3
Crypto and TLS for C++11
threefish_512_avx2.cpp
Go to the documentation of this file.
1/*
2* Threefish-512 using AVX2
3* (C) 2013,2016 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/threefish_512.h>
9#include <immintrin.h>
10
11namespace Botan {
12
13namespace {
14
15BOTAN_FUNC_ISA("avx2")
16inline void interleave_epi64(__m256i& X0, __m256i& X1)
17 {
18 // interleave X0 and X1 qwords
19 // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
20
21 const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
22 const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);
23
24 X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
25 X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
26 }
27
28BOTAN_FUNC_ISA("avx2")
29inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
30 {
31 const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
32 const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));
33
34 X0 = _mm256_unpacklo_epi64(T0, T1);
35 X1 = _mm256_unpackhi_epi64(T0, T1);
36 }
37
38BOTAN_FUNC_ISA("avx2")
39inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
40 {
41 /*
42 Behold. The key schedule progresses like so. The values
43 loop back to the originals after the rounds are complete
44 so we don't need to reload for starting the next block.
45
46 R0 R1 R2
47 K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
48 K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
49 K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)
50
51 K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
52 K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
53 K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)
54
55 K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
56 K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
57 K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)
58
59 To compute the values for the next round:
60 X0 is X2 from the last round
61 X1 becomes (X0[4],X1[1:3])
62 X2 becomes (X1[4],X2[1:3])
63
64 Uses 3 permutes and 2 blends, is there a faster way?
65 */
66 __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
67 __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
68 __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));
69
70 R0 = _mm256_blend_epi32(T1, T0, 0xC0);
71 R1 = _mm256_blend_epi32(T2, T1, 0xC0);
72 }
73
74
75}
76
77BOTAN_FUNC_ISA("avx2")
78void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
79 {
80 _mm256_zeroupper();
81
82 const uint64_t* K = m_K.data();
83 const uint64_t* T_64 = m_T.data();
84
85 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
86 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
87 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
88 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
89 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
90 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
91 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
92 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
93
94#define THREEFISH_ROUND(X0, X1, SHL) \
95 do { \
96 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
97 X0 = _mm256_add_epi64(X0, X1); \
98 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
99 X1 = _mm256_xor_si256(X1, X0); \
100 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
101 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
102 } while(0)
103
104#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \
105 do { \
106 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
107 X0 = _mm256_add_epi64(X0, X1); \
108 X2 = _mm256_add_epi64(X2, X3); \
109 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
110 X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
111 X1 = _mm256_xor_si256(X1, X0); \
112 X3 = _mm256_xor_si256(X3, X2); \
113 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
114 X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \
115 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
116 X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
117 } while(0)
118
119#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
120 do { \
121 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
122 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
123 X0 = _mm256_add_epi64(X0, K0); \
124 X1 = _mm256_add_epi64(X1, K1); \
125 X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \
126 X0 = _mm256_add_epi64(X0, T0); \
127 X1 = _mm256_add_epi64(X1, T1); \
128 } while(0)
129
130#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
131 do { \
132 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
133 __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
134 X0 = _mm256_add_epi64(X0, K0); \
135 X2 = _mm256_add_epi64(X2, K0); \
136 X1 = _mm256_add_epi64(X1, K1); \
137 X3 = _mm256_add_epi64(X3, K1); \
138 T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
139 X0 = _mm256_add_epi64(X0, T0); \
140 X2 = _mm256_add_epi64(X2, T0); \
141 X1 = _mm256_add_epi64(X1, T1); \
142 X3 = _mm256_add_epi64(X3, T1); \
143 } while(0)
144
145#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
146 do { \
147 rotate_keys(K1, K2, K0); \
148 THREEFISH_ROUND(X0, X1, ROTATE_1); \
149 THREEFISH_ROUND(X0, X1, ROTATE_2); \
150 THREEFISH_ROUND(X0, X1, ROTATE_3); \
151 THREEFISH_ROUND(X0, X1, ROTATE_4); \
152 THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \
153 \
154 THREEFISH_ROUND(X0, X1, ROTATE_5); \
155 THREEFISH_ROUND(X0, X1, ROTATE_6); \
156 THREEFISH_ROUND(X0, X1, ROTATE_7); \
157 THREEFISH_ROUND(X0, X1, ROTATE_8); \
158 THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \
159 } while(0)
160
161#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
162 do { \
163 rotate_keys(K1, K2, K0); \
164 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
165 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
166 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
167 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
168 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \
169 \
170 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
171 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
172 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
173 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
174 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \
175 } while(0)
176
177 __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
178 __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
179 __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
180
181 const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
182 __m256i* out_mm = reinterpret_cast<__m256i*>(out);
183
184 while(blocks >= 2)
185 {
186 __m256i X0 = _mm256_loadu_si256(in_mm++);
187 __m256i X1 = _mm256_loadu_si256(in_mm++);
188 __m256i X2 = _mm256_loadu_si256(in_mm++);
189 __m256i X3 = _mm256_loadu_si256(in_mm++);
190
191 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
192
193 interleave_epi64(X0, X1);
194 interleave_epi64(X2, X3);
195
196 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);
197
198 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3);
199 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1);
200 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2);
201 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3);
202 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1);
203 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
204 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
205 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
206 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);
207
208 deinterleave_epi64(X0, X1);
209 deinterleave_epi64(X2, X3);
210
211 _mm256_storeu_si256(out_mm++, X0);
212 _mm256_storeu_si256(out_mm++, X1);
213 _mm256_storeu_si256(out_mm++, X2);
214 _mm256_storeu_si256(out_mm++, X3);
215
216 blocks -= 2;
217 }
218
219 for(size_t i = 0; i != blocks; ++i)
220 {
221 __m256i X0 = _mm256_loadu_si256(in_mm++);
222 __m256i X1 = _mm256_loadu_si256(in_mm++);
223
224 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
225
226 interleave_epi64(X0, X1);
227
228 THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);
229
230 THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3);
231 THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1);
232 THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2);
233 THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3);
234 THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1);
235 THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
236 THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
237 THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
238 THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);
239
240 deinterleave_epi64(X0, X1);
241
242 _mm256_storeu_si256(out_mm++, X0);
243 _mm256_storeu_si256(out_mm++, X1);
244 }
245
246 _mm256_zeroall();
247
248#undef THREEFISH_ENC_8_ROUNDS
249#undef THREEFISH_ROUND
250#undef THREEFISH_INJECT_KEY
251#undef THREEFISH_DEC_2_8_ROUNDS
252#undef THREEFISH_ROUND_2
253#undef THREEFISH_INJECT_KEY_2
254 }
255
256BOTAN_FUNC_ISA("avx2")
257void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
258 {
259 _mm256_zeroupper();
260
261 const uint64_t* K = m_K.data();
262 const uint64_t* T_64 = m_T.data();
263
264 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
265 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
266 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
267 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
268 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
269 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
270 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
271 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
272
273#define THREEFISH_ROUND(X0, X1, SHR) \
274 do { \
275 const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
276 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
277 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
278 X1 = _mm256_xor_si256(X1, X0); \
279 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
280 X0 = _mm256_sub_epi64(X0, X1); \
281 } while(0)
282
283#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) \
284 do { \
285 const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
286 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
287 X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \
288 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
289 X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
290 X1 = _mm256_xor_si256(X1, X0); \
291 X3 = _mm256_xor_si256(X3, X2); \
292 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
293 X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
294 X0 = _mm256_sub_epi64(X0, X1); \
295 X2 = _mm256_sub_epi64(X2, X3); \
296 } while(0)
297
298#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
299 do { \
300 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
301 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
302 X0 = _mm256_sub_epi64(X0, K0); \
303 X1 = _mm256_sub_epi64(X1, K1); \
304 X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); \
305 X0 = _mm256_sub_epi64(X0, T0); \
306 X1 = _mm256_sub_epi64(X1, T1); \
307 } while(0)
308
309#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
310 do { \
311 THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \
312 THREEFISH_ROUND(X0, X1, ROTATE_8); \
313 THREEFISH_ROUND(X0, X1, ROTATE_7); \
314 THREEFISH_ROUND(X0, X1, ROTATE_6); \
315 THREEFISH_ROUND(X0, X1, ROTATE_5); \
316 \
317 THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
318 THREEFISH_ROUND(X0, X1, ROTATE_4); \
319 THREEFISH_ROUND(X0, X1, ROTATE_3); \
320 THREEFISH_ROUND(X0, X1, ROTATE_2); \
321 THREEFISH_ROUND(X0, X1, ROTATE_1); \
322 } while(0)
323
324#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
325 do { \
326 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
327 __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
328 X0 = _mm256_sub_epi64(X0, K0); \
329 X2 = _mm256_sub_epi64(X2, K0); \
330 X1 = _mm256_sub_epi64(X1, K1); \
331 X3 = _mm256_sub_epi64(X3, K1); \
332 T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
333 X0 = _mm256_sub_epi64(X0, T0); \
334 X2 = _mm256_sub_epi64(X2, T0); \
335 X1 = _mm256_sub_epi64(X1, T1); \
336 X3 = _mm256_sub_epi64(X3, T1); \
337 } while(0)
338
339#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
340 do { \
341 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \
342 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
343 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
344 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
345 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
346 \
347 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
348 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
349 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
350 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
351 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
352 } while(0)
353
354 /*
355 v1.0 key schedule: 9 ymm registers (only need 2 or 3)
356 (0,1,2,3),(4,5,6,7) [8]
357 then mutating with vpermq
358 */
359 const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
360 const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
361 const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
362 const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
363 const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
364 const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
365 const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
366 const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
367 const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
368
369 const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
370 __m256i* out_mm = reinterpret_cast<__m256i*>(out);
371
372 while(blocks >= 2)
373 {
374 __m256i X0 = _mm256_loadu_si256(in_mm++);
375 __m256i X1 = _mm256_loadu_si256(in_mm++);
376 __m256i X2 = _mm256_loadu_si256(in_mm++);
377 __m256i X3 = _mm256_loadu_si256(in_mm++);
378
379 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
380
381 interleave_epi64(X0, X1);
382 interleave_epi64(X2, X3);
383
384 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
385 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
386 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
387 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
388 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
389 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
390 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
391 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
392 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);
393
394 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);
395
396 deinterleave_epi64(X0, X1);
397 deinterleave_epi64(X2, X3);
398
399 _mm256_storeu_si256(out_mm++, X0);
400 _mm256_storeu_si256(out_mm++, X1);
401 _mm256_storeu_si256(out_mm++, X2);
402 _mm256_storeu_si256(out_mm++, X3);
403
404 blocks -= 2;
405 }
406
407 for(size_t i = 0; i != blocks; ++i)
408 {
409 __m256i X0 = _mm256_loadu_si256(in_mm++);
410 __m256i X1 = _mm256_loadu_si256(in_mm++);
411
412 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
413
414 interleave_epi64(X0, X1);
415
416 THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
417 THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
418 THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
419 THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
420 THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
421 THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
422 THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
423 THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
424 THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);
425
426 THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);
427
428 deinterleave_epi64(X0, X1);
429
430 _mm256_storeu_si256(out_mm++, X0);
431 _mm256_storeu_si256(out_mm++, X1);
432 }
433
434#undef THREEFISH_DEC_8_ROUNDS
435#undef THREEFISH_ROUND
436#undef THREEFISH_INJECT_KEY
437#undef THREEFISH_DEC_2_8_ROUNDS
438#undef THREEFISH_ROUND_2
439#undef THREEFISH_INJECT_KEY_2
440
441 _mm256_zeroall();
442 }
443
444}
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:77
fe T
Definition ge.cpp:37
#define THREEFISH_ENC_8_ROUNDS(R1, R2)
#define THREEFISH_DEC_8_ROUNDS(R1, R2)
#define THREEFISH_INJECT_KEY(r)
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2)
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)
#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2)