Botan 2.19.3
Crypto and TLS for C++
idea_sse2.cpp
Go to the documentation of this file.
1/*
2* IDEA in SSE2
3* (C) 2009 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/idea.h>
9#include <botan/internal/ct_utils.h>
10#include <emmintrin.h>
11
12namespace Botan {
13
14namespace {
15
16BOTAN_FUNC_ISA("sse2")
17inline __m128i mul(__m128i X, uint16_t K_16)
18 {
19 const __m128i zeros = _mm_set1_epi16(0);
20 const __m128i ones = _mm_set1_epi16(1);
21
22 const __m128i K = _mm_set1_epi16(K_16);
23
24 const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
25 const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
26
27 const __m128i mul_lo = _mm_mullo_epi16(X, K);
28 const __m128i mul_hi = _mm_mulhi_epu16(X, K);
29
30 __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
31
32 // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
33 const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
34 const __m128i cmp = _mm_min_epu8(
35 _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
36
37 T = _mm_add_epi16(T, cmp);
38
39 /* Selection: if X[i] is zero then assign 1-K
40 if K is zero then assign 1-X[i]
41
42 Could if() off value of K_16 for the second, but this gives a
43 constant time implementation which is a nice bonus.
44 */
45
46 T = _mm_or_si128(
47 _mm_andnot_si128(X_is_zero, T),
48 _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
49
50 T = _mm_or_si128(
51 _mm_andnot_si128(K_is_zero, T),
52 _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
53
54 return T;
55 }
56
57/*
58* 4x8 matrix transpose
59*
60* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
61* transpose_out doesn't need it. Something with the shuffle? Removing
62* that extra unpack could easily save 3-4 cycles per block, and would
63* also help a lot with register pressure on 32-bit x86
64*/
65BOTAN_FUNC_ISA("sse2")
66void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
67 {
68 __m128i T0 = _mm_unpackhi_epi32(B0, B1);
69 __m128i T1 = _mm_unpacklo_epi32(B0, B1);
70 __m128i T2 = _mm_unpackhi_epi32(B2, B3);
71 __m128i T3 = _mm_unpacklo_epi32(B2, B3);
72
73 __m128i T4 = _mm_unpacklo_epi32(T0, T1);
74 __m128i T5 = _mm_unpackhi_epi32(T0, T1);
75 __m128i T6 = _mm_unpacklo_epi32(T2, T3);
76 __m128i T7 = _mm_unpackhi_epi32(T2, T3);
77
78 T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
79 T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
80 T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
81 T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
82
83 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
84 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
85 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
86 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
87
88 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
89 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
90 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
91 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
92
93 B0 = _mm_unpacklo_epi64(T0, T2);
94 B1 = _mm_unpackhi_epi64(T0, T2);
95 B2 = _mm_unpacklo_epi64(T1, T3);
96 B3 = _mm_unpackhi_epi64(T1, T3);
97 }
98
99/*
100* 4x8 matrix transpose (reverse)
101*/
102BOTAN_FUNC_ISA("sse2")
103void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
104 {
105 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
106 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
107 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
108 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
109
110 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
111 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
112 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
113 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
114
115 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
116 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
117 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
118 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
119
120 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
121 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
122 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
123 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
124
125 B0 = _mm_unpacklo_epi32(T0, T1);
126 B1 = _mm_unpackhi_epi32(T0, T1);
127 B2 = _mm_unpacklo_epi32(T2, T3);
128 B3 = _mm_unpackhi_epi32(T2, T3);
129 }
130
131}
132
133/*
134* 8 wide IDEA encryption/decryption in SSE2
135*/
136BOTAN_FUNC_ISA("sse2")
137void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const
138 {
139 CT::poison(in, 64);
140 CT::poison(out, 64);
141 CT::poison(EK, 52);
142
143 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
144
145 __m128i B0 = _mm_loadu_si128(in_mm + 0);
146 __m128i B1 = _mm_loadu_si128(in_mm + 1);
147 __m128i B2 = _mm_loadu_si128(in_mm + 2);
148 __m128i B3 = _mm_loadu_si128(in_mm + 3);
149
150 transpose_in(B0, B1, B2, B3);
151
152 // byte swap
153 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
154 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
155 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
156 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
157
158 for(size_t i = 0; i != 8; ++i)
159 {
160 B0 = mul(B0, EK[6*i+0]);
161 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
162 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
163 B3 = mul(B3, EK[6*i+3]);
164
165 __m128i T0 = B2;
166 B2 = _mm_xor_si128(B2, B0);
167 B2 = mul(B2, EK[6*i+4]);
168
169 __m128i T1 = B1;
170
171 B1 = _mm_xor_si128(B1, B3);
172 B1 = _mm_add_epi16(B1, B2);
173 B1 = mul(B1, EK[6*i+5]);
174
175 B2 = _mm_add_epi16(B2, B1);
176
177 B0 = _mm_xor_si128(B0, B1);
178 B1 = _mm_xor_si128(B1, T0);
179 B3 = _mm_xor_si128(B3, B2);
180 B2 = _mm_xor_si128(B2, T1);
181 }
182
183 B0 = mul(B0, EK[48]);
184 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
185 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
186 B3 = mul(B3, EK[51]);
187
188 // byte swap
189 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
190 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
191 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
192 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
193
194 transpose_out(B0, B2, B1, B3);
195
196 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
197
198 _mm_storeu_si128(out_mm + 0, B0);
199 _mm_storeu_si128(out_mm + 1, B2);
200 _mm_storeu_si128(out_mm + 2, B1);
201 _mm_storeu_si128(out_mm + 3, B3);
202
203 CT::unpoison(in, 64);
204 CT::unpoison(out, 64);
205 CT::unpoison(EK, 52);
206 }
207
208}
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:77
fe T
Definition ge.cpp:37
fe X
Definition ge.cpp:27
void poison(const T *p, size_t n)
Definition ct_utils.h:48
void unpoison(const T *p, size_t n)
Definition ct_utils.h:59