Botan 2.19.3
Crypto and TLS for C++
sm4_armv8.cpp
Go to the documentation of this file.
1/*
2* (C) 2018 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/sm4.h>
8#include <arm_neon.h>
9
10namespace Botan {
11
12namespace {
13
/*
* Byte-index tables for the vqtbl1q_u8 table lookup: byte i of the
* result is taken from byte tbl[i] of the source vector.
*/

// Reverses the order of the four 32-bit words; bytes inside each word stay put
const uint8_t qswap_tbl[16] = {
   12, 13, 14, 15,
    8,  9, 10, 11,
    4,  5,  6,  7,
    0,  1,  2,  3
};

// Reverses all 16 bytes: word order is reversed and each word is byte swapped
const uint8_t bswap_tbl[16] = {
   15, 14, 13, 12,
   11, 10,  9,  8,
    7,  6,  5,  4,
    3,  2,  1,  0
};
21
/*
* Reverse the order of the four 32-bit lanes of B (lane 0 <-> lane 3,
* lane 1 <-> lane 2) by a byte table lookup through qswap_tbl.
* The bytes within each lane are not reordered.
*/
inline uint32x4_t qswap_32(uint32x4_t B)
   {
   const uint8x16_t lane_reverse = vld1q_u8(qswap_tbl);
   const uint8x16_t as_bytes = vreinterpretq_u8_u32(B);
   const uint8x16_t shuffled = vqtbl1q_u8(as_bytes, lane_reverse);
   return vreinterpretq_u32_u8(shuffled);
   }
/*
* Byte swap each of the four 32-bit lanes of B independently.
* The order of the lanes themselves is unchanged.
*/
inline uint32x4_t bswap_32(uint32x4_t B)
   {
   const uint8x16_t as_bytes = vreinterpretq_u8_u32(B);
   const uint8x16_t swapped = vrev32q_u8(as_bytes);
   return vreinterpretq_u32_u8(swapped);
   }
/*
* Reverse the order of the four 32-bit lanes of B and also byte swap
* each lane, i.e. reverse all 16 bytes of the vector.
*
* Gives the same result as bswap_32(qswap_32(B)) but needs only a
* single table lookup (through bswap_tbl).
*/
inline uint32x4_t bqswap_32(uint32x4_t B)
   {
   const uint8x16_t full_reverse = vld1q_u8(bswap_tbl);
   const uint8x16_t as_bytes = vreinterpretq_u8_u32(B);
   const uint8x16_t shuffled = vqtbl1q_u8(as_bytes, full_reverse);
   return vreinterpretq_u32_u8(shuffled);
   }
40
/*
* Apply one vsm4eq_u32 step with the round-key vector RKV to each of the
* four independent states S0..S3, updating them in place.
*
* Kept as a macro (rather than a helper function) so the intrinsic is
* expanded directly inside the functions that carry the SM4 target
* attribute. Arguments must be plain lvalue names.
*/
#define SM4_E(S0, S1, S2, S3, RKV) do {   \
   S0 = vsm4eq_u32(S0, RKV);              \
   S1 = vsm4eq_u32(S1, RKV);              \
   S2 = vsm4eq_u32(S2, RKV);              \
   S3 = vsm4eq_u32(S3, RKV);              \
   } while(0)
47
48}
49
/*
* SM4 encryption using the ARMv8 SM4 extension (vsm4eq_u32 intrinsic).
*
* Reads `blocks` 16-byte blocks from input8 and writes the same number
* of 16-byte blocks to output8. Blocks are processed four at a time
* while at least four remain; any leftover (0 to 3) blocks are then
* processed one at a time.
*/
void BOTAN_FUNC_ISA("arch=armv8.2-a+sm4")
SM4::sm4_armv8_encrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
   {
   // The 32 round-key words, held as eight vectors of four words each
   const uint32x4_t RK0 = vld1q_u32(&m_RK[ 0]);
   const uint32x4_t RK1 = vld1q_u32(&m_RK[ 4]);
   const uint32x4_t RK2 = vld1q_u32(&m_RK[ 8]);
   const uint32x4_t RK3 = vld1q_u32(&m_RK[12]);
   const uint32x4_t RK4 = vld1q_u32(&m_RK[16]);
   const uint32x4_t RK5 = vld1q_u32(&m_RK[20]);
   const uint32x4_t RK6 = vld1q_u32(&m_RK[24]);
   const uint32x4_t RK7 = vld1q_u32(&m_RK[28]);

   // View the byte buffers as 32-bit words for the NEON load/store intrinsics
   const uint32_t* in = static_cast<const uint32_t*>(static_cast<const void*>(input8));
   uint32_t* out = static_cast<uint32_t*>(static_cast<void*>(output8));

   // Main loop: four independent blocks per iteration
   for(; blocks >= 4; blocks -= 4)
      {
      // Load each block and byte swap its four words
      uint32x4_t S0 = bswap_32(vld1q_u32(in +  0));
      uint32x4_t S1 = bswap_32(vld1q_u32(in +  4));
      uint32x4_t S2 = bswap_32(vld1q_u32(in +  8));
      uint32x4_t S3 = bswap_32(vld1q_u32(in + 12));

      // Round keys are consumed in forward order for encryption
      SM4_E(S0, S1, S2, S3, RK0);
      SM4_E(S0, S1, S2, S3, RK1);
      SM4_E(S0, S1, S2, S3, RK2);
      SM4_E(S0, S1, S2, S3, RK3);
      SM4_E(S0, S1, S2, S3, RK4);
      SM4_E(S0, S1, S2, S3, RK5);
      SM4_E(S0, S1, S2, S3, RK6);
      SM4_E(S0, S1, S2, S3, RK7);

      // Reverse the word order and byte swap each word before storing
      vst1q_u32(out +  0, bqswap_32(S0));
      vst1q_u32(out +  4, bqswap_32(S1));
      vst1q_u32(out +  8, bqswap_32(S2));
      vst1q_u32(out + 12, bqswap_32(S3));

      in += 16;
      out += 16;
      }

   // Tail: fewer than four blocks remain here
   while(blocks != 0)
      {
      uint32x4_t S = bswap_32(vld1q_u32(in));

      S = vsm4eq_u32(S, RK0);
      S = vsm4eq_u32(S, RK1);
      S = vsm4eq_u32(S, RK2);
      S = vsm4eq_u32(S, RK3);
      S = vsm4eq_u32(S, RK4);
      S = vsm4eq_u32(S, RK5);
      S = vsm4eq_u32(S, RK6);
      S = vsm4eq_u32(S, RK7);

      vst1q_u32(out, bqswap_32(S));

      in += 4;
      out += 4;
      --blocks;
      }
   }
110
/*
* SM4 decryption using the ARMv8 SM4 extension (vsm4eq_u32 intrinsic).
*
* Same structure as the encryption routine, but with the round keys
* applied in reverse: each key vector has its four words reversed with
* qswap_32 when loaded, and the vectors are then used from last to first.
*
* Reads `blocks` 16-byte blocks from input8 and writes the same number
* of 16-byte blocks to output8. Blocks are processed four at a time
* while at least four remain; any leftover (0 to 3) blocks are then
* processed one at a time.
*/
void BOTAN_FUNC_ISA("arch=armv8.2-a+sm4")
SM4::sm4_armv8_decrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
   {
   // Each vector holds four round-key words in reversed order
   const uint32x4_t RK0 = qswap_32(vld1q_u32(&m_RK[ 0]));
   const uint32x4_t RK1 = qswap_32(vld1q_u32(&m_RK[ 4]));
   const uint32x4_t RK2 = qswap_32(vld1q_u32(&m_RK[ 8]));
   const uint32x4_t RK3 = qswap_32(vld1q_u32(&m_RK[12]));
   const uint32x4_t RK4 = qswap_32(vld1q_u32(&m_RK[16]));
   const uint32x4_t RK5 = qswap_32(vld1q_u32(&m_RK[20]));
   const uint32x4_t RK6 = qswap_32(vld1q_u32(&m_RK[24]));
   const uint32x4_t RK7 = qswap_32(vld1q_u32(&m_RK[28]));

   // View the byte buffers as 32-bit words for the NEON load/store intrinsics
   const uint32_t* in = static_cast<const uint32_t*>(static_cast<const void*>(input8));
   uint32_t* out = static_cast<uint32_t*>(static_cast<void*>(output8));

   // Main loop: four independent blocks per iteration
   for(; blocks >= 4; blocks -= 4)
      {
      // Load each block and byte swap its four words
      uint32x4_t S0 = bswap_32(vld1q_u32(in +  0));
      uint32x4_t S1 = bswap_32(vld1q_u32(in +  4));
      uint32x4_t S2 = bswap_32(vld1q_u32(in +  8));
      uint32x4_t S3 = bswap_32(vld1q_u32(in + 12));

      // Key vectors are consumed from last to first for decryption
      SM4_E(S0, S1, S2, S3, RK7);
      SM4_E(S0, S1, S2, S3, RK6);
      SM4_E(S0, S1, S2, S3, RK5);
      SM4_E(S0, S1, S2, S3, RK4);
      SM4_E(S0, S1, S2, S3, RK3);
      SM4_E(S0, S1, S2, S3, RK2);
      SM4_E(S0, S1, S2, S3, RK1);
      SM4_E(S0, S1, S2, S3, RK0);

      // Reverse the word order and byte swap each word before storing
      vst1q_u32(out +  0, bqswap_32(S0));
      vst1q_u32(out +  4, bqswap_32(S1));
      vst1q_u32(out +  8, bqswap_32(S2));
      vst1q_u32(out + 12, bqswap_32(S3));

      in += 16;
      out += 16;
      }

   // Tail: fewer than four blocks remain here
   while(blocks != 0)
      {
      uint32x4_t S = bswap_32(vld1q_u32(in));

      S = vsm4eq_u32(S, RK7);
      S = vsm4eq_u32(S, RK6);
      S = vsm4eq_u32(S, RK5);
      S = vsm4eq_u32(S, RK4);
      S = vsm4eq_u32(S, RK3);
      S = vsm4eq_u32(S, RK2);
      S = vsm4eq_u32(S, RK1);
      S = vsm4eq_u32(S, RK0);

      vst1q_u32(out, bqswap_32(S));

      in += 4;
      out += 4;
      --blocks;
      }
   }
171
172#undef SM4_E
173
174}
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:77
#define SM4_E(B0, B1, B2, B3, K)
Definition sm4_armv8.cpp:41