/*
 * prep(prep, XW0, XW1, XW2, XW3, K)
 *
 * Derives four new 32-bit words from four 128-bit vectors of earlier words
 * and stores the result in two places:
 *
 *   XW0        <- the four new words (XW0 is overwritten in place)
 *   (prep).u128 <- the four new words plus K, added per 32-bit lane
 *
 * Treating XW0..XW3 as sixteen consecutive words w[0..15] (lane 0 = lowest
 * 32 bits of XW0), each new word is
 *
 *   w[16 + i] = rol32(w[13 + i] ^ w[8 + i] ^ w[2 + i] ^ w[i], 1),  i = 0..3
 *
 * NOTE(review): this recurrence has the shape of the SHA-1 message schedule,
 * with K presumably a broadcast round constant -- confirm against the caller.
 *
 * Parameters:
 *   prep  lvalue with a `u128` member of type __m128i (written).
 *   XW0   __m128i lvalue; read, then overwritten with the new words.
 *   XW1   __m128i; only its low 64 bits are used.
 *   XW2   __m128i; read only.
 *   XW3   __m128i; its upper three lanes are used.
 *   K     __m128i added lane-wise to the result.
 *
 * Caveats: XW0 is evaluated more than once, so arguments must be free of
 * side effects; the macro declares locals named r0..r3, so callers must not
 * pass expressions that mention identifiers with those names.
 */
#define prep(prep, XW0, XW1, XW2, XW3, K)                               \
	do {                                                            \
		__m128i r0, r1, r2, r3;                                 \
									\
		/* r3 = { XW3[1], XW3[2], XW3[3], 0 }: lane 3 would */  \
		/* need new word 0, which does not exist yet.       */  \
		r3 = _mm_srli_si128((XW3), 4);                          \
		r0 = (XW0);                                             \
									\
		/* Swap the 64-bit halves of XW0 ... */                 \
		r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1,0,3,2));    \
									\
		/* ... so r1 = { XW0[2], XW0[3], XW1[0], XW1[1] }. */   \
		r1 = _mm_unpacklo_epi64(r1, (XW1));                     \
		r2 = (XW2);                                             \
									\
		/* XOR the four source vectors together. */             \
		r0 = _mm_xor_si128(r1, r0);                             \
		r2 = _mm_xor_si128(r3, r2);                             \
		r0 = _mm_xor_si128(r2, r0);                             \
									\
		/* Move un-rotated lane 0 into lane 3 (others zero); */ \
		/* it supplies the term that was missing from r3.    */ \
		r2 = _mm_slli_si128(r0, 12);                            \
		/* Rotate every lane left by 1: double it, then add  */ \
		/* back the sign bit (r1 is -1 where the top bit is  */ \
		/* set, so subtracting r1 adds 1).                   */ \
		r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());          \
		r0 = _mm_add_epi32(r0, r0);                             \
		r0 = _mm_sub_epi32(r0, r1);                             \
									\
		/* r3 ^ r2 = rol32(saved lane 0, 2): one rotation    */ \
		/* to form new word 0, one more for its use in lane 3.*/ \
		r3 = _mm_srli_epi32(r2, 30);                            \
		r2 = _mm_slli_epi32(r2, 2);                             \
									\
		r0 = _mm_xor_si128(r0, r3);                             \
		r0 = _mm_xor_si128(r0, r2);                             \
									\
		/* Publish the new words and the K-biased copy. */      \
		(XW0) = r0;                                             \
		(prep).u128 = _mm_add_epi32(r0, (K));                   \
	} while(0)