#include "crypto_core_multsntrup857.h"
#include "crypto_core_multsntrup857_ntt.h"
#include <immintrin.h>
#include <stdint.h>

/* auto-generated; do not edit */


/* Short aliases for the fixed-width integer types used throughout this file. */
typedef int8_t int8;
typedef int16_t int16;

/*
 * Accessors for the twiddle tables stored in the per-modulus constant block.
 *
 * zeta_<n>, zeta_x4_<n>, qinvzeta_<n> and qinvzeta_x4_<n> are #defined next
 * to the table data as offsets from an identifier named `qdata`, so every
 * macro below can only be used where a `qdata` pointer is in scope.
 *
 * Forward accessors: the i-th 16-lane vector of the named table.
 */
#define zeta(n,i) (((__m256i *) zeta_##n)[(i)])
#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)])
#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)])
#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)])
/*
 * Reverse accessors: 16 consecutive int16 entries of the same tables, loaded
 * unaligned and returned in reversed order by _mm256_loadu_reverse16.
 *   plain tables: entries n/2-15-16*i .. n/2-16*i
 *   x4 tables:    entries 2*n-12-16*i .. 2*n+3-16*i
 * For i = 0 the window ends on the extra entries stored after each table.
 */
#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1)))
#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1)))
#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1)))
#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1)))

/*
 * Backing storage for one constant block: 93 vectors of 16 int16 each
 * (1488 values).  The __m256i member is not accessed in the code shown here;
 * its presence gives the union the alignment of __m256i, so that `data` can
 * be addressed as an array of __m256i.
 */
typedef union {
    int16 data[93 * 16];
    __m256i _dummy;
} vec1488;

/*
 * Constant block used when the transform is run with the 7681 table
 * (see PQCLEAN_NTRULPR857_AVX2_ntt512_7681).
 *
 * The block is consumed as an array of __m256i through a pointer named
 * `qdata`.  The #define directives interleaved with the data give each
 * section a name in terms of that pointer: `(qdata[k])` is the k-th vector,
 * `(qdata+k)` is a table starting at vector k.  They are ordinary
 * preprocessor directives and are not part of the initializer; they are
 * placed here only so each name sits next to the data it refers to.
 *
 * Layout (vector index : contents):
 *    0..8   broadcast constants: q, qrecip, qshift, then zeta4, zeta8 and
 *           zetainv8, each followed by its "_qinv" companion
 *    9..   twiddle tables zeta_x4_16, zeta_x4_32, zeta_64, zeta_128,
 *           zeta_256, zeta_512, each immediately followed by the matching
 *           qinvzeta table
 *
 * Every twiddle table ends with one extra vector whose leading entry (four
 * entries for the x4 tables) is the negation of the table's first entry,
 * padded with zeros.  The zetainv* macros read backwards starting from
 * those extra entries.
 *
 * NOTE(review): each qinvzeta entry appears to be the corresponding zeta
 * entry multiplied by q^-1 modulo 2^16 (spot-checked on a few values);
 * confirm against the generator before relying on it.
 */
static const vec1488 qdata_7681 = { .data = {

#define q_x16 (qdata[0])
        7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681,

#define qrecip_x16 (qdata[1])
        17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474,

#define qshift_x16 (qdata[2])
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,

#define zeta4_x16 (qdata[3])
        -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777,

#define zeta4_x16_qinv (qdata[4])
        -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865,

#define zeta8_x16 (qdata[5])
        -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625,

#define zeta8_x16_qinv (qdata[6])
        -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425,

#define zetainv8_x16 (qdata[7])
        -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182,

#define zetainv8_x16_qinv (qdata[8])
        -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350,

#define zeta_x4_16 (qdata+9)
        -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100,
        -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696,
        3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_x4_16 (qdata+12)
        -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244,
        -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496,
        9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define zeta_x4_32 (qdata+15)
        -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495,
        -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250,
        -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834,
        3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121,
        3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_x4_32 (qdata+20)
        -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593,
        -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754,
        -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242,
        10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655,
        9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define zeta_64 (qdata+25)
        -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816,
        -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088,
        3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_64 (qdata+28)
        -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816,
        -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568,
        9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define zeta_128 (qdata+31)
        -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689,
        -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600,
        -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738,
        3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266,
        3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_128 (qdata+36)
        -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279,
        -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792,
        -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242,
        10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674,
        9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define zeta_256 (qdata+41)
        -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054,
        -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2,
        -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166,
        1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831,
        -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698,
        -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915,
        3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456,
        3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426,
        3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_256 (qdata+50)
        -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738,
        4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358,
        -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718,
        7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415,
        -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722,
        -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933,
        10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456,
        -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030,
        9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define zeta_512 (qdata+59)
        -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17,
        1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177,
        -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230,
        -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070,
        -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001,
        2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649,
        1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929,
        -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242,
        -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744,
        -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072,
        -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348,
        834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059,
        3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422,
        -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161,
        3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278,
        121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179,
        3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#define qinvzeta_512 (qdata+76)
        -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529,
        20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791,
        4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274,
        22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530,
        -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255,
        828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223,
        7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633,
        -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938,
        -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128,
        20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360,
        -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396,
        18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885,
        10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686,
        -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711,
        -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638,
        -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715,
        9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    }
};

/*
 * Constant block used when the transform is run with the 10753 table
 * (see PQCLEAN_NTRULPR857_AVX2_ntt512_10753).
 *
 * It is read through the same `qdata`-relative macros as qdata_7681, so its
 * sections must sit at the same vector offsets.  The labels below repeat the
 * macro name and vector index that address each section.
 */
static const vec1488 qdata_10753 = { .data = {

        /* q_x16: qdata[0] */
        10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753,

        /* qrecip_x16: qdata[1] */
        24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964,

        /* qshift_x16: qdata[2] */
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

        /* zeta4_x16: qdata[3] */
        223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223,

        /* zeta4_x16_qinv: qdata[4] */
        27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359,

        /* zeta8_x16: qdata[5] */
        4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188,

        /* zeta8_x16_qinv: qdata[6] */
        -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956,

        /* zetainv8_x16: qdata[7] */
        3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688,

        /* zetainv8_x16_qinv: qdata[8] */
        -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408,

        /* zeta_x4_16: qdata+9 */
        1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357,
        223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376,
        -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_x4_16: qdata+12 */
        -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517,
        27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856,
        6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* zeta_x4_32: qdata+15 */
        1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425,
        4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364,
        223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544,
        -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784,
        -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_x4_32: qdata+20 */
        -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345,
        -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508,
        27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224,
        408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072,
        6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* zeta_64: qdata+25 */
        1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213,
        223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629,
        -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_64: qdata+28 */
        -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683,
        27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363,
        6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* zeta_128: qdata+31 */
        1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544,
        4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576,
        223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085,
        -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189,
        -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_128: qdata+36 */
        -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840,
        -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152,
        27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573,
        408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707,
        6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* zeta_256: qdata+41 */
        1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635,
        2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234,
        4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472,
        357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337,
        223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970,
        -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123,
        -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467,
        -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864,
        -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_256: qdata+50 */
        -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141,
        10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558,
        -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200,
        28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895,
        27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246,
        -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037,
        408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555,
        -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248,
        6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* zeta_512: qdata+59 */
        1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053,
        -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73,
        2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782,
        425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605,
        4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670,
        -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927,
        357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113,
        -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529,
        223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403,
        730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107,
        -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834,
        -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334,
        -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720,
        -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891,
        -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111,
        3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487,
        -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* qinvzeta_512: qdata+76 */
        -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925,
        7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609,
        10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770,
        18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483,
        -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594,
        29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801,
        28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129,
        -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161,
        27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661,
        16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811,
        -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098,
        28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834,
        408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856,
        -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269,
        -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809,
        16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951,
        6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    }
};

/*
 * Lane-wise subtraction of sixteen 16-bit integers.
 * Returns lhs - rhs in every lane (plain, non-saturating subtract).
 */
static inline __m256i sub_x16(__m256i lhs, __m256i rhs) {
    const __m256i difference = _mm256_sub_epi16(lhs, rhs);
    return difference;
}

/*
 * Lane-wise addition of sixteen 16-bit integers.
 * Returns lhs + rhs in every lane (plain, non-saturating add).
 */
static inline __m256i add_x16(__m256i lhs, __m256i rhs) {
    const __m256i sum = _mm256_add_epi16(lhs, rhs);
    return sum;
}

static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) {
    __m256i y = _mm256_mulhi_epi16(x, qrecip_x16);
    y = _mm256_mulhrs_epi16(y, qshift_x16);
    y = _mm256_mullo_epi16(y, q_x16);
    return sub_x16(x, y);
}

/*
 * Lane-wise modular product in scaled form.
 *
 * Returns mulhi(x, y) - mulhi(mullo(x, yqinv), q) for each of the 16 lanes.
 *
 * qdata: constant block providing q_x16 (the parameter must keep this name:
 *        the macro expands to it).
 * x, y:  the factors.
 * yqinv: companion of y taken from the matching "_qinv" constant.
 *
 * NOTE(review): if yqinv is y * q^-1 modulo 2^16 (which the qinvzeta tables
 * appear to supply), then x*y and mullo(x, yqinv)*q agree in their low
 * 16 bits, so the result is (x*y - m*q) / 2^16, i.e. congruent to
 * x * y * 2^-16 modulo q.
 */
static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) {
    const __m256i product_hi = _mm256_mulhi_epi16(x, y);
    const __m256i m = _mm256_mullo_epi16(x, yqinv);
    const __m256i correction_hi = _mm256_mulhi_epi16(m, q_x16);

    return sub_x16(product_hi, correction_hi);
}

/*
 * 32 bytes of storage; the __m256i member gives the union the alignment of
 * __m256i so the bytes can be read back as one vector.
 */
typedef union {
    int8 data[32];
    __m256i _dummy;
} byte32;
/*
 * Byte-index pattern for _mm256_shuffle_epi8.  The same 16 indices are
 * given for both 128-bit lanes: output byte pair k is taken from input
 * byte pair 7-k, which reverses the order of the eight 16-bit words inside
 * each lane while keeping the two bytes of every word in order.
 */
static const byte32 shuffle_buf = { .data = {
        14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
        14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
    }
};
/* The pattern above viewed as a single __m256i. */
#define shuffle (*(__m256i *) shuffle_buf.data)

/*
 * Loads sixteen int16 values from p (no alignment required) and returns
 * them in reversed order: lane k of the result is element 15-k in memory.
 *
 * The reversal is done in two steps: swap the two 128-bit halves, then
 * reverse the eight 16-bit words inside each half using the `shuffle`
 * byte pattern.
 */
static inline __m256i _mm256_loadu_reverse16(const __m256i *p) {
    const __m256i loaded = _mm256_loadu_si256(p);
    const __m256i halves_swapped = _mm256_permute2x128_si256(loaded, loaded, 1);

    return _mm256_shuffle_epi8(halves_swapped, shuffle);
}

/*
 * Transforms `reps` consecutive blocks of 128 int16 coefficients in place,
 * starting at f.
 *
 * f:     coefficient array of at least 128 * reps int16 values; read and
 *        overwritten.  All accesses use unaligned loads/stores.
 * reps:  number of 128-coefficient blocks to process.
 * qdata: constant block (see qdata_7681 / qdata_10753) that the q_x16,
 *        zeta4_x16, zeta*() and zetainv*() macros index.  The parameter
 *        must keep this name because the macros expand to it.
 *
 * The work is done as three sweeps, each running over all blocks before the
 * next sweep starts.  Within a block every sweep treats the eight 16-lane
 * vectors as two groups of four, and for each group it
 *   - forms the sum and difference of two pairs of vectors,
 *   - multiplies one of the differences by zeta4,
 *   - adds/subtracts the intermediate results again, and
 *   - multiplies three of the four outputs by twiddle vectors and passes
 *     the fourth through reduce_x16.
 * The vectors grouped together (as int16 offsets into the block) are
 *   sweep 1: {0, 32, 64, 96} and {16, 48, 80, 112}
 *   sweep 2: {0, 16, 64, 80} and {32, 48, 96, 112}
 *   sweep 3: {0, 16, 32, 48} and {64, 80, 96, 112}
 * Sweeps 1 and 2 also interleave the four outputs of a group before storing
 * them; sweep 3 instead finishes with an add/sub step between its two
 * groups.
 */
static void ntt128(int16 *f, int reps, const __m256i *qdata) {
    __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3;
    int16 *origf = f;
    int rep;
    /* Reversed twiddle vectors are built once here instead of per block. */
    __m256i zetainv_128_0 = zetainv(128, 0);
    __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0);
    __m256i zetainv_x4_32_0 = zetainv_x4(32, 0);
    __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0);
    __m256i zetainv_128_1 = zetainv(128, 1);
    __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1);
    __m256i zetainv_x4_32_1 = zetainv_x4(32, 1);
    __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1);
    /* Sweep 1: groups {0,32,64,96} and {16,48,80,112}. */
    for (rep = 0; rep < reps; ++rep) {
        /* First group: vectors at offsets 0, 32, 64, 96. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 32));
        f3 = _mm256_loadu_si256((__m256i *) (f + 96));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 0));
        f2 = _mm256_loadu_si256((__m256i *) (f + 64));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        /* Difference half: twiddle with vector 0 of zeta_128 and its reversed counterpart. */
        f3 = sub_x16(g3, g2);
        f2 = add_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0);
        f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0));

        g2 = _mm256_unpacklo_epi16(f2, f3);
        g3 = _mm256_unpackhi_epi16(f2, f3);

        /* Sum half: one output is twiddled with zeta_64, the other only reduced. */
        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0));
        f0 = reduce_x16(qdata, f0);

        /*
         * Interleave the four outputs: 16-bit words first, then 32-bit
         * pairs, then recombine 128-bit halves across vectors.
         */
        g0 = _mm256_unpacklo_epi16(f0, f1);
        h0 = _mm256_unpacklo_epi32(g0, g2);
        h1 = _mm256_unpackhi_epi32(g0, g2);
        g1 = _mm256_unpackhi_epi16(f0, f1);
        h2 = _mm256_unpacklo_epi32(g1, g3);
        h3 = _mm256_unpackhi_epi32(g1, g3);
        f0 = _mm256_permute2x128_si256(h0, h1, 0x20);
        f2 = _mm256_permute2x128_si256(h0, h1, 0x31);
        f1 = _mm256_permute2x128_si256(h2, h3, 0x20);
        f3 = _mm256_permute2x128_si256(h2, h3, 0x31);

        _mm256_storeu_si256((__m256i *) (f + 0), f0);
        _mm256_storeu_si256((__m256i *) (f + 64), f2);
        _mm256_storeu_si256((__m256i *) (f + 32), f1);
        _mm256_storeu_si256((__m256i *) (f + 96), f3);

        /* Second group: vectors at offsets 16, 48, 80, 112; same steps with twiddle vector 1. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 48));
        f3 = _mm256_loadu_si256((__m256i *) (f + 112));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 16));
        f2 = _mm256_loadu_si256((__m256i *) (f + 80));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        f3 = sub_x16(g3, g2);
        f2 = add_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1);
        f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1));

        g2 = _mm256_unpacklo_epi16(f2, f3);
        g3 = _mm256_unpackhi_epi16(f2, f3);

        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1));
        f0 = reduce_x16(qdata, f0);

        g0 = _mm256_unpacklo_epi16(f0, f1);
        h0 = _mm256_unpacklo_epi32(g0, g2);
        h1 = _mm256_unpackhi_epi32(g0, g2);
        g1 = _mm256_unpackhi_epi16(f0, f1);
        h2 = _mm256_unpacklo_epi32(g1, g3);
        h3 = _mm256_unpackhi_epi32(g1, g3);
        f0 = _mm256_permute2x128_si256(h0, h1, 0x20);
        f2 = _mm256_permute2x128_si256(h0, h1, 0x31);
        f1 = _mm256_permute2x128_si256(h2, h3, 0x20);
        f3 = _mm256_permute2x128_si256(h2, h3, 0x31);

        _mm256_storeu_si256((__m256i *) (f + 16), f0);
        _mm256_storeu_si256((__m256i *) (f + 80), f2);
        _mm256_storeu_si256((__m256i *) (f + 48), f1);
        _mm256_storeu_si256((__m256i *) (f + 112), f3);

        f += 128;
    }
    /* Sweep 2: groups {0,16,64,80} and {32,48,96,112}, using the x4 twiddle tables. */
    f = origf;
    for (rep = 0; rep < reps; ++rep) {
        /* First group: vectors at offsets 0, 16, 64, 80. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 64));
        f3 = _mm256_loadu_si256((__m256i *) (f + 80));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 0));
        f2 = _mm256_loadu_si256((__m256i *) (f + 16));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        f3 = sub_x16(g3, g2);
        f2 = add_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0);
        f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0));

        g2 = _mm256_unpacklo_epi64(f2, f3);
        g3 = _mm256_unpackhi_epi64(f2, f3);

        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0));
        f0 = reduce_x16(qdata, f0);

        /* Interleave at 64-bit granularity, then recombine 128-bit halves. */
        g1 = _mm256_unpackhi_epi64(f0, f1);
        g0 = _mm256_unpacklo_epi64(f0, f1);
        f1 = _mm256_permute2x128_si256(g1, g3, 0x20);
        f3 = _mm256_permute2x128_si256(g1, g3, 0x31);
        f0 = _mm256_permute2x128_si256(g0, g2, 0x20);
        f2 = _mm256_permute2x128_si256(g0, g2, 0x31);

        _mm256_storeu_si256((__m256i *) (f + 64), f1);
        _mm256_storeu_si256((__m256i *) (f + 80), f3);
        _mm256_storeu_si256((__m256i *) (f + 0), f0);
        _mm256_storeu_si256((__m256i *) (f + 16), f2);

        /* Second group: vectors at offsets 32, 48, 96, 112; same steps with twiddle vector 1. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 96));
        f3 = _mm256_loadu_si256((__m256i *) (f + 112));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 32));
        f2 = _mm256_loadu_si256((__m256i *) (f + 48));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        f3 = sub_x16(g3, g2);
        f2 = add_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1);
        f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1));

        g2 = _mm256_unpacklo_epi64(f2, f3);
        g3 = _mm256_unpackhi_epi64(f2, f3);

        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1));
        f0 = reduce_x16(qdata, f0);

        g1 = _mm256_unpackhi_epi64(f0, f1);
        g0 = _mm256_unpacklo_epi64(f0, f1);
        f1 = _mm256_permute2x128_si256(g1, g3, 0x20);
        f3 = _mm256_permute2x128_si256(g1, g3, 0x31);
        f0 = _mm256_permute2x128_si256(g0, g2, 0x20);
        f2 = _mm256_permute2x128_si256(g0, g2, 0x31);

        _mm256_storeu_si256((__m256i *) (f + 96), f1);
        _mm256_storeu_si256((__m256i *) (f + 112), f3);
        _mm256_storeu_si256((__m256i *) (f + 32), f0);
        _mm256_storeu_si256((__m256i *) (f + 48), f2);

        f += 128;
    }
    /* Sweep 3: groups {0,16,32,48} and {64,80,96,112}, then combine the two groups. */
    f = origf;
    for (rep = 0; rep < reps; ++rep) {

        /* First group: only broadcast constants are used, no per-lane twiddle tables. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 16));
        f3 = _mm256_loadu_si256((__m256i *) (f + 48));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 0));
        f2 = _mm256_loadu_si256((__m256i *) (f + 32));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        /* Note the operand order: here f3 = g2 - g3, unlike the g3 - g2 used elsewhere. */
        f2 = add_x16(g2, g3);
        f3 = sub_x16(g2, g3);
        f2 = reduce_x16(qdata, f2);
        f3 = reduce_x16(qdata, f3);

        /* f1 is kept as the raw difference; it is neither multiplied nor reduced here. */
        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f0 = reduce_x16(qdata, f0);

        /* Park the first group's outputs while the second group is computed. */
        h0 = f0;
        h1 = f1;
        h2 = f2;
        h3 = f3;

        /* Second group: outputs are multiplied by zetainv8, zeta8 and zeta4 respectively. */
        f1 = _mm256_loadu_si256((__m256i *) (f + 80));
        f3 = _mm256_loadu_si256((__m256i *) (f + 112));
        g3 = sub_x16(f1, f3);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f1, f3);

        f0 = _mm256_loadu_si256((__m256i *) (f + 64));
        f2 = _mm256_loadu_si256((__m256i *) (f + 96));
        g2 = sub_x16(f0, f2);
        g0 = add_x16(f0, f2);

        f3 = sub_x16(g3, g2);
        f2 = add_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv);
        f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv);

        f1 = sub_x16(g0, g1);
        f0 = add_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv);
        f0 = reduce_x16(qdata, f0);

        /* Final combine: first half of the block gets h + f, second half gets h - f. */
        g0 = add_x16(h0, f0);
        g1 = add_x16(h1, f1);
        g2 = add_x16(h2, f2);
        g3 = add_x16(h3, f3);
        _mm256_storeu_si256((__m256i *) (f + 0), g0);
        _mm256_storeu_si256((__m256i *) (f + 16), g1);
        _mm256_storeu_si256((__m256i *) (f + 32), g2);
        _mm256_storeu_si256((__m256i *) (f + 48), g3);
        g0 = sub_x16(h0, f0);
        g1 = sub_x16(h1, f1);
        g2 = sub_x16(h2, f2);
        g3 = sub_x16(h3, f3);
        _mm256_storeu_si256((__m256i *) (f + 64), g0);
        _mm256_storeu_si256((__m256i *) (f + 80), g1);
        _mm256_storeu_si256((__m256i *) (f + 96), g2);
        _mm256_storeu_si256((__m256i *) (f + 112), g3);
        f += 128;
    }
}

/*
 * In-place forward transform step over `reps` consecutive blocks of 512
 * int16 coefficients starting at `f`.
 *
 * For every block, each of the eight 16-lane columns is combined across
 * the four 128-element quarters (offsets 0, 128, 256, 384) with one
 * four-input add/sub butterfly and twiddle multiplications; afterwards the
 * whole buffer is handed to ntt128 as 4 * reps blocks of 128 coefficients.
 *
 * f     - coefficient buffer, read and overwritten (reps * 512 int16 values)
 * reps  - number of 512-coefficient blocks to process
 * qdata - constant table; the zeta4_x16 / zeta(...) / zetainv(...) macros
 *         index into it, so the parameter name must stay `qdata`
 */
static void ntt512(int16 *f, int reps, const __m256i *qdata) {
    __m256i inv_tw[8];
    __m256i inv_tw_qinv[8];
    int16 *blk = f;
    int r;
    int k;

    /* The reversed twiddle vectors do not depend on the data: fetch them once. */
    for (k = 0; k < 8; ++k) {
        inv_tw[k] = zetainv(512, k);
        inv_tw_qinv[k] = zetainv_qinv(512, k);
    }

    for (r = 0; r < reps; ++r, blk += 512) {
        for (k = 0; k < 8; ++k) {
            int16 *lane = blk + 16 * k;
            __m256i a0, a1, a2, a3;
            __m256i even_sum, even_diff, odd_sum, odd_diff;
            __m256i out0, out1, out2, out3;

            a0 = _mm256_loadu_si256((__m256i *) (lane));
            a1 = _mm256_loadu_si256((__m256i *) (lane + 128));
            a2 = _mm256_loadu_si256((__m256i *) (lane + 256));
            a3 = _mm256_loadu_si256((__m256i *) (lane + 384));

            /* First add/sub layer: quarters (0, 256) and (128, 384). */
            even_sum = add_x16(a0, a2);
            even_diff = sub_x16(a0, a2);
            odd_sum = add_x16(a1, a3);
            odd_diff = mulmod_x16_scaled(qdata, sub_x16(a1, a3), zeta4_x16, zeta4_x16_qinv);

            /* Second add/sub layer, each output scaled by its own twiddle. */
            out0 = reduce_x16(qdata, add_x16(even_sum, odd_sum));
            out1 = mulmod_x16_scaled(qdata, sub_x16(even_sum, odd_sum), zeta(256, k), zeta_qinv(256, k));
            out2 = mulmod_x16_scaled(qdata, add_x16(even_diff, odd_diff), zeta(512, k), zeta_qinv(512, k));
            out3 = mulmod_x16_scaled(qdata, sub_x16(odd_diff, even_diff), inv_tw[k], inv_tw_qinv[k]);

            _mm256_storeu_si256((__m256i *) (lane + 384), out3);
            _mm256_storeu_si256((__m256i *) (lane + 256), out2);
            _mm256_storeu_si256((__m256i *) (lane + 128), out1);
            _mm256_storeu_si256((__m256i *) (lane), out0);
        }
    }

    /* Remaining layers: every 512-block is four independent 128-blocks. */
    ntt128(f, reps * 4, qdata);
}

/*
 * Public entry point: runs ntt512 in place on `reps` blocks of 512 int16
 * coefficients at `f`, using the qdata_7681 constant table (its first
 * vector holds 7681 in every lane).
 */
void PQCLEAN_NTRULPR857_AVX2_ntt512_7681(int16 *f, int reps) {
    const __m256i *const table = (const __m256i *) qdata_7681.data;

    ntt512(f, reps, table);
}

/*
 * Public entry point: runs ntt512 in place on `reps` blocks of 512 int16
 * coefficients at `f`, using the qdata_10753 constant table.
 */
void PQCLEAN_NTRULPR857_AVX2_ntt512_10753(int16 *f, int reps) {
    const __m256i *const table = (const __m256i *) qdata_10753.data;

    ntt512(f, reps, table);
}

/*
 * In-place transform over `reps` consecutive blocks of 128 int16
 * coefficients starting at `f` (by its name, presumably the inverse
 * counterpart of ntt128 -- confirm against that function).
 *
 * The work is split into three passes; each pass walks all `reps` blocks
 * before the next pass starts, and `f` is rewound to the start of the
 * buffer between passes:
 *   pass 1: three add/sub layers over all eight 16-lane vectors of a block;
 *   pass 2: 64-bit/128-bit lane shuffle, then twiddle multiply + butterflies
 *           using the "x4" twiddle tables (sizes 16 and 32);
 *   pass 3: 16-bit/64-bit/128-bit lane shuffle, then twiddle multiply +
 *           butterflies using the size-64 and size-128 twiddle tables.
 *
 * f     - coefficient buffer, read and overwritten (reps * 128 int16 values)
 * reps  - number of 128-coefficient blocks to process
 * qdata - constant table; the zeta4_x16 / zeta8_x16 / zeta(...) /
 *         zetainv(...) macros index into it
 */
static void invntt128(int16 *f, int reps, const __m256i *qdata) {
    __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3;
    /* Start of the buffer, so each pass can restart from the first block. */
    int16 *origf = f;
    int rep;
    /* Reversed twiddle vectors are data-independent: load them once up front.
       The _0 set is used for the first half of each block in passes 2 and 3,
       the _1 set for the second half. */
    __m256i zetainv_x4_16_0 = zetainv_x4(16, 0);
    __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0);
    __m256i zetainv_x4_32_0 = zetainv_x4(32, 0);
    __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0);
    __m256i zetainv_64_0 = zetainv(64, 0);
    __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0);
    __m256i zetainv_128_0 = zetainv(128, 0);
    __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0);
    __m256i zetainv_x4_16_1 = zetainv_x4(16, 1);
    __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1);
    __m256i zetainv_x4_32_1 = zetainv_x4(32, 1);
    __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1);
    __m256i zetainv_64_1 = zetainv(64, 1);
    __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1);
    __m256i zetainv_128_1 = zetainv(128, 1);
    __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1);
    /* ---- Pass 1: three add/sub layers on all eight vectors of each block ---- */
    for (rep = 0; rep < reps; ++rep) {
        /* Vectors are loaded as the pairs (0,64), (16,80), (32,96), (48,112). */
        f0 = _mm256_loadu_si256((__m256i *) (f +   0));
        f1 = _mm256_loadu_si256((__m256i *) (f +  64));
        f2 = _mm256_loadu_si256((__m256i *) (f +  16));
        f3 = _mm256_loadu_si256((__m256i *) (f +  80));
        g0 = _mm256_loadu_si256((__m256i *) (f +  32));
        g1 = _mm256_loadu_si256((__m256i *) (f +  96));
        g2 = _mm256_loadu_si256((__m256i *) (f +  48));
        g3 = _mm256_loadu_si256((__m256i *) (f + 112));

        /* Layer 1: sum and difference of each loaded pair; the differences
           are reduced or multiplied by a fixed twiddle (zeta4 / zetainv8 / zeta8). */
        h1 = sub_x16(f0, f1);
        h1 = reduce_x16(qdata, h1);
        h0 = add_x16(f0, f1);
        h3 = sub_x16(f2, f3);
        h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv);
        h2 = add_x16(f2, f3);
        f1 = sub_x16(g0, g1);
        f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv);
        f0 = add_x16(g0, g1);
        f3 = sub_x16(g2, g3);
        f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv);
        f2 = add_x16(g2, g3);

        /* Layer 2: combine (h0,h2), (h1,h3), (f0,f2), (f1,f3). */
        g0 = add_x16(h0, h2);
        g0 = reduce_x16(qdata, g0);
        g2 = sub_x16(h0, h2);
        g2 = reduce_x16(qdata, g2);
        g1 = sub_x16(h1, h3);
        g3 = add_x16(h1, h3);
        h2 = sub_x16(f0, f2);
        h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv);
        h0 = add_x16(f0, f2);
        h3 = add_x16(f1, f3);
        h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv);
        h1 = sub_x16(f1, f3);

        /* Layer 3: combine each g_k with the matching h_k. */
        f0 = add_x16(g0, h0);
        g0 = sub_x16(g0, h0);
        f1 = add_x16(g1, h1);
        g1 = sub_x16(g1, h1);
        f2 = sub_x16(g2, h2);
        g2 = add_x16(g2, h2);
        f3 = sub_x16(g3, h3);
        g3 = add_x16(g3, h3);

        /* Write all eight results back into the same block. */
        _mm256_storeu_si256((__m256i *) (f +   0), f0);
        _mm256_storeu_si256((__m256i *) (f +  32), g0);
        _mm256_storeu_si256((__m256i *) (f +  64), f1);
        _mm256_storeu_si256((__m256i *) (f +  96), g1);
        _mm256_storeu_si256((__m256i *) (f +  16), f2);
        _mm256_storeu_si256((__m256i *) (f +  48), g2);
        _mm256_storeu_si256((__m256i *) (f +  80), f3);
        _mm256_storeu_si256((__m256i *) (f + 112), g3);

        f += 128;
    }
    /* ---- Pass 2: 64-bit lane shuffle + butterflies with the x4 twiddles ---- */
    f = origf;
    for (rep = 0; rep < reps; ++rep) {
        /* First half of the block: vectors at offsets 0, 64, 16, 80. */
        f0 = _mm256_loadu_si256((__m256i *) (f + 0));
        f1 = _mm256_loadu_si256((__m256i *) (f + 64));
        f2 = _mm256_loadu_si256((__m256i *) (f + 16));
        f3 = _mm256_loadu_si256((__m256i *) (f + 80));

        /* Regroup the 64-bit quarters and 128-bit halves of the four vectors. */
        g0 = _mm256_unpacklo_epi64(f0, f1);
        g1 = _mm256_unpacklo_epi64(f2, f3);
        g2 = _mm256_unpackhi_epi64(f0, f1);
        g3 = _mm256_unpackhi_epi64(f2, f3);
        f2 = _mm256_permute2x128_si256(g0, g1, 0x31);
        f3 = _mm256_permute2x128_si256(g2, g3, 0x31);
        f0 = _mm256_permute2x128_si256(g0, g1, 0x20);
        f1 = _mm256_permute2x128_si256(g2, g3, 0x20);

        /* Twiddle multiplications (index 0 of the x4 tables), then two
           add/sub layers. */
        f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0);
        f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0));

        g3 = add_x16(f3, f2);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g2 = sub_x16(f3, f2);

        f0 = reduce_x16(qdata, f0);
        f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0);

        g1 = add_x16(f0, f1);
        g0 = sub_x16(f0, f1);

        f1 = add_x16(g1, g3);
        f3 = sub_x16(g1, g3);
        f0 = add_x16(g0, g2);
        f2 = sub_x16(g0, g2);

        _mm256_storeu_si256((__m256i *) (f + 64), f1);
        _mm256_storeu_si256((__m256i *) (f + 80), f3);
        _mm256_storeu_si256((__m256i *) (f + 0), f0);
        _mm256_storeu_si256((__m256i *) (f + 16), f2);

        /* Second half of the block: vectors at offsets 32, 96, 48, 112,
           same sequence with index 1 of the x4 tables. */
        f0 = _mm256_loadu_si256((__m256i *) (f + 32));
        f1 = _mm256_loadu_si256((__m256i *) (f + 96));
        f2 = _mm256_loadu_si256((__m256i *) (f + 48));
        f3 = _mm256_loadu_si256((__m256i *) (f + 112));

        g0 = _mm256_unpacklo_epi64(f0, f1);
        g1 = _mm256_unpacklo_epi64(f2, f3);
        g2 = _mm256_unpackhi_epi64(f0, f1);
        g3 = _mm256_unpackhi_epi64(f2, f3);
        f2 = _mm256_permute2x128_si256(g0, g1, 0x31);
        f3 = _mm256_permute2x128_si256(g2, g3, 0x31);
        f0 = _mm256_permute2x128_si256(g0, g1, 0x20);
        f1 = _mm256_permute2x128_si256(g2, g3, 0x20);

        f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1);
        f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1));

        g3 = add_x16(f3, f2);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g2 = sub_x16(f3, f2);

        f0 = reduce_x16(qdata, f0);
        f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1);

        g1 = add_x16(f0, f1);
        g0 = sub_x16(f0, f1);

        f1 = add_x16(g1, g3);
        f3 = sub_x16(g1, g3);
        f0 = add_x16(g0, g2);
        f2 = sub_x16(g0, g2);

        _mm256_storeu_si256((__m256i *) (f + 96), f1);
        _mm256_storeu_si256((__m256i *) (f + 112), f3);
        _mm256_storeu_si256((__m256i *) (f + 32), f0);
        _mm256_storeu_si256((__m256i *) (f + 48), f2);

        f += 128;
    }
    /* ---- Pass 3: 16-bit lane shuffle + butterflies with the 64/128 twiddles ---- */
    f = origf;
    for (rep = 0; rep < reps; ++rep) {
        /* First half of the block: vectors at offsets 0, 64, 32, 96. */
        f0 = _mm256_loadu_si256((__m256i *) (f + 0));
        f2 = _mm256_loadu_si256((__m256i *) (f + 64));
        f1 = _mm256_loadu_si256((__m256i *) (f + 32));
        f3 = _mm256_loadu_si256((__m256i *) (f + 96));

        /* Regroup 128-bit halves, then interleave at 16-bit granularity
           (twice), then at 64-bit granularity. */
        g0 = _mm256_permute2x128_si256(f0, f2, 0x20);
        g2 = _mm256_permute2x128_si256(f0, f2, 0x31);
        f0 = _mm256_unpacklo_epi16(g0, g2);
        f2 = _mm256_unpackhi_epi16(g0, g2);
        g1 = _mm256_permute2x128_si256(f1, f3, 0x20);
        g3 = _mm256_permute2x128_si256(f1, f3, 0x31);
        f1 = _mm256_unpacklo_epi16(g1, g3);
        f3 = _mm256_unpackhi_epi16(g1, g3);
        g1 = _mm256_unpackhi_epi16(f0, f2);
        g0 = _mm256_unpacklo_epi16(f0, f2);
        g3 = _mm256_unpackhi_epi16(f1, f3);
        g2 = _mm256_unpacklo_epi16(f1, f3);
        f2 = _mm256_unpacklo_epi64(g1, g3);
        f3 = _mm256_unpackhi_epi64(g1, g3);
        f0 = _mm256_unpacklo_epi64(g0, g2);
        f1 = _mm256_unpackhi_epi64(g0, g2);

        /* Twiddle multiplications (index 0 of the size-64/128 tables), then
           two add/sub layers. */
        f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0);
        f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0));
        f0 = reduce_x16(qdata, f0);
        f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0);

        g3 = add_x16(f3, f2);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f0, f1);
        g2 = sub_x16(f3, f2);
        g0 = sub_x16(f0, f1);

        f1 = add_x16(g1, g3);
        f3 = sub_x16(g1, g3);
        f0 = add_x16(g0, g2);
        f2 = sub_x16(g0, g2);

        _mm256_storeu_si256((__m256i *) (f + 32), f1);
        _mm256_storeu_si256((__m256i *) (f + 96), f3);
        _mm256_storeu_si256((__m256i *) (f + 0), f0);
        _mm256_storeu_si256((__m256i *) (f + 64), f2);

        /* Second half of the block: vectors at offsets 16, 80, 48, 112,
           same sequence with index 1 of the size-64/128 tables. */
        f0 = _mm256_loadu_si256((__m256i *) (f + 16));
        f2 = _mm256_loadu_si256((__m256i *) (f + 80));
        f1 = _mm256_loadu_si256((__m256i *) (f + 48));
        f3 = _mm256_loadu_si256((__m256i *) (f + 112));

        g0 = _mm256_permute2x128_si256(f0, f2, 0x20);
        g2 = _mm256_permute2x128_si256(f0, f2, 0x31);
        f0 = _mm256_unpacklo_epi16(g0, g2);
        f2 = _mm256_unpackhi_epi16(g0, g2);
        g1 = _mm256_permute2x128_si256(f1, f3, 0x20);
        g3 = _mm256_permute2x128_si256(f1, f3, 0x31);
        f1 = _mm256_unpacklo_epi16(g1, g3);
        f3 = _mm256_unpackhi_epi16(g1, g3);
        g1 = _mm256_unpackhi_epi16(f0, f2);
        g0 = _mm256_unpacklo_epi16(f0, f2);
        g3 = _mm256_unpackhi_epi16(f1, f3);
        g2 = _mm256_unpacklo_epi16(f1, f3);
        f2 = _mm256_unpacklo_epi64(g1, g3);
        f3 = _mm256_unpackhi_epi64(g1, g3);
        f0 = _mm256_unpacklo_epi64(g0, g2);
        f1 = _mm256_unpackhi_epi64(g0, g2);

        f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1);
        f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1));
        f0 = reduce_x16(qdata, f0);
        f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1);

        g3 = add_x16(f3, f2);
        g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv);
        g1 = add_x16(f0, f1);
        g2 = sub_x16(f3, f2);
        g0 = sub_x16(f0, f1);

        f1 = add_x16(g1, g3);
        f3 = sub_x16(g1, g3);
        f0 = add_x16(g0, g2);
        f2 = sub_x16(g0, g2);

        _mm256_storeu_si256((__m256i *) (f + 48), f1);
        _mm256_storeu_si256((__m256i *) (f + 112), f3);
        _mm256_storeu_si256((__m256i *) (f + 16), f0);
        _mm256_storeu_si256((__m256i *) (f + 80), f2);

        f += 128;
    }
}

/*
 * In-place inverse-direction transform step over `reps` consecutive blocks
 * of 512 int16 coefficients starting at `f`.
 *
 * The buffer is first processed by invntt128 as 4 * reps blocks of 128
 * coefficients. Then, for every 512-block, each of the eight 16-lane
 * columns is combined across the four 128-element quarters (offsets 0,
 * 128, 256, 384): three quarters are multiplied by a twiddle, quarter 0 is
 * reduced, and a four-input add/sub butterfly follows.
 *
 * f     - coefficient buffer, read and overwritten (reps * 512 int16 values)
 * reps  - number of 512-coefficient blocks to process
 * qdata - constant table; the zeta4_x16 / zeta(...) / zetainv(...) macros
 *         index into it, so the parameter name must stay `qdata`
 */
static void invntt512(int16 *f, int reps, const __m256i *qdata) {
    __m256i inv512[8];
    __m256i inv512_qinv[8];
    __m256i inv256[8];
    __m256i inv256_qinv[8];
    int r;
    int k;

    /* The reversed twiddle vectors do not depend on the data: fetch them once. */
    for (k = 0; k < 8; ++k) {
        inv512[k] = zetainv(512, k);
        inv512_qinv[k] = zetainv_qinv(512, k);
        inv256[k] = zetainv(256, k);
        inv256_qinv[k] = zetainv_qinv(256, k);
    }

    invntt128(f, 4 * reps, qdata);

    for (r = 0; r < reps; ++r, f += 512) {
        for (k = 0; k < 8; ++k) {
            int16 *lane = f + 16 * k;
            __m256i q0, q1, q2, q3;
            __m256i lo_sum, lo_diff, hi_sum, hi_diff;

            q0 = _mm256_loadu_si256((__m256i *) (lane + 0));
            q1 = _mm256_loadu_si256((__m256i *) (lane + 128));
            q2 = _mm256_loadu_si256((__m256i *) (lane + 256));
            q3 = _mm256_loadu_si256((__m256i *) (lane + 384));

            /* Upper quarters: twiddle each, then sum (times zeta4) and difference. */
            q2 = mulmod_x16_scaled(qdata, q2, inv512[k], inv512_qinv[k]);
            q3 = mulmod_x16_scaled(qdata, q3, zeta(512, k), zeta_qinv(512, k));
            hi_sum = mulmod_x16_scaled(qdata, add_x16(q3, q2), zeta4_x16, zeta4_x16_qinv);
            hi_diff = sub_x16(q3, q2);

            /* Lower quarters: reduce / twiddle, then sum and difference. */
            q0 = reduce_x16(qdata, q0);
            q1 = mulmod_x16_scaled(qdata, q1, inv256[k], inv256_qinv[k]);
            lo_sum = add_x16(q0, q1);
            lo_diff = sub_x16(q0, q1);

            /* Final layer, written straight back to the four quarters. */
            _mm256_storeu_si256((__m256i *) (lane + 128), add_x16(lo_sum, hi_sum));
            _mm256_storeu_si256((__m256i *) (lane + 384), sub_x16(lo_sum, hi_sum));
            _mm256_storeu_si256((__m256i *) (lane + 0), add_x16(lo_diff, hi_diff));
            _mm256_storeu_si256((__m256i *) (lane + 256), sub_x16(lo_diff, hi_diff));
        }
    }
}

/*
 * Public entry point: runs invntt512 in place on `reps` blocks of 512 int16
 * coefficients at `f`, using the qdata_7681 constant table (its first
 * vector holds 7681 in every lane).
 */
void PQCLEAN_NTRULPR857_AVX2_invntt512_7681(int16 *f, int reps) {
    const __m256i *const table = (const __m256i *) qdata_7681.data;

    invntt512(f, reps, table);
}

/*
 * Public entry point: runs invntt512 in place on `reps` blocks of 512 int16
 * coefficients at `f`, using the qdata_10753 constant table.
 */
void PQCLEAN_NTRULPR857_AVX2_invntt512_10753(int16 *f, int reps) {
    const __m256i *const table = (const __m256i *) qdata_10753.data;

    invntt512(f, reps, table);
}
