For a polynomial the 'th root of unity and the 'th root of unity
with with
The CT algorithm splits the size-n input into one vector that contains the even-indexed values and another one that contains the odd-indexed values. After applying the NTT on both halves, denote by the vector obtained from the even indexes and with the vector obtained from the odd indexes. The full-size NTT vector is obtained as follows: the first half of the resulting vector is computed as and the second half as with from 0 to

The GS algorithm splits the size- input into one vector that contains the first values and another one that contains the last values. After applying the NTT on both halves, let us denote by the vector obtained from the first left indexes and with the vector obtained from the last right indexes. The final result is obtained as follows: the even indexes of the resulting vector are updated as and the second half as with from 0 to

CT NTT (Kyber round 3)
/*************************************************
* Name: ntt
*
* Description: Inplace number-theoretic transform (NTT) in Rq
* input is in standard order, output is in bitreversed order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
**************************************************/
void ntt(int16_t r[256]) {
unsigned int len, start, j, k;
int16_t t, zeta;
k = 1;
for(len = 128; len >= 2; len >>= 1) {
for(start = 0; start < 256; start = j + len) {
zeta = zetas[k++];
for(j = start; j < start + len; ++j) {
t = fqmul(zeta, r[j + len]);
r[j + len] = r[j] - t;
r[j] = r[j] + t;
}
}
}
}
GS INTT (Kyber round 3)
/*************************************************
* Name: invntt_tomont
*
* Description: Inplace inverse number-theoretic transform in Rq and
* multiplication by Montgomery factor 2^16.
* Input is in bitreversed order, output is in standard order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
**************************************************/
void invntt(int16_t r[256]) {
unsigned int start, len, j, k;
int16_t t, zeta;
k = 0;
for(len = 2; len <= 128; len <<= 1) {
for(start = 0; start < 256; start = j + len) {
zeta = zetas_inv[k++];
for(j = start; j < start + len; ++j) {
t = r[j];
r[j] = barrett_reduce(t + r[j + len]);
r[j + len] = t - r[j + len];
r[j + len] = fqmul(zeta, r[j + len]);
}
}
}
for(j = 0; j < 256; ++j)
r[j] = fqmul(r[j], zetas_inv[127]);
}
/*************************************************
* Name: montgomery_reduce
*
* Description: Montgomery reduction; given a 32-bit integer a, computes
* 16-bit integer congruent to a * R^-1 mod q,
* where R=2^16
*
* Arguments: - int32_t a: input integer to be reduced;
* has to be in {-q2^15,...,q2^15-1}
*
* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
**************************************************/
int16_t montgomery_reduce(int32_t a)
{
int32_t t;
int16_t u;
u = a*QINV;
t = (int32_t)u*KYBER_Q;
t = a - t;
t >>= 16;
return t;
}
/*************************************************
* Name: barrett_reduce
*
* Description: Barrett reduction; given a 16-bit integer a, computes
* 16-bit integer congruent to a mod q in {0,...,q}
*
* Arguments: - int16_t a: input integer to be reduced
*
* Returns: integer in {0,...,q} congruent to a modulo q.
**************************************************/
int16_t barrett_reduce(int16_t a) {
int16_t t;
const int16_t v = ((1U << 26) + KYBER_Q/2)/KYBER_Q;
t = (int32_t)v*a >> 26;
t *= KYBER_Q;
return a - t;
}