diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst
index 457fe96061..7f7a7b682b 100644
--- a/doc/source/nmod_poly.rst
+++ b/doc/source/nmod_poly.rst
@@ -1225,6 +1225,11 @@ Division
The algorithm used is to call :func:`div_newton_n` and then multiply out
and compute the remainder.
+
+Division with special divisors
+--------------------------------------------------------------------------------
+
+
.. function:: ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod)
Sets ``(Q, len-1)`` to the quotient of ``(A, len)`` on division
@@ -1237,6 +1242,56 @@ Division
Sets `Q` to the quotient of `A` on division by `(x - c)`, and returns
the remainder, equal to the value of `A` evaluated at `c`.
+.. function:: void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod)
+
+ Sets the first `n` coefficients of ``RQ`` to the remainder of
+ ``(A, len)`` on division by `x^n - c`, and the next ``len - n``
+ coefficients (from index `n` to index ``len-1``) to the
+ quotient of this division. `A` and `RQ` are allowed to be the
+ same, but may not overlap partially in any other way.
+ Constraints: `len \ge n > 0`, and `c` is reduced modulo
+ ``mod.n``.
+
+.. function:: void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod)
+
+ Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = 1`,
+ that is, divisor is `x^n - 1`.
+
+.. function:: void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod)
+
+ Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = -1`,
+ that is, divisor is `x^n + 1`.
+
+.. function:: void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+
+ Identical to :func:`_nmod_poly_divrem_xnmc` but exploiting the precomputed
+ ``c_precomp`` obtained via :func:`n_mulmod_precomp_shoup` for faster
+ modular multiplications by `c`. Additional constraint: the modulus ``modn``
+ must be less than `2^{\mathtt{FLINT\_BITS} - 1}`.
+
+.. function:: void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+
+ Identical to :func:`_nmod_poly_divrem_xnmc_precomp` but handling modular
+ reductions lazily. Precisely, if all coefficients of `A` are less than or
+ equal to `m`, the input requirement is `m + 2n \le
+ 2^{\mathtt{FLINT\_BITS}}`, and the output coefficients of ``RQ`` are in
+ `[0, m+2n)` and equal to the sought values modulo `n`. In particular the
+ coefficients of `A` need not be reduced modulo ``n``, and the output may
+ not be either. However, the value ``c`` should be reduced modulo `n`.
+
+ In the case where `m = n-1` (coefficients of `A` are reduced modulo
+ `n`), then the above leads to the requirement `3n-1 \le
+ 2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits,
+ and `n \le 1431655765` for 32 bits), and reducing the output coefficients
+ just amounts to subtracting either `n` or `2n` to each of them.
+
+.. function:: void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, ulong n, ulong c)
+
+ Sets `Q` and `R` to the quotient and remainder of `A` on division by `x^n -
+ c`. Constraints: `n` is nonzero and ``c`` is reduced modulo the modulus of
+ `A`. Incorporates specialized code for the cases `c \in \{-1,0,1\}`, and
+ calls variants with precomputation on `c` when possible.
+
Divisibility testing
--------------------------------------------------------------------------------
@@ -1317,14 +1372,14 @@ Evaluation
.. function:: ulong _nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong n)
Evaluates ``poly`` at the value ``c`` modulo ``n``, with lazy reductions
- modulo `n`. Precisely, if all coefficients of ``poly`` are less than `m`,
- the input requirement is `m \le 2^{\mathtt{FLINT\_BITS}} - 2n + 1`, and the
- output value is in `[0, m+2n-1)` and equal to the sought evaluation modulo
- `n`. In particular the coefficients of ``poly`` need not be reduced modulo
- ``n``, and the output may not be either. However, the value ``c`` should be
- reduced modulo `n`.
-
- In the case where `m = n` (coefficients of ``poly`` are reduced modulo
+ modulo `n`. Precisely, if all coefficients of ``poly`` are less than or
+ equal to `m`, the input requirement is `m + 2n \le
+ 2^{\mathtt{FLINT\_BITS}}`, and the output value is in `[0, m+2n)` and equal
+ to the sought evaluation modulo `n`. In particular the coefficients of
+ ``poly`` need not be reduced modulo ``n``, and the output may not be
+ either. However, the value ``c`` should be reduced modulo `n`.
+
+ In the case where `m = n-1` (coefficients of ``poly`` are reduced modulo
`n`), then the above leads to the requirement `3n-1 \le
2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits,
and `n \le 1431655765` for 32 bits), and reducing the output just amounts
diff --git a/src/nmod_poly.h b/src/nmod_poly.h
index b6c33a3b72..c963a4ab65 100644
--- a/src/nmod_poly.h
+++ b/src/nmod_poly.h
@@ -502,10 +502,25 @@ void nmod_poly_div_newton_n_preinv (nmod_poly_t Q, const nmod_poly_t A, const nm
void _nmod_poly_divrem_newton_n_preinv (nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr Binv, slong lenBinv, nmod_t mod);
void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t Binv);
-ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod);
+/* Division with special divisors *******************************************/
+ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod);
ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c);
+void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod);
+void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c);
+
+void _nmod_poly_rem_xnmc(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod);
+void _nmod_poly_rem_xnm1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_rem_xnp1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_rem_xnmc_precomp(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void _nmod_poly_rem_xnmc_precomp_lazy(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void nmod_poly_rem_xnmc(nmod_poly_t R, nmod_poly_t A, ulong n, ulong c);
+
/* Divisibility testing *****************************************************/
int _nmod_poly_divides_classical(nn_ptr Q, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nmod_t mod);
diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c
new file mode 100644
index 0000000000..138cf5af54
--- /dev/null
+++ b/src/nmod_poly/divrem_xnmc.c
@@ -0,0 +1,284 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "ulong_extras.h"
+
+/* division by x**n - 1 */
+void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r;
+
+ if (RQ != A)
+ for (j = 0; j < n; j++)
+ RQ[len-n+j] = A[len-n+j];
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn);
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn);
+ i -= n;
+ }
+}
+
+/* division by x**n + 1 */
+void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r;
+
+ if (RQ != A)
+ for (j = 0; j < n; j++)
+ RQ[len-n+j] = A[len-n+j];
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn);
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn);
+ i -= n;
+ }
+}
+
+/* division by x**n - c, general variant */
+void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val;
+
+ if (RQ != A)
+ for (j = 0; j < n; j++)
+ RQ[len-n+j] = A[len-n+j];
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ val = nmod_mul(RQ[i+n+j], c, mod);
+ RQ[i+j] = n_addmod(val, A[i+j], mod.n);
+ }
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ val = nmod_mul(RQ[i+n+j], c, mod);
+ RQ[i+j] = n_addmod(val, A[i+j], mod.n);
+ }
+ i -= n;
+ }
+}
+
+/* division by x**n - c, with precomputation on c */
+/* constraint: modn < 2**(FLINT_BITS-1) */
+void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val;
+
+ if (RQ != A)
+ for (j = 0; j < n; j++)
+ RQ[len-n+j] = A[len-n+j];
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn);
+ RQ[i+j] = n_addmod(val, A[i+j], modn);
+ }
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn);
+ RQ[i+j] = n_addmod(val, A[i+j], modn);
+ }
+ i -= n;
+ }
+}
+
+/* division by x**n - c, lazy with precomputation */
+/* constraint: max(A) + 2*modn <= 2**FLINT_BITS */
+/* coeff bounds: in [0, max(A)] | out [0, max(A) + 2*modn) */
+void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val, p_hi, p_lo;
+
+ if (RQ != A)
+ for (j = 0; j < n; j++)
+ RQ[len-n+j] = A[len-n+j];
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+ val = RQ[i+n+j];
+ umul_ppmm(p_hi, p_lo, c_precomp, val);
+ val = c * val - p_hi * modn;
+ /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */
+ RQ[i+j] = val + A[i+j];
+ }
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+ val = RQ[i+n+j];
+ umul_ppmm(p_hi, p_lo, c_precomp, val);
+ val = c * val - p_hi * modn;
+ /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */
+ RQ[i+j] = val + A[i+j];
+ }
+ i -= n;
+ }
+}
+
+void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c)
+{
+ const ulong len = A->length;
+
+ if (len <= n)
+ {
+ nmod_poly_zero(Q);
+ nmod_poly_set(R, A);
+ return;
+ }
+
+ if (c == 0)
+ {
+ nmod_poly_set_trunc(R, A, n);
+ nmod_poly_shift_right(Q, A, n);
+ return;
+ }
+
+ int lazy = 0;
+ nn_ptr RQ = _nmod_vec_init(len);
+
+ /* perform division */
+ if (c == 1)
+ _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod.n);
+
+ else if (c == A->mod.n - 1)
+ _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod.n);
+
+ /* if degree below the n_mulmod_shoup threshold, */
+ /* or if modulus forbids n_mulmod_shoup usage, use general */
+#if FLINT_MULMOD_SHOUP_THRESHOLD <= 2
+ else if (A->mod.norm == 0) /* here A->length >= threshold */
+#else
+ else if ((A->length < FLINT_MULMOD_SHOUP_THRESHOLD)
+ || (A->mod.norm == 0))
+#endif
+ {
+ _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod);
+ }
+
+ else
+ {
+ const ulong modn = A->mod.n;
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+
+ /* if 3*mod.n - 1 <= 2**FLINT_BITS, use precomp+lazy variant */
+#if FLINT_BITS == 64
+ if (modn <= UWORD(6148914691236517205))
+#else /* FLINT_BITS == 32 */
+ if (modn <= UWORD(1431655765))
+#endif
+ {
+ lazy = 1;
+ _nmod_poly_divrem_xnmc_precomp_lazy(RQ, A->coeffs, len, n, c, c_precomp, modn);
+ }
+
+ /* use n_mulmod_shoup, non-lazy variant */
+ else
+ {
+ _nmod_poly_divrem_xnmc_precomp(RQ, A->coeffs, len, n, c, c_precomp, modn);
+ }
+ }
+
+ /* copy remainder R */
+ nmod_poly_fit_length(R, n);
+ if (lazy)
+ {
+ /* correct excess */
+ const ulong modn = A->mod.n;
+ for (ulong i = 0; i < n; i++)
+ {
+ ulong v = RQ[i];
+ if (v >= 2*modn)
+ v -= 2*modn;
+ else if (v >= modn)
+ v -= modn;
+ R->coeffs[i] = v;
+ }
+ }
+ else
+ {
+ _nmod_vec_set(R->coeffs, RQ, n);
+ }
+ _nmod_poly_set_length(R, n);
+ _nmod_poly_normalise(R);
+
+ /* copy quotient Q */
+ nmod_poly_fit_length(Q, len - n);
+ if (lazy)
+ {
+ /* correct excess */
+ const ulong modn = A->mod.n;
+ for (ulong i = 0; i < len - n; i++)
+ {
+ ulong v = RQ[n+i];
+ if (v >= 2*modn)
+ v -= 2*modn;
+ else if (v >= modn)
+ v -= modn;
+ Q->coeffs[i] = v;
+ }
+ }
+ else
+ {
+ _nmod_vec_set(Q->coeffs, RQ + n, len - n);
+ }
+ _nmod_poly_set_length(Q, len - n);
+
+ _nmod_vec_clear(RQ);
+}
diff --git a/src/nmod_poly/evaluate_nmod.c b/src/nmod_poly/evaluate_nmod.c
index 5e843005da..2dda5d3a87 100644
--- a/src/nmod_poly/evaluate_nmod.c
+++ b/src/nmod_poly/evaluate_nmod.c
@@ -119,20 +119,20 @@ nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c)
return _nmod_poly_evaluate_nmod(poly->coeffs, poly->length, c, poly->mod);
}
- // if 3*mod.n - 1 <= 2**FLINT_BITS, use n_mulmod_shoup, lazy variant
else
{
const ulong modn = poly->mod.n;
+ /* if 3*mod.n - 1 <= 2**FLINT_BITS, use n_mulmod_shoup, lazy variant */
#if FLINT_BITS == 64
if (modn <= UWORD(6148914691236517205))
-#else // FLINT_BITS == 32
+#else /* FLINT_BITS == 32 */
if (modn <= UWORD(1431655765))
#endif
{
const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
ulong val = _nmod_poly_evaluate_nmod_precomp_lazy(poly->coeffs, poly->length, c, c_precomp, modn);
- // correct excess
+ /* correct excess */
if (val >= 2*modn)
val -= 2*modn;
else if (val >= modn)
@@ -140,7 +140,7 @@ nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c)
return val;
}
- // use n_mulmod_shoup, non-lazy variant
+ /* use n_mulmod_shoup, non-lazy variant */
else
{
const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
diff --git a/src/nmod_poly/profile/p-divrem_xnmc.c b/src/nmod_poly/profile/p-divrem_xnmc.c
new file mode 100644
index 0000000000..b718557c6b
--- /dev/null
+++ b/src/nmod_poly/profile/p-divrem_xnmc.c
@@ -0,0 +1,367 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "flint.h"
+#include "nmod.h"
+#include "profiler.h"
+#include "ulong_extras.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+
+int
+_nmod_poly_divrem_try_sparse(nn_ptr Q, nn_ptr R, nn_srcptr A,
+ slong lenA, nn_srcptr B, slong lenB,
+ nn_srcptr Binv, slong FLINT_UNUSED(lenBinv), nmod_t mod);
+
+static inline ulong
+_get_c_from_ctype(int ctype, flint_rand_t state, ulong modn)
+{
+ if (ctype == 0)
+ return modn - 1;
+ else if (ctype == 1)
+ return 1;
+ else
+ return n_randint(state, modn);
+}
+
+typedef struct
+{
+ flint_bitcnt_t bits;
+ slong length;
+ ulong n;
+ int ctype; /* 0: c == -1; 1: c == 1; else: general*/
+} info_t;
+
+void sample_divrem(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nmod_poly_t poly, div, quo, rem;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ nmod_poly_init2(poly, modn, length);
+ _nmod_poly_set_length(poly, length);
+ for (j = 0; j < length; j++)
+ poly->coeffs[j] = n_randint(state, modn);
+ _nmod_poly_normalise(poly);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ nmod_poly_init_mod(quo, poly->mod);
+ nmod_poly_init_mod(rem, poly->mod);
+ nmod_poly_init_mod(div, poly->mod);
+
+ /* divisor xnmc: b == x**n - c */
+ nmod_poly_set_coeff_ui(div, n, 1);
+ nmod_poly_set_coeff_ui(div, 0, n_negmod(c, modn));
+
+ prof_start();
+ nmod_poly_divrem(quo, rem, poly, div);
+ prof_stop();
+
+ nmod_poly_clear(quo);
+ nmod_poly_clear(rem);
+ nmod_poly_clear(poly);
+ nmod_poly_clear(div);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_divrem_sparse(void * arg, ulong count)
+{
+ ulong modn;
+ nmod_t mod;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i;
+
+ nn_ptr Q, R, A, B;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+ nmod_init(&mod, modn);
+
+ Q = _nmod_vec_init(length - n);
+ R = _nmod_vec_init(n);
+ A = _nmod_vec_init(length);
+ _nmod_vec_rand(A, state, length, mod);
+ B = _nmod_vec_init(n+1);
+ _nmod_vec_zero(B, n+1);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+ B[0] = modn - c;
+ B[n] = 1;
+
+ prof_start();
+ _nmod_poly_divrem_try_sparse(Q, R, A, length, B, n+1, NULL, 0, mod);
+ prof_stop();
+
+ _nmod_vec_clear(Q);
+ _nmod_vec_clear(R);
+ _nmod_vec_clear(A);
+ _nmod_vec_clear(B);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_interface(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nmod_poly_t poly, quo, rem;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ nmod_poly_init2(poly, modn, length);
+ _nmod_poly_set_length(poly, length);
+ for (j = 0; j < length; j++)
+ poly->coeffs[j] = n_randint(state, modn);
+ _nmod_poly_normalise(poly);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ nmod_poly_init_mod(quo, poly->mod);
+ nmod_poly_init_mod(rem, poly->mod);
+
+ prof_start();
+ nmod_poly_divrem_xnmc(quo, rem, poly, n, c);
+ prof_stop();
+
+ nmod_poly_clear(quo);
+ nmod_poly_clear(rem);
+ nmod_poly_clear(poly);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_generic(void * arg, ulong count)
+{
+ ulong modn;
+ nmod_t mod;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nn_ptr qr;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+ nmod_init(&mod, modn);
+
+ qr = _nmod_vec_init(length);
+ for (j = 0; j < length; j++)
+ qr[j] = n_randint(state, modn);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ _nmod_poly_divrem_xnmc(qr, qr, length, n, c, mod);
+ prof_stop();
+
+ _nmod_vec_clear(qr);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_precomp(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nn_ptr qr;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ qr = _nmod_vec_init(length);
+ for (j = 0; j < length; j++)
+ qr[j] = n_randint(state, modn);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ _nmod_poly_divrem_xnmc_precomp(qr, qr, length, n, c, c_precomp, modn);
+ prof_stop();
+
+ _nmod_vec_clear(qr);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_precomp_lazy(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nn_ptr qr;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ qr = _nmod_vec_init(length);
+ for (j = 0; j < length; j++)
+ qr[j] = n_randint(state, modn);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ _nmod_poly_divrem_xnmc_precomp_lazy(qr, qr, length, n, c, c_precomp, modn);
+ prof_stop();
+
+ _nmod_vec_clear(qr);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+int main(void)
+{
+ const int nblengths = 15;
+ slong lengths[15] = {5, 40, 80, 160, 320,
+ 640, 1280, 2560, 5120, 10240,
+ 20480, 40960, 81920, 163840, 327680};
+
+ double min, max;
+ double mins_divrem;
+ double mins_divrem_sparse;
+ double mins_interface;
+ double mins_generic;
+ double mins_precomp;
+ double mins_precomp_lazy;
+ info_t info;
+ flint_bitcnt_t i;
+
+ flint_printf("profiled:\n");
+ flint_printf("\t1. divrem\n\t1bis. divrem sparse\n\t2. poly interface\n\t3. vec general\n");
+ flint_printf("\t4. vec precomp\n\t5. precomp_lazy (no excess correction)\n");
+ flint_printf("column c: 0 -> c == -1, 1 -> c == 1, 2 -> c general\n");
+ flint_printf("note: variant 2. expected to be faster than variants 3.4.5. when c == 1 or c == -1\n\n");
+ flint_printf("bit c len n 1. 1bis. 2. 3. 4. 5.\n");
+
+ for (i = 62; i <= FLINT_BITS; i++)
+ {
+ info.bits = i;
+
+ for (int ct = 0; ct <= 2; ct++)
+ {
+ info.ctype = ct;
+ for (int len = 0; len < nblengths; ++len)
+ {
+ info.length = lengths[len];
+
+ info.n = 1;
+ while (info.n < (ulong)lengths[len])
+ {
+ prof_repeat(&min, &max, sample_divrem, (void *) &info);
+ mins_divrem = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_divrem_sparse, (void *) &info);
+ mins_divrem_sparse = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_interface, (void *) &info);
+ mins_interface = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_generic, (void *) &info);
+ mins_generic = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_precomp, (void *) &info);
+ mins_precomp = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info);
+ mins_precomp_lazy = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+
+ if (i < FLINT_BITS-1)
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e %.1e",
+ info.bits, info.ctype, info.length, info.n,
+ mins_divrem, mins_divrem_sparse, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy);
+ flint_printf("\n");
+ }
+ else if (i < FLINT_BITS)
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e na",
+ info.bits, info.ctype, info.length, info.n,
+ mins_divrem, mins_divrem_sparse, mins_interface, mins_generic, mins_precomp);
+ flint_printf("\n");
+ }
+ else /* i == FLINT_BITS */
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na na",
+ info.bits, info.ctype, info.length, info.n,
+ mins_divrem, mins_divrem_sparse, mins_interface, mins_generic);
+ flint_printf("\n");
+ }
+
+ info.n = 5 * info.n;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
diff --git a/src/nmod_poly/profile/p-evaluate_nmod.c b/src/nmod_poly/profile/p-evaluate_nmod.c
index b9854c32c0..76bfcae173 100644
--- a/src/nmod_poly/profile/p-evaluate_nmod.c
+++ b/src/nmod_poly/profile/p-evaluate_nmod.c
@@ -43,6 +43,7 @@ void sample_interface(void * arg, ulong count)
_nmod_poly_set_length(poly, length);
for (j = 0; j < length; j++)
poly->coeffs[j] = n_randint(state, n);
+ _nmod_poly_normalise(poly);
pt = n_randint(state, n);
diff --git a/src/nmod_poly/profile/p-rem_xnmc.c b/src/nmod_poly/profile/p-rem_xnmc.c
new file mode 100644
index 0000000000..c702ccce40
--- /dev/null
+++ b/src/nmod_poly/profile/p-rem_xnmc.c
@@ -0,0 +1,317 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "flint.h"
+#include "nmod.h"
+#include "profiler.h"
+#include "ulong_extras.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+
+static inline ulong
+_get_c_from_ctype(int ctype, flint_rand_t state, ulong modn)
+{
+ if (ctype == 0)
+ return modn - 1;
+ else if (ctype == 1)
+ return 1;
+ else
+ return n_randint(state, modn);
+}
+
+typedef struct
+{
+ flint_bitcnt_t bits;
+ slong length;
+ ulong n;
+ int ctype; /* 0: c == -1; 1: c == 1; else: general*/
+} info_t;
+
+void sample_rem(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nmod_poly_t poly, div, rem;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ nmod_poly_init2(poly, modn, length);
+ _nmod_poly_set_length(poly, length);
+ for (j = 0; j < length; j++)
+ poly->coeffs[j] = n_randint(state, modn);
+ _nmod_poly_normalise(poly);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ nmod_poly_init_mod(rem, poly->mod);
+ nmod_poly_init_mod(div, poly->mod);
+
+ /* divisor xnmc: b == x**n - c */
+ nmod_poly_set_coeff_ui(div, n, 1);
+ nmod_poly_set_coeff_ui(div, 0, n_negmod(c, modn));
+
+ prof_start();
+ nmod_poly_rem(rem, poly, div);
+ prof_stop();
+
+ nmod_poly_clear(rem);
+ nmod_poly_clear(poly);
+ nmod_poly_clear(div);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_interface(void * arg, ulong count)
+{
+ ulong modn;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i, j;
+
+ nmod_poly_t poly, rem;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+
+ nmod_poly_init2(poly, modn, length);
+ _nmod_poly_set_length(poly, length);
+ for (j = 0; j < length; j++)
+ poly->coeffs[j] = n_randint(state, modn);
+ _nmod_poly_normalise(poly);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ nmod_poly_init_mod(rem, poly->mod);
+
+ prof_start();
+ nmod_poly_rem_xnmc(rem, poly, n, c);
+ prof_stop();
+
+ nmod_poly_clear(rem);
+ nmod_poly_clear(poly);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_generic(void * arg, ulong count)
+{
+ ulong modn;
+ nmod_t mod;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i;
+
+ nn_ptr r, a;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+ nmod_init(&mod, modn);
+
+ r = _nmod_vec_init(n);
+ a = _nmod_vec_init(length);
+ _nmod_vec_rand(a, state, length, mod);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ _nmod_poly_rem_xnmc(r, a, length, n, c, mod);
+ prof_stop();
+
+ _nmod_vec_clear(r);
+ _nmod_vec_clear(a);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_precomp(void * arg, ulong count)
+{
+ ulong modn;
+ nmod_t mod;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i;
+
+ nn_ptr r, a;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+ nmod_init(&mod, modn);
+
+ r = _nmod_vec_init(n);
+ a = _nmod_vec_init(length);
+ _nmod_vec_rand(a, state, length, mod);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ _nmod_poly_rem_xnmc_precomp(r, a, length, n, c, c_precomp, modn);
+ prof_stop();
+
+ _nmod_vec_clear(a);
+ _nmod_vec_clear(r);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+void sample_precomp_lazy(void * arg, ulong count)
+{
+ ulong modn;
+ nmod_t mod;
+ info_t * info = (info_t *) arg;
+ flint_bitcnt_t bits = info->bits;
+ slong length = info->length;
+ ulong n = info->n;
+ ulong c;
+
+ slong i;
+
+ nn_ptr r, a;
+
+ FLINT_TEST_INIT(state);
+
+ for (i = 0; i < (slong)count; i++)
+ {
+ modn = n_randprime(state, bits, 1);
+ nmod_init(&mod, modn);
+
+ r = _nmod_vec_init(n);
+ a = _nmod_vec_init(length);
+ _nmod_vec_rand(a, state, length, mod);
+
+ c = _get_c_from_ctype(info->ctype, state, modn);
+
+ prof_start();
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ _nmod_poly_rem_xnmc_precomp_lazy(r, a, length, n, c, c_precomp, modn);
+ prof_stop();
+
+ _nmod_vec_clear(r);
+ _nmod_vec_clear(a);
+ }
+
+ FLINT_TEST_CLEAR(state);
+}
+
+int main(void)
+{
+ const int nblengths = 15;
+ slong lengths[15] = {5, 40, 80, 160, 320,
+ 640, 1280, 2560, 5120, 10240,
+ 20480, 40960, 81920, 163840, 327680};
+
+ double min, max;
+ double mins_rem;
+ double mins_interface;
+ double mins_generic;
+ double mins_precomp;
+ double mins_precomp_lazy;
+ info_t info;
+ flint_bitcnt_t i;
+
+ flint_printf("profiled:\n");
+ flint_printf("\t1. rem\n\t2. poly interface\n\t3. vec general\n");
+ flint_printf("\t4. vec precomp\n\t5. precomp_lazy (no excess correction)\n");
+ flint_printf("column c: 0 -> c == -1, 1 -> c == 1, 2 -> c general\n");
+ flint_printf("note: variant 2. expected to be faster than variants 3.4.5. when c == 1 or c == -1\n\n");
+ flint_printf("bit c len n 1. 2. 3. 4. 5.\n");
+
+ for (i = 62; i <= FLINT_BITS; i++)
+ {
+ info.bits = i;
+
+ for (int ct = 0; ct <= 2; ct++)
+ {
+ info.ctype = ct;
+ for (int len = 0; len < nblengths; ++len)
+ {
+ info.length = lengths[len];
+
+ info.n = 1;
+ while (info.n < (ulong)lengths[len])
+ {
+ prof_repeat(&min, &max, sample_rem, (void *) &info);
+ mins_rem = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_interface, (void *) &info);
+ mins_interface = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_generic, (void *) &info);
+ mins_generic = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_precomp, (void *) &info);
+ mins_precomp = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+ prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info);
+ mins_precomp_lazy = min/(double)FLINT_CLOCK_SCALE_FACTOR;
+
+ if (i < FLINT_BITS-1)
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e",
+ info.bits, info.ctype, info.length, info.n,
+ mins_rem, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy);
+ flint_printf("\n");
+ }
+ else if (i < FLINT_BITS)
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na",
+ info.bits, info.ctype, info.length, info.n,
+ mins_rem, mins_interface, mins_generic, mins_precomp);
+ flint_printf("\n");
+ }
+ else /* i == FLINT_BITS */
+ {
+ flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e na na",
+ info.bits, info.ctype, info.length, info.n,
+ mins_rem, mins_interface, mins_generic);
+ flint_printf("\n");
+ }
+
+ info.n = 5 * info.n;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
diff --git a/src/nmod_poly/rem_xnmc.c b/src/nmod_poly/rem_xnmc.c
new file mode 100644
index 0000000000..c7e8439d3b
--- /dev/null
+++ b/src/nmod_poly/rem_xnmc.c
@@ -0,0 +1,250 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "ulong_extras.h"
+
+/* division by x**n - 1 */
+void _nmod_poly_rem_xnm1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r;
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ R[j] = n_addmod(A[i+n+j], A[i+j], modn);
+ for (j = r; j < n; j++)
+ R[j] = A[i+j];
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ R[j] = n_addmod(R[j], A[i+j], modn);
+ i -= n;
+ }
+}
+
+/* division by x**n + 1 */
+void _nmod_poly_rem_xnp1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r;
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ R[j] = n_submod(A[i+j], A[i+n+j], modn);
+ for (j = r; j < n; j++)
+ R[j] = A[i+j];
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ R[j] = n_submod(A[i+j], R[j], modn);
+ i -= n;
+ }
+}
+
+/* division by x**n - c, general variant */
+void _nmod_poly_rem_xnmc(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val;
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ val = nmod_mul(A[i+n+j], c, mod);
+ R[j] = n_addmod(val, A[i+j], mod.n);
+ }
+ for (j = r; j < n; j++)
+ R[j] = A[i+j];
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ val = nmod_mul(R[j], c, mod);
+ R[j] = n_addmod(val, A[i+j], mod.n);
+ }
+ i -= n;
+ }
+}
+
+/* division by x**n - c, with precomputation on c */
+/* constraint: modn < 2**(FLINT_BITS-1) */
+void _nmod_poly_rem_xnmc_precomp(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val;
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ val = n_mulmod_shoup(c, A[i+n+j], c_precomp, modn);
+ R[j] = n_addmod(val, A[i+j], modn);
+ }
+ for (j = r; j < n; j++)
+ R[j] = A[i+j];
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ val = n_mulmod_shoup(c, R[j], c_precomp, modn);
+ R[j] = n_addmod(val, A[i+j], modn);
+ }
+ i -= n;
+ }
+}
+
+/* division by x**n - c, lazy with precomputation */
+/* constraint: max(A) + 2*modn <= 2**FLINT_BITS */
+/* coeff bounds: in [0, max(A)] | out [0, max(A) + 2*modn) */
+void _nmod_poly_rem_xnmc_precomp_lazy(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+ /* assumes len >= n */
+ slong i;
+ ulong j, r, val, p_hi, p_lo;
+
+ r = len % n;
+ i = len - r - n; /* multiple of n, >= 0 by assumption */
+
+ for (j = 0; j < r; j++)
+ {
+ /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+ val = A[i+n+j];
+ umul_ppmm(p_hi, p_lo, c_precomp, val);
+ val = c * val - p_hi * modn;
+ /* lazy addition, yields R[i+j] in [0..k+2n), where max(R) <= k */
+ R[j] = val + A[i+j];
+ }
+ for (j = r; j < n; j++)
+ R[j] = A[i+j];
+
+ i -= n;
+ while (i >= 0)
+ {
+ for (j = 0; j < n; j++)
+ {
+ /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+ val = R[j];
+ umul_ppmm(p_hi, p_lo, c_precomp, val);
+ val = c * val - p_hi * modn;
+ /* lazy addition, yields R[i+j] in [0..k+2n), where max(R) <= k */
+ R[j] = val + A[i+j];
+ }
+ i -= n;
+ }
+}
+
+void nmod_poly_rem_xnmc(nmod_poly_t R, nmod_poly_t A, ulong n, ulong c)
+{
+ const ulong len = A->length;
+
+ if (len <= n)
+ {
+ nmod_poly_set(R, A);
+ return;
+ }
+
+ if (c == 0)
+ {
+ nmod_poly_set_trunc(R, A, n);
+ return;
+ }
+
+ int lazy = 0;
+ nn_ptr Rvec = _nmod_vec_init(n);
+
+ /* perform division */
+ if (c == 1)
+ _nmod_poly_rem_xnm1(Rvec, A->coeffs, len, n, A->mod.n);
+
+ else if (c == A->mod.n - 1)
+ _nmod_poly_rem_xnp1(Rvec, A->coeffs, len, n, A->mod.n);
+
+ /* if degree below the n_mulmod_shoup threshold, */
+ /* or if modulus forbids n_mulmod_shoup usage, use general */
+#if FLINT_MULMOD_SHOUP_THRESHOLD <= 2
+ else if (A->mod.norm == 0) /* here A->length >= threshold */
+#else
+ else if ((A->length < FLINT_MULMOD_SHOUP_THRESHOLD)
+ || (A->mod.norm == 0))
+#endif
+ {
+ _nmod_poly_rem_xnmc(Rvec, A->coeffs, len, n, c, A->mod);
+ }
+
+ else
+ {
+ const ulong modn = A->mod.n;
+ const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+
+ /* if 3*mod.n - 1 <= 2**FLINT_BITS, use precomp+lazy variant */
+#if FLINT_BITS == 64
+ if (modn <= UWORD(6148914691236517205))
+#else /* FLINT_BITS == 32 */
+ if (modn <= UWORD(1431655765))
+#endif
+ {
+ lazy = 1;
+ _nmod_poly_rem_xnmc_precomp_lazy(Rvec, A->coeffs, len, n, c, c_precomp, modn);
+ }
+
+ /* use n_mulmod_shoup, non-lazy variant */
+ else
+ {
+ _nmod_poly_rem_xnmc_precomp(Rvec, A->coeffs, len, n, c, c_precomp, modn);
+ }
+ }
+
+ /* copy remainder R */
+ nmod_poly_fit_length(R, n);
+ if (lazy)
+ {
+ /* correct excess */
+ const ulong modn = A->mod.n;
+ for (ulong i = 0; i < n; i++)
+ {
+ ulong v = Rvec[i];
+ if (v >= 2*modn)
+ v -= 2*modn;
+ else if (v >= modn)
+ v -= modn;
+ R->coeffs[i] = v;
+ }
+ }
+ else
+ {
+ _nmod_vec_set(R->coeffs, Rvec, n);
+ }
+ _nmod_poly_set_length(R, n);
+ _nmod_poly_normalise(R);
+
+ _nmod_vec_clear(Rvec);
+}
diff --git a/src/nmod_poly/test/main.c b/src/nmod_poly/test/main.c
index 04dc56b5e8..b6df1a5e2c 100644
--- a/src/nmod_poly/test/main.c
+++ b/src/nmod_poly/test/main.c
@@ -45,6 +45,7 @@
#include "t-divrem.c"
#include "t-divrem_newton_n_preinv.c"
#include "t-div_root.c"
+#include "t-divrem_xnmc.c"
#include "t-div_series_basecase.c"
#include "t-div_series.c"
#include "t-equal_trunc.c"
@@ -108,6 +109,7 @@
#include "t-pow_trunc.c"
#include "t-product_roots_nmod_vec.c"
#include "t-rem.c"
+#include "t-rem_xnmc.c"
#include "t-resultant.c"
#include "t-resultant_euclidean.c"
#include "t-resultant_hgcd.c"
@@ -171,6 +173,7 @@ test_struct tests[] =
TEST_FUNCTION(nmod_poly_divrem),
TEST_FUNCTION(nmod_poly_divrem_newton_n_preinv),
TEST_FUNCTION(nmod_poly_div_root),
+ TEST_FUNCTION(nmod_poly_divrem_xnmc),
TEST_FUNCTION(nmod_poly_div_series_basecase),
TEST_FUNCTION(nmod_poly_div_series),
TEST_FUNCTION(nmod_poly_equal_trunc),
@@ -234,6 +237,7 @@ test_struct tests[] =
TEST_FUNCTION(nmod_poly_pow_trunc),
TEST_FUNCTION(nmod_poly_product_roots_nmod_vec),
TEST_FUNCTION(nmod_poly_rem),
+ TEST_FUNCTION(nmod_poly_rem_xnmc),
TEST_FUNCTION(nmod_poly_resultant),
TEST_FUNCTION(nmod_poly_resultant_euclidean),
TEST_FUNCTION(nmod_poly_resultant_hgcd),
diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c
new file mode 100644
index 0000000000..a3457be816
--- /dev/null
+++ b/src/nmod_poly/test/t-divrem_xnmc.c
@@ -0,0 +1,187 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod_poly.h"
+
+TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state)
+{
+ int i, result = 1;
+
+ for (i = 0; i < 5000 * flint_test_multiplier(); i++)
+ {
+ nmod_poly_t a, b;
+ nmod_poly_t qok, rok, q1, r1;
+ nn_ptr qr1;
+ ulong modn, len, n, c;
+ nmod_t mod;
+
+ /* modulus */
+ if (i < 128)
+ modn = n_randtest_bits(state, FLINT_BITS);
+ else if (i < 256)
+ modn = n_randtest_bits(state, FLINT_BITS-1);
+ else if (i < 384)
+ modn = n_randtest_bits(state, FLINT_BITS-2);
+ else
+ modn = n_randtest_not_zero(state);
+ nmod_init(&mod, modn);
+
+ /* initialize polynomials */
+ nmod_poly_init_mod(a, mod);
+ nmod_poly_init_mod(b, mod);
+ nmod_poly_init_mod(qok, mod);
+ nmod_poly_init_mod(rok, mod);
+ nmod_poly_init_mod(q1, mod);
+ nmod_poly_init_mod(r1, mod);
+
+ /* pick parameters */
+ len = 5 + n_randint(state, 100); /* poly length, [5, 105) */
+ n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */
+
+ if (i % 16 == 0 && modn > 1)
+ c = 1;
+ else if (i % 16 == 1)
+ c = modn - 1;
+ else if (i % 16 == 2)
+ c = 0;
+ else
+ c = n_randint(state, modn);
+
+ /* random a of length <= len; qr1 with same coefficients */
+ nmod_poly_randtest(a, state, len);
+ qr1 = _nmod_vec_init(a->length);
+ _nmod_vec_set(qr1, a->coeffs, a->length);
+
+ /* divisor xnmc: b == x**n - c */
+ nmod_poly_set_coeff_ui(b, n, 1);
+ nmod_poly_set_coeff_ui(b, 0, n_negmod(c, modn));
+
+ // correct quotient/remainder
+ if (modn == 1)
+ {
+ nmod_poly_zero(rok);
+ nmod_poly_zero(qok);
+ }
+ else
+ nmod_poly_divrem(qok, rok, a, b);
+
+ // general interface
+ nmod_poly_divrem_xnmc(q1, r1, a, n, c);
+
+ result = (nmod_poly_equal(qok, q1) && nmod_poly_equal(rok, r1));
+ if (!result)
+ {
+ flint_printf("FAIL (poly 1):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, poly a :\n", n, c);
+ nmod_poly_print(a), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+
+ /* testing in-place underscore version, requires length >= n */
+ if (a->length >= (slong)n)
+ {
+ /* general function */
+ _nmod_poly_divrem_xnmc(qr1, qr1, a->length, n, c, a->mod);
+
+ result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length)
+ && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length));
+ if (!result)
+ {
+ flint_printf("FAIL (vec):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+
+ /* special cases 1, -1 */
+ _nmod_vec_set(qr1, a->coeffs, a->length);
+ if (c == 1 || c == modn - 1)
+ {
+ if (c == 1)
+ _nmod_poly_divrem_xnm1(qr1, qr1, a->length, n, a->mod.n);
+ else if (c == modn-1)
+ _nmod_poly_divrem_xnp1(qr1, qr1, a->length, n, a->mod.n);
+
+ result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length)
+ && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length));
+ if (!result)
+ {
+ flint_printf("FAIL (vec, +1 / - 1):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+ }
+
+ /* cases using precomp */
+ _nmod_vec_set(qr1, a->coeffs, a->length);
+ if (NMOD_CAN_USE_SHOUP(mod))
+ {
+ ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ int lazy = 0;
+
+#if FLINT_BITS == 64
+ if (modn <= UWORD(6148914691236517205))
+#else // FLINT_BITS == 32
+ if (modn <= UWORD(1431655765))
+#endif
+ {
+ lazy = 1;
+ _nmod_poly_divrem_xnmc_precomp_lazy(qr1, qr1, a->length, n, c, c_precomp, modn);
+ for (long ii = 0; ii < a->length; ii++)
+ {
+ if (qr1[ii] >= 2*modn)
+ qr1[ii] -= 2*modn;
+ else if (qr1[ii] >= modn)
+ qr1[ii] -= modn;
+ }
+ }
+ else
+ _nmod_poly_divrem_xnmc_precomp(qr1, qr1, a->length, n, c, c_precomp, modn);
+
+ result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length)
+ && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length));
+ if (!result)
+ {
+ if (lazy)
+ flint_printf("FAIL (vec, precomp lazy):\n");
+ else
+ flint_printf("FAIL (vec, precomp):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+ }
+ }
+
+ nmod_poly_clear(a);
+ nmod_poly_clear(b);
+ nmod_poly_clear(qok);
+ nmod_poly_clear(rok);
+ nmod_poly_clear(q1);
+ nmod_poly_clear(r1);
+ _nmod_vec_clear(qr1);
+ }
+
+ TEST_FUNCTION_END(state);
+}
diff --git a/src/nmod_poly/test/t-rem_xnmc.c b/src/nmod_poly/test/t-rem_xnmc.c
new file mode 100644
index 0000000000..99b26980c6
--- /dev/null
+++ b/src/nmod_poly/test/t-rem_xnmc.c
@@ -0,0 +1,174 @@
+/*
+ Copyright (C) 2025 Vincent Neiger
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod_poly.h"
+
+TEST_FUNCTION_START(nmod_poly_rem_xnmc, state)
+{
+ int i, result = 1;
+
+ for (i = 0; i < 5000 * flint_test_multiplier(); i++)
+ {
+ nmod_poly_t a, b;
+ nmod_poly_t rok, r1;
+ nn_ptr rvec1;
+ ulong modn, len, n, c;
+ nmod_t mod;
+
+ /* modulus */
+ if (i < 128)
+ modn = n_randtest_bits(state, FLINT_BITS);
+ else if (i < 256)
+ modn = n_randtest_bits(state, FLINT_BITS-1);
+ else if (i < 384)
+ modn = n_randtest_bits(state, FLINT_BITS-2);
+ else
+ modn = n_randtest_not_zero(state);
+ nmod_init(&mod, modn);
+
+ /* initialize polynomials */
+ nmod_poly_init_mod(a, mod);
+ nmod_poly_init_mod(b, mod);
+ nmod_poly_init_mod(rok, mod);
+ nmod_poly_init_mod(r1, mod);
+
+ /* pick parameters */
+ len = 5 + n_randint(state, 100); /* poly length, [5, 105) */
+ n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */
+
+ if (i % 16 == 0 && modn > 1)
+ c = 1;
+ else if (i % 16 == 1)
+ c = modn - 1;
+ else if (i % 16 == 2)
+ c = 0;
+ else
+ c = n_randint(state, modn);
+
+ /* random a of length <= len; rvec1 of length == n */
+ nmod_poly_randtest(a, state, len);
+ rvec1 = _nmod_vec_init(n);
+
+ /* divisor xnmc: b == x**n - c */
+ nmod_poly_set_coeff_ui(b, n, 1);
+ nmod_poly_set_coeff_ui(b, 0, n_negmod(c, modn));
+
+ // correct quotient/remainder
+ if (modn == 1)
+ nmod_poly_zero(rok);
+ else
+ nmod_poly_rem(rok, a, b);
+
+ // general interface
+ nmod_poly_rem_xnmc(r1, a, n, c);
+
+ result = (nmod_poly_equal(rok, r1));
+ if (!result)
+ {
+ flint_printf("FAIL (poly 1):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, poly a :\n", n, c);
+ nmod_poly_print(a), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+
+ /* testing in-place underscore version, requires length >= n */
+ if (a->length >= (slong)n)
+ {
+ /* general function */
+ _nmod_poly_rem_xnmc(rvec1, a->coeffs, a->length, n, c, a->mod);
+
+ result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length));
+ if (!result)
+ {
+ flint_printf("FAIL (vec):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+
+ /* special cases 1, -1 */
+ if (c == 1 || c == modn - 1)
+ {
+ if (c == 1)
+ _nmod_poly_rem_xnm1(rvec1, a->coeffs, a->length, n, a->mod.n);
+ else if (c == modn-1)
+ _nmod_poly_rem_xnp1(rvec1, a->coeffs, a->length, n, a->mod.n);
+
+ result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length));
+ if (!result)
+ {
+ flint_printf("FAIL (vec, +1 / - 1):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+ }
+
+ /* cases using precomp */
+ if (NMOD_CAN_USE_SHOUP(mod))
+ {
+ ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+ int lazy = 0;
+
+#if FLINT_BITS == 64
+ if (modn <= UWORD(6148914691236517205))
+#else // FLINT_BITS == 32
+ if (modn <= UWORD(1431655765))
+#endif
+ {
+ lazy = 1;
+ _nmod_poly_rem_xnmc_precomp_lazy(rvec1, a->coeffs, a->length, n, c, c_precomp, modn);
+ for (ulong ii = 0; ii < n; ii++)
+ {
+ if (rvec1[ii] >= 2*modn)
+ rvec1[ii] -= 2*modn;
+ else if (rvec1[ii] >= modn)
+ rvec1[ii] -= modn;
+ }
+ }
+ else
+ _nmod_poly_rem_xnmc_precomp(rvec1, a->coeffs, a->length, n, c, c_precomp, modn);
+
+ result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length));
+ if (!result)
+ {
+ if (lazy)
+ flint_printf("FAIL (vec, precomp lazy):\n");
+ else
+ flint_printf("FAIL (vec, precomp):\n");
+ flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n);
+ flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c);
+ _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n");
+ fflush(stdout);
+ flint_abort();
+ }
+ }
+ }
+
+ nmod_poly_clear(a);
+ nmod_poly_clear(b);
+ nmod_poly_clear(rok);
+ nmod_poly_clear(r1);
+ _nmod_vec_clear(rvec1);
+ }
+
+ TEST_FUNCTION_END(state);
+}