From 885aab99eff994845b66cbb8859215eada118f4b Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 00:03:24 +0100 Subject: [PATCH 1/9] Add nmod_poly_divrem specialized to divisor x**n - c: - add nmod_poly_divrem_xnmc, general version - add test, doc, draft of profile --- doc/source/nmod_poly.rst | 21 ++ src/nmod_poly.h | 6 +- src/nmod_poly/divrem_xnmc.c | 76 ++++++ src/nmod_poly/profile/p-divrem_xnmc.c | 344 ++++++++++++++++++++++++ src/nmod_poly/profile/p-evaluate_nmod.c | 1 + src/nmod_poly/test/main.c | 2 + src/nmod_poly/test/t-divrem_xnmc.c | 101 +++++++ 7 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 src/nmod_poly/divrem_xnmc.c create mode 100644 src/nmod_poly/profile/p-divrem_xnmc.c create mode 100644 src/nmod_poly/test/t-divrem_xnmc.c diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst index 457fe96061..24989705ba 100644 --- a/doc/source/nmod_poly.rst +++ b/doc/source/nmod_poly.rst @@ -1225,6 +1225,11 @@ Division The algorithm used is to call :func:`div_newton_n` and then multiply out and compute the remainder. + +Division with special divisors +-------------------------------------------------------------------------------- + + .. function:: ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod) Sets ``(Q, len-1)`` to the quotient of ``(A, len)`` on division @@ -1237,6 +1242,22 @@ Division Sets `Q` to the quotient of `A` on division by `(x - c)`, and returns the remainder, equal to the value of `A` evaluated at `c`. +.. function:: void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); + + Sets the first `n` coefficients of ``RQ`` to the remainder of + ``(A, len)`` on division by `x^n - c`, and the next ``len - n`` + coefficients (from index `n` to index ``len-1``) to the + quotient of this division. `A` and `RQ` are allowed to be the + same, but may not overlap partially in any other way. + Constraints: `len \ge n > 0`, and `c` is reduced modulo + ``mod.n``. + +.. function:: void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, ulong n, ulong c) + + Sets `Q` and `R` to the quotient and remainder of `A` on division by `x^n - + c`. Constraints: `n` is nonzero and ``c`` is reduced modulo the modulus of + `A`. + Divisibility testing -------------------------------------------------------------------------------- diff --git a/src/nmod_poly.h b/src/nmod_poly.h index b6c33a3b72..e407fdb4ea 100644 --- a/src/nmod_poly.h +++ b/src/nmod_poly.h @@ -502,10 +502,14 @@ void nmod_poly_div_newton_n_preinv (nmod_poly_t Q, const nmod_poly_t A, const nm void _nmod_poly_divrem_newton_n_preinv (nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr Binv, slong lenBinv, nmod_t mod); void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t Binv); -ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod); +/* Division with special divisors *******************************************/ +ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod); ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c); +void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); +void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); + /* Divisibility testing *****************************************************/ int _nmod_poly_divides_classical(nn_ptr Q, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nmod_t mod); diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c new file mode 100644 index 0000000000..69e3eb3280 --- /dev/null +++ b/src/nmod_poly/divrem_xnmc.c @@ -0,0 +1,76 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +/* #include "flint.h" */ +#include "nmod_poly.h" +#include "nmod_vec.h" +#include "stdio.h" + +void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val; + + if (RQ != A) + for (j = 0; j < n; j++) + RQ[len-n+j] = A[len-n+j]; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + val = nmod_mul(RQ[i+n+j], c, mod); + RQ[i+j] = n_addmod(val, A[i+j], mod.n); + } + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + val = nmod_mul(RQ[i+n+j], c, mod); + RQ[i+j] = n_addmod(val, A[i+j], mod.n); + } + i -= n; + } +} + +void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c) +{ + const ulong len = A->length; + + if (len <= n) + { + nmod_poly_zero(Q); + nmod_poly_set(R, A); + return; + } + + nn_ptr RQ = _nmod_vec_init(len); + + /* perform division */ + _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod); + + /* copy remainder R */ + nmod_poly_fit_length(R, n); + _nmod_vec_set(R->coeffs, RQ, n); + _nmod_poly_set_length(R, n); + _nmod_poly_normalise(R); + + /* copy quotient Q */ + nmod_poly_fit_length(Q, len - n); + _nmod_vec_set(Q->coeffs, RQ + n, len - n); + _nmod_poly_set_length(Q, len - n); + + _nmod_vec_clear(RQ); +} diff --git a/src/nmod_poly/profile/p-divrem_xnmc.c b/src/nmod_poly/profile/p-divrem_xnmc.c new file mode 100644 index 0000000000..cae9fe5a23 --- /dev/null +++ b/src/nmod_poly/profile/p-divrem_xnmc.c @@ -0,0 +1,344 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "flint.h" +#include "nmod.h" +#include "profiler.h" +#include "ulong_extras.h" +#include "nmod_poly.h" +#include "nmod_vec.h" + +static inline ulong +_get_c_from_ctype(int ctype, flint_rand_t state, ulong modn) +{ + if (ctype == 1) + return 1; + else if (ctype == -1) + return -1; + else if (ctype == 0) + return 0; + else + return n_randint(state, modn); +} + +typedef struct +{ + flint_bitcnt_t bits; + slong length; + ulong n; + int ctype; /* 0: c == 0; 1: c == 1; -1: c == -1; else: general*/ +} info_t; + +void sample_divrem(void * arg, ulong count) +{ + ulong modn; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i, j; + + nmod_poly_t poly, div, quo, rem; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + + nmod_poly_init2(poly, modn, length); + _nmod_poly_set_length(poly, length); + for (j = 0; j < length; j++) + poly->coeffs[j] = n_randint(state, modn); + _nmod_poly_normalise(poly); + + c = _get_c_from_ctype(info->ctype, state, modn); + + nmod_poly_init_mod(quo, poly->mod); + nmod_poly_init_mod(rem, poly->mod); + nmod_poly_init_mod(div, poly->mod); + + /* divisor xnmc: b == x**n - c */ + nmod_poly_set_coeff_ui(div, n, 1); + nmod_poly_set_coeff_ui(div, 0, n_negmod(c, modn)); + + prof_start(); + nmod_poly_divrem(quo, rem, poly, div); + prof_stop(); + + nmod_poly_clear(quo); + nmod_poly_clear(rem); + nmod_poly_clear(poly); + nmod_poly_clear(div); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_interface(void * arg, ulong count) +{ + ulong modn; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i, j; + + nmod_poly_t poly, quo, rem; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + + nmod_poly_init2(poly, modn, length); + _nmod_poly_set_length(poly, length); + for (j = 0; j < length; j++) + poly->coeffs[j] = n_randint(state, modn); + _nmod_poly_normalise(poly); + + c = _get_c_from_ctype(info->ctype, state, modn); + + nmod_poly_init_mod(quo, poly->mod); + nmod_poly_init_mod(rem, poly->mod); + + prof_start(); + nmod_poly_divrem_xnmc(quo, rem, poly, n, c); + prof_stop(); + + nmod_poly_clear(quo); + nmod_poly_clear(rem); + nmod_poly_clear(poly); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_generic(void * arg, ulong count) +{ + ulong n; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + slong i, j; + + ulong * poly; + ulong pt; + + FLINT_TEST_INIT(state); + + poly = _nmod_vec_init(length); + + for (i = 0; i < count; i++) + { + n = n_randbits(state, bits); + if (n == UWORD(0)) n++; + + nmod_init(&mod, n); + + for (j = 0; j < length; j++) + poly[j] = n_randint(state, n); + + pt = n_randint(state, n); + + prof_start(); + for (j = 0; j < 100; j++) + _nmod_poly_evaluate_nmod(poly, length, pt, mod); + prof_stop(); + } + + _nmod_vec_clear(poly); + + FLINT_TEST_CLEAR(state); +} + +void sample_precomp(void * arg, ulong count) +{ + ulong n; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + slong i, j; + + ulong * poly; + ulong pt; + + FLINT_TEST_INIT(state); + + poly = _nmod_vec_init(length); + + for (i = 0; i < count; i++) + { + n = n_randbits(state, bits); + if (n == UWORD(0)) n++; + + nmod_init(&mod, n); + + for (j = 0; j < length; j++) + poly[j] = n_randint(state, n); + + pt = n_randint(state, n); + + prof_start(); + for (j = 0; j < 100; j++) + { + const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); + _nmod_poly_evaluate_nmod_precomp(poly, length, pt, pt_precomp, n); + } + prof_stop(); + } + + _nmod_vec_clear(poly); + + FLINT_TEST_CLEAR(state); +} + +void sample_precomp_lazy(void * arg, ulong count) +{ + ulong n; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + slong i, j; + + ulong * poly; + ulong pt; + + FLINT_TEST_INIT(state); + + poly = _nmod_vec_init(length); + + for (i = 0; i < count; i++) + { + n = n_randbits(state, bits); + if (n == UWORD(0)) n++; + + nmod_init(&mod, n); + + for (j = 0; j < length; j++) + poly[j] = n_randint(state, n); + + pt = n_randint(state, n); + + prof_start(); + for (j = 0; j < 100; j++) + { + const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); + ulong val = _nmod_poly_evaluate_nmod_precomp_lazy(poly, length, pt, pt_precomp, n); + if (val >= 2*n) + val -= 2*n; + else if (val >= n) + val -= n; + } + prof_stop(); + } + + _nmod_vec_clear(poly); + + FLINT_TEST_CLEAR(state); +} + +int main(void) +{ + const int nblengths = 17; + slong lengths[17] = {5, 10, 20, 40, 80, + 160, 320, 640, 1280, 2560, + 5120, 10240, 20480, 40960, 81920, + 163840, 327680}; + + double min, max; + double mins_divrem[nblengths]; + double mins_interface[nblengths]; + double mins_generic[nblengths]; + double mins_precomp[nblengths]; + double mins_precomp_lazy[nblengths]; + info_t info; + flint_bitcnt_t i; + + flint_printf("unit: all measurements in c/l\n"); + flint_printf("profiled: divrem | interface | generic | precomp | precomp_lazy\n"); + + for (i = 62; i <= FLINT_BITS; i++) + { + info.bits = i; + + printf("nbits = %ld\n", i); + for (int len = 0; len < nblengths; ++len) + { + info.length = lengths[len]; + info.n = FLINT_MAX(1, lengths[len] / 5); + info.ctype = 2; + + prof_repeat(&min, &max, sample_divrem, (void *) &info); + mins_divrem[len-1] = min; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface[len-1] = min; + /* prof_repeat(&min, &max, sample_generic, (void *) &info); */ + mins_generic[len-1] = min; + /* prof_repeat(&min, &max, sample_precomp, (void *) &info); */ + mins_precomp[len-1] = min; + /* prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); */ + mins_precomp_lazy[len-1] = min; + } + + if (i < FLINT_BITS-1) + { + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t%.1e", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } + } + else if (i < FLINT_BITS) + { + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t na", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } + } + else // i == FLINT_BITS + { + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t na \t na", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } + } + + flint_printf("\n"); + } + + return 0; +} diff --git a/src/nmod_poly/profile/p-evaluate_nmod.c b/src/nmod_poly/profile/p-evaluate_nmod.c index b9854c32c0..76bfcae173 100644 --- a/src/nmod_poly/profile/p-evaluate_nmod.c +++ b/src/nmod_poly/profile/p-evaluate_nmod.c @@ -43,6 +43,7 @@ void sample_interface(void * arg, ulong count) _nmod_poly_set_length(poly, length); for (j = 0; j < length; j++) poly->coeffs[j] = n_randint(state, n); + _nmod_poly_normalise(poly); pt = n_randint(state, n); diff --git a/src/nmod_poly/test/main.c b/src/nmod_poly/test/main.c index 04dc56b5e8..5872f6b213 100644 --- a/src/nmod_poly/test/main.c +++ b/src/nmod_poly/test/main.c @@ -45,6 +45,7 @@ #include "t-divrem.c" #include "t-divrem_newton_n_preinv.c" #include "t-div_root.c" +#include "t-divrem_xnmc.c" #include "t-div_series_basecase.c" #include "t-div_series.c" #include "t-equal_trunc.c" @@ -171,6 +172,7 @@ test_struct tests[] = TEST_FUNCTION(nmod_poly_divrem), TEST_FUNCTION(nmod_poly_divrem_newton_n_preinv), TEST_FUNCTION(nmod_poly_div_root), + TEST_FUNCTION(nmod_poly_divrem_xnmc), TEST_FUNCTION(nmod_poly_div_series_basecase), TEST_FUNCTION(nmod_poly_div_series), TEST_FUNCTION(nmod_poly_equal_trunc), diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c new file mode 100644 index 0000000000..8985069b0b --- /dev/null +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -0,0 +1,101 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "nmod_vec.h" +#include "test_helpers.h" +#include "ulong_extras.h" +#include "nmod_poly.h" + +TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) +{ + int i, result = 1; + + for (i = 0; i < 1000 * flint_test_multiplier(); i++) + { + nmod_poly_t a, b; + nmod_poly_t qok, rok, q1, r1; + nn_ptr qr1; + ulong modn, len, n, c; + nmod_t mod; + + /* modulus */ + modn = n_randtest_not_zero(state); /* modulus */ + nmod_init(&mod, modn); + + /* initialize polynomials */ + nmod_poly_init_mod(a, mod); + nmod_poly_init_mod(b, mod); + nmod_poly_init_mod(qok, mod); + nmod_poly_init_mod(rok, mod); + nmod_poly_init_mod(q1, mod); + nmod_poly_init_mod(r1, mod); + + /* pick parameters */ + len = 5 + n_randint(state, 100); /* poly length, [5, 105) */ + n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */ + c = n_randint(state, modn); + + /* random a of length <= len; qr1 with same coefficients */ + nmod_poly_randtest(a, state, len); + qr1 = _nmod_vec_init(a->length); + _nmod_vec_set(qr1, a->coeffs, a->length); + + /* divisor xnmc: b == x**n - c */ + nmod_poly_set_coeff_ui(b, n, 1); + nmod_poly_set_coeff_ui(b, 0, n_negmod(c, modn)); + + // correct quotient/remainder + nmod_poly_divrem(qok, rok, a, b); + + // general variant, no assumption + nmod_poly_divrem_xnmc(q1, r1, a, n, c); + + result = (nmod_poly_equal(qok, q1) && nmod_poly_equal(rok, r1)); + if (!result) + { + flint_printf("FAIL (poly 1):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, poly a :\n", n, c); + nmod_poly_print(a), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + + /* testing in-place underscore version, requires length >= n */ + if (a->length >= (slong)n) + { + _nmod_poly_divrem_xnmc(qr1, qr1, a->length, n, c, a->mod); + result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) + && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); + if (!result) + { + flint_printf("FAIL (vec 1):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); + _nmod_vec_print(qr1, a->length, mod), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + } + + nmod_poly_clear(a); + nmod_poly_clear(b); + nmod_poly_clear(qok); + nmod_poly_clear(rok); + nmod_poly_clear(q1); + nmod_poly_clear(r1); + _nmod_vec_clear(qr1); + } + + /* special cases 1 and -1 */ + + TEST_FUNCTION_END(state); +} From 523b2c301550933217c2be35b421ff6ca001373d Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 00:35:27 +0100 Subject: [PATCH 2/9] add special cases for c == -1, c == +1, c == 0 --- src/nmod_poly.h | 2 + src/nmod_poly/divrem_xnmc.c | 65 +++++++++++++++++++++++++++++- src/nmod_poly/test/t-divrem_xnmc.c | 31 +++++++++++++- 3 files changed, 95 insertions(+), 3 deletions(-) diff --git a/src/nmod_poly.h b/src/nmod_poly.h index e407fdb4ea..cbc52917d4 100644 --- a/src/nmod_poly.h +++ b/src/nmod_poly.h @@ -507,6 +507,8 @@ void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R, const nmod_p ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod); ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c); +void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); +void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c index 69e3eb3280..a0fac9c880 100644 --- a/src/nmod_poly/divrem_xnmc.c +++ b/src/nmod_poly/divrem_xnmc.c @@ -13,6 +13,57 @@ #include "nmod_poly.h" #include "nmod_vec.h" #include "stdio.h" +#include "ulong_extras.h" + +void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) +{ + /* assumes len >= n */ + slong i; + ulong j, r; + + if (RQ != A) + for (j = 0; j < n; j++) + RQ[len-n+j] = A[len-n+j]; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], mod.n); + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], mod.n); + i -= n; + } +} + +void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) +{ + /* assumes len >= n */ + slong i; + ulong j, r; + + if (RQ != A) + for (j = 0; j < n; j++) + RQ[len-n+j] = A[len-n+j]; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], mod.n); + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], mod.n); + i -= n; + } +} void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod) { @@ -56,10 +107,22 @@ void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, return; } + if (c == 0) + { + nmod_poly_set_trunc(R, A, n); + nmod_poly_shift_right(Q, A, n); + return; + } + nn_ptr RQ = _nmod_vec_init(len); /* perform division */ - _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod); + if (c == 1) + _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod); + else if (c == A->mod.n - 1) + _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod); + else + _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod); /* copy remainder R */ nmod_poly_fit_length(R, n); diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c index 8985069b0b..11260a8772 100644 --- a/src/nmod_poly/test/t-divrem_xnmc.c +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -41,7 +41,15 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) /* pick parameters */ len = 5 + n_randint(state, 100); /* poly length, [5, 105) */ n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */ - c = n_randint(state, modn); + + if (i < 20) + c = 1; + else if (i < 40) + c = modn - 1; + else if (i < 60) + c = 0; + else + c = n_randint(state, modn); /* random a of length <= len; qr1 with same coefficients */ nmod_poly_randtest(a, state, len); @@ -73,6 +81,7 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) if (a->length >= (slong)n) { _nmod_poly_divrem_xnmc(qr1, qr1, a->length, n, c, a->mod); + result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); if (!result) @@ -80,10 +89,28 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) flint_printf("FAIL (vec 1):\n"); flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); - _nmod_vec_print(qr1, a->length, mod), flint_printf("\n\n"); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); fflush(stdout); flint_abort(); } + + if (a->length >= (slong)n && (c == 1 || c == modn-1)) + { + if (c == 1) + _nmod_poly_divrem_xnm1(a->coeffs, a->coeffs, a->length, n, a->mod); + else if (c == modn-1) + _nmod_poly_divrem_xnp1(a->coeffs, a->coeffs, a->length, n, a->mod); + + result = _nmod_vec_equal(qr1, a->coeffs, a->length); + if (!result) + { + flint_printf("FAIL (vec 1 - special):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu\n", n, c); + fflush(stdout); + flint_abort(); + } + } } nmod_poly_clear(a); From 33b5bedd7ec34d2751bec2916807e5bbdfca3d8e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 00:44:40 +0100 Subject: [PATCH 3/9] profile: add special cases for c == -1, c == +1, c == 0 --- src/nmod_poly/profile/p-divrem_xnmc.c | 112 +++++++++++++------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/src/nmod_poly/profile/p-divrem_xnmc.c b/src/nmod_poly/profile/p-divrem_xnmc.c index cae9fe5a23..92c8478a5c 100644 --- a/src/nmod_poly/profile/p-divrem_xnmc.c +++ b/src/nmod_poly/profile/p-divrem_xnmc.c @@ -19,12 +19,10 @@ static inline ulong _get_c_from_ctype(int ctype, flint_rand_t state, ulong modn) { - if (ctype == 1) + if (ctype == 0) + return modn - 1; + else if (ctype == 1) return 1; - else if (ctype == -1) - return -1; - else if (ctype == 0) - return 0; else return n_randint(state, modn); } @@ -34,7 +32,7 @@ typedef struct flint_bitcnt_t bits; slong length; ulong n; - int ctype; /* 0: c == 0; 1: c == 1; -1: c == -1; else: general*/ + int ctype; /* 0: c == -1; 1: c == 1; else: general*/ } info_t; void sample_divrem(void * arg, ulong count) @@ -271,69 +269,71 @@ int main(void) info_t info; flint_bitcnt_t i; - flint_printf("unit: all measurements in c/l\n"); flint_printf("profiled: divrem | interface | generic | precomp | precomp_lazy\n"); for (i = 62; i <= FLINT_BITS; i++) { info.bits = i; - printf("nbits = %ld\n", i); - for (int len = 0; len < nblengths; ++len) - { - info.length = lengths[len]; - info.n = FLINT_MAX(1, lengths[len] / 5); - info.ctype = 2; - - prof_repeat(&min, &max, sample_divrem, (void *) &info); - mins_divrem[len-1] = min; - prof_repeat(&min, &max, sample_interface, (void *) &info); - mins_interface[len-1] = min; - /* prof_repeat(&min, &max, sample_generic, (void *) &info); */ - mins_generic[len-1] = min; - /* prof_repeat(&min, &max, sample_precomp, (void *) &info); */ - mins_precomp[len-1] = min; - /* prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); */ - mins_precomp_lazy[len-1] = min; - } - - if (i < FLINT_BITS-1) + for (int ct = 0; ct <= 2; ct++) { + info.ctype = ct; + printf("nbits = %ld, c type = %d\n", i, ct); for (int len = 0; len < nblengths; ++len) { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t%.1e", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); + info.length = lengths[len]; + info.n = FLINT_MAX(1, lengths[len] / 5); + + prof_repeat(&min, &max, sample_divrem, (void *) &info); + mins_divrem[len-1] = min; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface[len-1] = min; + /* prof_repeat(&min, &max, sample_generic, (void *) &info); */ + mins_generic[len-1] = min; + /* prof_repeat(&min, &max, sample_precomp, (void *) &info); */ + mins_precomp[len-1] = min; + /* prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); */ + mins_precomp_lazy[len-1] = min; } - } - else if (i < FLINT_BITS) - { - for (int len = 0; len < nblengths; ++len) + + if (i < FLINT_BITS-1) { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t na", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t%.1e", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } } - } - else // i == FLINT_BITS - { - for (int len = 0; len < nblengths; ++len) + else if (i < FLINT_BITS) + { + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t na", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } + } + else // i == FLINT_BITS { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t na \t na", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); + for (int len = 0; len < nblengths; ++len) + { + flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t na \t na", + lengths[len], + mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, + mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); + flint_printf("\n"); + } } } From f0558bb24dc784d9758983f3ad2a9b77ed9ef705 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 11:39:43 +0100 Subject: [PATCH 4/9] add variants with precomputation when modulus is small enough, tested --- doc/source/nmod_poly.rst | 16 +-- src/nmod_poly.h | 2 + src/nmod_poly/divrem_xnmc.c | 154 ++++++++++++++++++++++++++++- src/nmod_poly/evaluate_nmod.c | 8 +- src/nmod_poly/test/t-divrem_xnmc.c | 76 +++++++++++--- 5 files changed, 226 insertions(+), 30 deletions(-) diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst index 24989705ba..93b4c304c8 100644 --- a/doc/source/nmod_poly.rst +++ b/doc/source/nmod_poly.rst @@ -1338,14 +1338,14 @@ Evaluation .. function:: ulong _nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong n) Evaluates ``poly`` at the value ``c`` modulo ``n``, with lazy reductions - modulo `n`. Precisely, if all coefficients of ``poly`` are less than `m`, - the input requirement is `m \le 2^{\mathtt{FLINT\_BITS}} - 2n + 1`, and the - output value is in `[0, m+2n-1)` and equal to the sought evaluation modulo - `n`. In particular the coefficients of ``poly`` need not be reduced modulo - ``n``, and the output may not be either. However, the value ``c`` should be - reduced modulo `n`. - - In the case where `m = n` (coefficients of ``poly`` are reduced modulo + modulo `n`. Precisely, if all coefficients of ``poly`` are less than or + equal to `m`, the input requirement is `m + 2n \le + 2^{\mathtt{FLINT\_BITS}}`, and the output value is in `[0, m+2n)` and equal + to the sought evaluation modulo `n`. In particular the coefficients of + ``poly`` need not be reduced modulo ``n``, and the output may not be + either. However, the value ``c`` should be reduced modulo `n`. + + In the case where `m = n-1` (coefficients of ``poly`` are reduced modulo `n`), then the above leads to the requirement `3n-1 \le 2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits, and `n \le 1431655765` for 32 bits), and reducing the output just amounts diff --git a/src/nmod_poly.h b/src/nmod_poly.h index cbc52917d4..f2d0aedafe 100644 --- a/src/nmod_poly.h +++ b/src/nmod_poly.h @@ -510,6 +510,8 @@ ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c); void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); +void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); +void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); /* Divisibility testing *****************************************************/ diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c index a0fac9c880..9eb605a6d1 100644 --- a/src/nmod_poly/divrem_xnmc.c +++ b/src/nmod_poly/divrem_xnmc.c @@ -12,9 +12,9 @@ /* #include "flint.h" */ #include "nmod_poly.h" #include "nmod_vec.h" -#include "stdio.h" #include "ulong_extras.h" +/* division by x**n - 1 */ void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) { /* assumes len >= n */ @@ -40,6 +40,7 @@ void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t m } } +/* division by x**n + 1 */ void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) { /* assumes len >= n */ @@ -65,6 +66,7 @@ void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t m } } +/* division by x**n - c, general variant */ void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod) { /* assumes len >= n */ @@ -96,6 +98,81 @@ void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, } } +/* division by x**n - c, with precomputation on c */ +/* constraint: modn < 2**(FLINT_BITS-1) */ +void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val; + + if (RQ != A) + for (j = 0; j < n; j++) + RQ[len-n+j] = A[len-n+j]; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn); + RQ[i+j] = n_addmod(val, A[i+j], modn); + } + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn); + RQ[i+j] = n_addmod(val, A[i+j], modn); + } + i -= n; + } +} + +/* division by x**n - c, lazy with precomputation */ +/* constraint: max(A) + 2*modn <= 2**FLINT_BITS */ +/* coeff bounds: in [0, max(A)] | out [0, max(A) + 2*modn) */ +void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val, p_hi, p_lo; + + if (RQ != A) + for (j = 0; j < n; j++) + RQ[len-n+j] = A[len-n+j]; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + /* computes either val = (c*val mod n) or val = (c*val mod n) + n */ + val = RQ[i+n+j]; + umul_ppmm(p_hi, p_lo, c_precomp, val); + val = c * val - p_hi * modn; + /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */ + RQ[i+j] = val + A[i+j]; + } + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + /* computes either val = (c*val mod n) or val = (c*val mod n) + n */ + val = RQ[i+n+j]; + umul_ppmm(p_hi, p_lo, c_precomp, val); + val = c * val - p_hi * modn; + /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */ + RQ[i+j] = val + A[i+j]; + } + i -= n; + } +} + void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c) { const ulong len = A->length; @@ -114,25 +191,94 @@ void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, return; } + int lazy = 0; nn_ptr RQ = _nmod_vec_init(len); /* perform division */ if (c == 1) _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod); + else if (c == A->mod.n - 1) _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod); - else + + /* if degree below the n_mulmod_shoup threshold, */ + /* or if modulus forbids n_mulmod_shoup usage, use general */ +#if FLINT_MULMOD_SHOUP_THRESHOLD <= 2 + else if (A->mod.norm == 0) /* here A->length >= threshold */ +#else + else if ((A->length < FLINT_MULMOD_SHOUP_THRESHOLD) + || (A->mod.norm == 0)) +#endif + { _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod); + } + + else + { + const ulong modn = A->mod.n; + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + + /* if 3*mod.n - 1 <= 2**FLINT_BITS, use precomp+lazy variant */ +#if FLINT_BITS == 64 + if (modn <= UWORD(6148914691236517205)) +#else /* FLINT_BITS == 32 */ + if (modn <= UWORD(1431655765)) +#endif + { + lazy = 1; + _nmod_poly_divrem_xnmc_precomp_lazy(RQ, A->coeffs, len, n, c, c_precomp, modn); + } + + /* use n_mulmod_shoup, non-lazy variant */ + else + { + _nmod_poly_divrem_xnmc_precomp(RQ, A->coeffs, len, n, c, c_precomp, modn); + } + } /* copy remainder R */ nmod_poly_fit_length(R, n); - _nmod_vec_set(R->coeffs, RQ, n); + if (lazy) + { + /* correct excess */ + const ulong modn = A->mod.n; + for (ulong i = 0; i < n; i++) + { + ulong v = RQ[i]; + if (v >= 2*modn) + v -= 2*modn; + else if (v >= modn) + v -= modn; + R->coeffs[i] = v; + } + } + else + { + _nmod_vec_set(R->coeffs, RQ, n); + } _nmod_poly_set_length(R, n); _nmod_poly_normalise(R); /* copy quotient Q */ nmod_poly_fit_length(Q, len - n); - _nmod_vec_set(Q->coeffs, RQ + n, len - n); + if (lazy) + { + /* correct excess */ + const ulong modn = A->mod.n; + for (ulong i = 0; i < len - n; i++) + { + ulong v = RQ[n+i]; + if (v >= 2*modn) + v -= 2*modn; + else if (v >= modn) + v -= modn; + Q->coeffs[i] = v; + } + } + else + { + _nmod_vec_set(Q->coeffs, RQ + n, len - n); + } _nmod_poly_set_length(Q, len - n); _nmod_vec_clear(RQ); diff --git a/src/nmod_poly/evaluate_nmod.c b/src/nmod_poly/evaluate_nmod.c index 5e843005da..2dda5d3a87 100644 --- a/src/nmod_poly/evaluate_nmod.c +++ b/src/nmod_poly/evaluate_nmod.c @@ -119,20 +119,20 @@ nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c) return _nmod_poly_evaluate_nmod(poly->coeffs, poly->length, c, poly->mod); } - // if 3*mod.n - 1 <= 2**FLINT_BITS, use n_mulmod_shoup, lazy variant else { const ulong modn = poly->mod.n; + /* if 3*mod.n - 1 <= 2**FLINT_BITS, use n_mulmod_shoup, lazy variant */ #if FLINT_BITS == 64 if (modn <= UWORD(6148914691236517205)) -#else // FLINT_BITS == 32 +#else /* FLINT_BITS == 32 */ if (modn <= UWORD(1431655765)) #endif { const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); ulong val = _nmod_poly_evaluate_nmod_precomp_lazy(poly->coeffs, poly->length, c, c_precomp, modn); - // correct excess + /* correct excess */ if (val >= 2*modn) val -= 2*modn; else if (val >= modn) @@ -140,7 +140,7 @@ nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c) return val; } - // use n_mulmod_shoup, non-lazy variant + /* use n_mulmod_shoup, non-lazy variant */ else { const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c index 11260a8772..b500a0314b 100644 --- a/src/nmod_poly/test/t-divrem_xnmc.c +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -9,6 +9,7 @@ (at your option) any later version. See . */ +#include "nmod.h" #include "nmod_vec.h" #include "test_helpers.h" #include "ulong_extras.h" @@ -27,7 +28,14 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) nmod_t mod; /* modulus */ - modn = n_randtest_not_zero(state); /* modulus */ + if (i < 128) + modn = n_randtest_bits(state, FLINT_BITS); + else if (i < 256) + modn = n_randtest_bits(state, FLINT_BITS-1); + else if (i < 384) + modn = n_randtest_bits(state, FLINT_BITS-2); + else + modn = n_randtest_not_zero(state); nmod_init(&mod, modn); /* initialize polynomials */ @@ -42,11 +50,11 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) len = 5 + n_randint(state, 100); /* poly length, [5, 105) */ n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */ - if (i < 20) + if (i % 16 == 0 && modn > 1) c = 1; - else if (i < 40) + else if (i % 16 == 1) c = modn - 1; - else if (i < 60) + else if (i % 16 == 2) c = 0; else c = n_randint(state, modn); @@ -63,7 +71,7 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) // correct quotient/remainder nmod_poly_divrem(qok, rok, a, b); - // general variant, no assumption + // general interface nmod_poly_divrem_xnmc(q1, r1, a, n, c); result = (nmod_poly_equal(qok, q1) && nmod_poly_equal(rok, r1)); @@ -80,13 +88,14 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) /* testing in-place underscore version, requires length >= n */ if (a->length >= (slong)n) { + /* general function */ _nmod_poly_divrem_xnmc(qr1, qr1, a->length, n, c, a->mod); result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); if (!result) { - flint_printf("FAIL (vec 1):\n"); + flint_printf("FAIL (vec):\n"); flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); @@ -94,19 +103,60 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) flint_abort(); } - if (a->length >= (slong)n && (c == 1 || c == modn-1)) + /* special cases 1, -1 */ + _nmod_vec_set(qr1, a->coeffs, a->length); + if (c == 1 || c == modn - 1) { if (c == 1) - _nmod_poly_divrem_xnm1(a->coeffs, a->coeffs, a->length, n, a->mod); + _nmod_poly_divrem_xnm1(qr1, qr1, a->length, n, a->mod); else if (c == modn-1) - _nmod_poly_divrem_xnp1(a->coeffs, a->coeffs, a->length, n, a->mod); + _nmod_poly_divrem_xnp1(qr1, qr1, a->length, n, a->mod); - result = _nmod_vec_equal(qr1, a->coeffs, a->length); + result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) + && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); if (!result) { - flint_printf("FAIL (vec 1 - special):\n"); + flint_printf("FAIL (vec, +1 / - 1):\n"); flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); - flint_printf("n = %wu, c = %wu\n", n, c); + flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + } + + /* cases using precomp */ + _nmod_vec_set(qr1, a->coeffs, a->length); + if (NMOD_CAN_USE_SHOUP(mod)) + { + ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + +#if FLINT_BITS == 64 + if (n <= UWORD(6148914691236517205)) +#else // FLINT_BITS == 32 + if (n <= UWORD(1431655765)) +#endif + { + _nmod_poly_divrem_xnmc_precomp_lazy(qr1, qr1, a->length, n, c, c_precomp, modn); + for (long ii = 0; ii < a->length; ii++) + { + if (qr1[ii] >= 2*modn) + qr1[ii] -= 2*modn; + else if (qr1[ii] >= modn) + qr1[ii] -= modn; + } + } + else + _nmod_poly_divrem_xnmc_precomp(qr1, qr1, a->length, n, c, c_precomp, modn); + + result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) + && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); + if (!result) + { + flint_printf("FAIL (vec, precomp):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); fflush(stdout); flint_abort(); } @@ -122,7 +172,5 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) _nmod_vec_clear(qr1); } - /* special cases 1 and -1 */ - TEST_FUNCTION_END(state); } From d535b580be43d9cf4074316145658de7f052bdb3 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 12:39:36 +0100 Subject: [PATCH 5/9] update profile file --- src/nmod_poly/profile/p-divrem_xnmc.c | 216 +++++++++++--------------- 1 file changed, 93 insertions(+), 123 deletions(-) diff --git a/src/nmod_poly/profile/p-divrem_xnmc.c b/src/nmod_poly/profile/p-divrem_xnmc.c index 92c8478a5c..6afde3a30f 100644 --- a/src/nmod_poly/profile/p-divrem_xnmc.c +++ b/src/nmod_poly/profile/p-divrem_xnmc.c @@ -127,149 +127,135 @@ void sample_interface(void * arg, ulong count) void sample_generic(void * arg, ulong count) { - ulong n; + ulong modn; nmod_t mod; info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + ulong n = info->n; + ulong c; + slong i, j; - ulong * poly; - ulong pt; + nn_ptr qr; FLINT_TEST_INIT(state); - poly = _nmod_vec_init(length); - - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { - n = n_randbits(state, bits); - if (n == UWORD(0)) n++; - - nmod_init(&mod, n); + modn = n_randprime(state, bits, 1); + nmod_init(&mod, modn); + qr = _nmod_vec_init(length); for (j = 0; j < length; j++) - poly[j] = n_randint(state, n); + qr[j] = n_randint(state, modn); - pt = n_randint(state, n); + c = _get_c_from_ctype(info->ctype, state, modn); prof_start(); - for (j = 0; j < 100; j++) - _nmod_poly_evaluate_nmod(poly, length, pt, mod); + _nmod_poly_divrem_xnmc(qr, qr, length, n, c, mod); prof_stop(); - } - _nmod_vec_clear(poly); + _nmod_vec_clear(qr); + } FLINT_TEST_CLEAR(state); } void sample_precomp(void * arg, ulong count) { - ulong n; - nmod_t mod; + ulong modn; info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + ulong n = info->n; + ulong c; + slong i, j; - ulong * poly; - ulong pt; + nn_ptr qr; FLINT_TEST_INIT(state); - poly = _nmod_vec_init(length); - - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { - n = n_randbits(state, bits); - if (n == UWORD(0)) n++; - - nmod_init(&mod, n); + modn = n_randprime(state, bits, 1); + qr = _nmod_vec_init(length); for (j = 0; j < length; j++) - poly[j] = n_randint(state, n); + qr[j] = n_randint(state, modn); - pt = n_randint(state, n); + c = _get_c_from_ctype(info->ctype, state, modn); prof_start(); - for (j = 0; j < 100; j++) - { - const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); - _nmod_poly_evaluate_nmod_precomp(poly, length, pt, pt_precomp, n); - } + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + _nmod_poly_divrem_xnmc_precomp(qr, qr, length, n, c, c_precomp, modn); prof_stop(); - } - _nmod_vec_clear(poly); + _nmod_vec_clear(qr); + } FLINT_TEST_CLEAR(state); } void sample_precomp_lazy(void * arg, ulong count) { - ulong n; - nmod_t mod; + ulong modn; info_t * info = (info_t *) arg; flint_bitcnt_t bits = info->bits; slong length = info->length; + ulong n = info->n; + ulong c; + slong i, j; - ulong * poly; - ulong pt; + nn_ptr qr; FLINT_TEST_INIT(state); - poly = _nmod_vec_init(length); - - for (i = 0; i < count; i++) + for (i = 0; i < (slong)count; i++) { - n = n_randbits(state, bits); - if (n == UWORD(0)) n++; - - nmod_init(&mod, n); + modn = n_randprime(state, bits, 1); + qr = _nmod_vec_init(length); for (j = 0; j < length; j++) - poly[j] = n_randint(state, n); + qr[j] = n_randint(state, modn); - pt = n_randint(state, n); + c = _get_c_from_ctype(info->ctype, state, modn); prof_start(); - for (j = 0; j < 100; j++) - { - const ulong pt_precomp = n_mulmod_precomp_shoup(pt, n); - ulong val = _nmod_poly_evaluate_nmod_precomp_lazy(poly, length, pt, pt_precomp, n); - if (val >= 2*n) - val -= 2*n; - else if (val >= n) - val -= n; - } + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + _nmod_poly_divrem_xnmc_precomp_lazy(qr, qr, length, n, c, c_precomp, modn); prof_stop(); - } - _nmod_vec_clear(poly); + _nmod_vec_clear(qr); + } FLINT_TEST_CLEAR(state); } int main(void) { - const int nblengths = 17; - slong lengths[17] = {5, 10, 20, 40, 80, - 160, 320, 640, 1280, 2560, - 5120, 10240, 20480, 40960, 81920, - 163840, 327680}; + const int nblengths = 15; + slong lengths[15] = {5, 40, 80, 160, 320, + 640, 1280, 2560, 5120, 10240, + 20480, 40960, 81920, 163840, 327680}; double min, max; - double mins_divrem[nblengths]; - double mins_interface[nblengths]; - double mins_generic[nblengths]; - double mins_precomp[nblengths]; - double mins_precomp_lazy[nblengths]; + double mins_divrem; + double mins_interface; + double mins_generic; + double mins_precomp; + double mins_precomp_lazy; info_t info; flint_bitcnt_t i; - flint_printf("profiled: divrem | interface | generic | precomp | precomp_lazy\n"); + flint_printf("profiled:\n"); + flint_printf("\t1. divrem\n\t2. poly interface\n\t3. vec general\n"); + flint_printf("\t4. vec precomp\n\t5. precomp_lazy (no excess correction)\n"); + flint_printf("column c: 0 -> c == -1, 1 -> c == 1, 2 -> c general\n"); + flint_printf("note: variant 2. expected to be faster than variants 3.4.5. when c == 1 or c == -1\n\n"); + flint_printf("bit c len n 1. 2. 3. 4. 5.\n"); for (i = 62; i <= FLINT_BITS; i++) { @@ -278,66 +264,50 @@ int main(void) for (int ct = 0; ct <= 2; ct++) { info.ctype = ct; - printf("nbits = %ld, c type = %d\n", i, ct); for (int len = 0; len < nblengths; ++len) { info.length = lengths[len]; - info.n = FLINT_MAX(1, lengths[len] / 5); - - prof_repeat(&min, &max, sample_divrem, (void *) &info); - mins_divrem[len-1] = min; - prof_repeat(&min, &max, sample_interface, (void *) &info); - mins_interface[len-1] = min; - /* prof_repeat(&min, &max, sample_generic, (void *) &info); */ - mins_generic[len-1] = min; - /* prof_repeat(&min, &max, sample_precomp, (void *) &info); */ - mins_precomp[len-1] = min; - /* prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); */ - mins_precomp_lazy[len-1] = min; - } - if (i < FLINT_BITS-1) - { - for (int len = 0; len < nblengths; ++len) + info.n = 1; + while (info.n < (ulong)lengths[len]) { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t%.1e", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp_lazy[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); - } - } - else if (i < FLINT_BITS) - { - for (int len = 0; len < nblengths; ++len) - { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t%.1e\t na", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_precomp[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); - } - } - else // i == FLINT_BITS - { - for (int len = 0; len < nblengths; ++len) - { - flint_printf(" len %ld\t%.1e\t%.1e\t%.1e\t na \t na", - lengths[len], - mins_divrem[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_interface[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR, - mins_generic[len-1]/(double)FLINT_CLOCK_SCALE_FACTOR); - flint_printf("\n"); + prof_repeat(&min, &max, sample_divrem, (void *) &info); + mins_divrem = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_generic, (void *) &info); + mins_generic = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_precomp, (void *) &info); + mins_precomp = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); + mins_precomp_lazy = min/(double)FLINT_CLOCK_SCALE_FACTOR; + + if (i < FLINT_BITS-1) + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e", + info.bits, info.ctype, info.length, info.n, + mins_divrem, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy); + flint_printf("\n"); + } + else if (i < FLINT_BITS) + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na", + info.bits, info.ctype, info.length, info.n, + mins_divrem, mins_interface, mins_generic, mins_precomp); + flint_printf("\n"); + } + else /* i == FLINT_BITS */ + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e na na", + info.bits, info.ctype, info.length, info.n, + mins_divrem, mins_interface, mins_generic); + flint_printf("\n"); + } + + info.n = 5 * info.n; } } } - - flint_printf("\n"); } return 0; From 7ac03bb33c01ed54053a65a85243720e87f2821b Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 14:03:39 +0100 Subject: [PATCH 6/9] small fixes and update documentation --- doc/source/nmod_poly.rst | 38 ++++++++++++++++++++++++++++-- src/nmod_poly.h | 4 ++-- src/nmod_poly/divrem_xnmc.c | 16 ++++++------- src/nmod_poly/test/t-divrem_xnmc.c | 4 ++-- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst index 93b4c304c8..7f7a7b682b 100644 --- a/doc/source/nmod_poly.rst +++ b/doc/source/nmod_poly.rst @@ -1242,7 +1242,7 @@ Division with special divisors Sets `Q` to the quotient of `A` on division by `(x - c)`, and returns the remainder, equal to the value of `A` evaluated at `c`. -.. function:: void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); +.. function:: void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod) Sets the first `n` coefficients of ``RQ`` to the remainder of ``(A, len)`` on division by `x^n - c`, and the next ``len - n`` @@ -1252,11 +1252,45 @@ Division with special divisors Constraints: `len \ge n > 0`, and `c` is reduced modulo ``mod.n``. +.. function:: void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) + + Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = 1`, + that is, divisor is `x^n - 1`. + +.. function:: void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) + + Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = -1`, + that is, divisor is `x^n + 1`. + +.. function:: void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) + + Identical to :func:`_nmod_poly_divrem_xnmc` but exploiting the precomputed + ``c_precomp`` obtained via :func:`n_mulmod_precomp_shoup` for faster + modular multiplications by `c`. Additional constraint: the modulus ``modn`` + must be less than `2^{\mathtt{FLINT\_BITS} - 1}`. + +.. function:: void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) + + Identical to :func:`_nmod_poly_divrem_xnmc_precomp` but handling modular + reductions lazily. Precisely, if all coefficients of `A` are less than or + equal to `m`, the input requirement is `m + 2n \le + 2^{\mathtt{FLINT\_BITS}}`, and the output coefficients of ``RQ`` are in + `[0, m+2n)` and equal to the sought values modulo `n`. In particular the + coefficients of `A` need not be reduced modulo ``n``, and the output may + not be either. However, the value ``c`` should be reduced modulo `n`. + + In the case where `m = n-1` (coefficients of `A` are reduced modulo + `n`), then the above leads to the requirement `3n-1 \le + 2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits, + and `n \le 1431655765` for 32 bits), and reducing the output coefficients + just amounts to subtracting either `n` or `2n` to each of them. + .. function:: void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, ulong n, ulong c) Sets `Q` and `R` to the quotient and remainder of `A` on division by `x^n - c`. Constraints: `n` is nonzero and ``c`` is reduced modulo the modulus of - `A`. + `A`. Incorporates specialized code for the cases `c \in \{-1,0,1\}`, and + calls variants with precomputation on `c` when possible. Divisibility testing diff --git a/src/nmod_poly.h b/src/nmod_poly.h index f2d0aedafe..b445fbb541 100644 --- a/src/nmod_poly.h +++ b/src/nmod_poly.h @@ -507,9 +507,9 @@ void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R, const nmod_p ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod); ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c); -void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); -void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod); void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); +void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn); +void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn); void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c index 9eb605a6d1..de9612c475 100644 --- a/src/nmod_poly/divrem_xnmc.c +++ b/src/nmod_poly/divrem_xnmc.c @@ -15,7 +15,7 @@ #include "ulong_extras.h" /* division by x**n - 1 */ -void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) +void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn) { /* assumes len >= n */ slong i; @@ -29,19 +29,19 @@ void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t m i = len - r - n; /* multiple of n, >= 0 by assumption */ for (j = 0; j < r; j++) - RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], mod.n); + RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn); i -= n; while (i >= 0) { for (j = 0; j < n; j++) - RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], mod.n); + RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn); i -= n; } } /* division by x**n + 1 */ -void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod) +void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn) { /* assumes len >= n */ slong i; @@ -55,13 +55,13 @@ void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t m i = len - r - n; /* multiple of n, >= 0 by assumption */ for (j = 0; j < r; j++) - RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], mod.n); + RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn); i -= n; while (i >= 0) { for (j = 0; j < n; j++) - RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], mod.n); + RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn); i -= n; } } @@ -196,10 +196,10 @@ void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, /* perform division */ if (c == 1) - _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod); + _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod.n); else if (c == A->mod.n - 1) - _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod); + _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod.n); /* if degree below the n_mulmod_shoup threshold, */ /* or if modulus forbids n_mulmod_shoup usage, use general */ diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c index b500a0314b..72b976c41c 100644 --- a/src/nmod_poly/test/t-divrem_xnmc.c +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -108,9 +108,9 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) if (c == 1 || c == modn - 1) { if (c == 1) - _nmod_poly_divrem_xnm1(qr1, qr1, a->length, n, a->mod); + _nmod_poly_divrem_xnm1(qr1, qr1, a->length, n, a->mod.n); else if (c == modn-1) - _nmod_poly_divrem_xnp1(qr1, qr1, a->length, n, a->mod); + _nmod_poly_divrem_xnp1(qr1, qr1, a->length, n, a->mod.n); result = (_nmod_vec_equal(qr1, rok->coeffs, rok->length) && _nmod_vec_is_zero(qr1 + rok->length, n - rok->length) && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); From 4bf1d0af4584e24cad4eb638a9f338b7973eb56f Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 2 Nov 2025 14:30:12 +0100 Subject: [PATCH 7/9] fix typo in test file --- src/nmod_poly/test/t-divrem_xnmc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c index 72b976c41c..7b8e41dc2a 100644 --- a/src/nmod_poly/test/t-divrem_xnmc.c +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -130,13 +130,15 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) if (NMOD_CAN_USE_SHOUP(mod)) { ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + int lazy = 0; #if FLINT_BITS == 64 - if (n <= UWORD(6148914691236517205)) + if (modn <= UWORD(6148914691236517205)) #else // FLINT_BITS == 32 - if (n <= UWORD(1431655765)) + if (modn <= UWORD(1431655765)) #endif { + lazy = 1; _nmod_poly_divrem_xnmc_precomp_lazy(qr1, qr1, a->length, n, c, c_precomp, modn); for (long ii = 0; ii < a->length; ii++) { @@ -153,7 +155,10 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) && _nmod_vec_equal(qr1+n, qok->coeffs, qok->length)); if (!result) { - flint_printf("FAIL (vec, precomp):\n"); + if (lazy) + flint_printf("FAIL (vec, precomp lazy):\n"); + else + flint_printf("FAIL (vec, precomp):\n"); flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); flint_printf("n = %wu, c = %wu, input vec qr :\n", n, c); _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); From eca69f8414714b97682e7c41a12018aa6565451b Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 3 Nov 2025 16:27:49 +0100 Subject: [PATCH 8/9] add rem with special divisor, along with tests and profile --- src/nmod_poly.h | 7 + src/nmod_poly/divrem_xnmc.c | 1 - src/nmod_poly/profile/p-rem_xnmc.c | 317 +++++++++++++++++++++++++++++ src/nmod_poly/rem_xnmc.c | 250 +++++++++++++++++++++++ src/nmod_poly/test/main.c | 2 + src/nmod_poly/test/t-divrem_xnmc.c | 10 +- src/nmod_poly/test/t-rem_xnmc.c | 174 ++++++++++++++++ 7 files changed, 758 insertions(+), 3 deletions(-) create mode 100644 src/nmod_poly/profile/p-rem_xnmc.c create mode 100644 src/nmod_poly/rem_xnmc.c create mode 100644 src/nmod_poly/test/t-rem_xnmc.c diff --git a/src/nmod_poly.h b/src/nmod_poly.h index b445fbb541..c963a4ab65 100644 --- a/src/nmod_poly.h +++ b/src/nmod_poly.h @@ -514,6 +514,13 @@ void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); +void _nmod_poly_rem_xnmc(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod); +void _nmod_poly_rem_xnm1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn); +void _nmod_poly_rem_xnp1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn); +void _nmod_poly_rem_xnmc_precomp(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); +void _nmod_poly_rem_xnmc_precomp_lazy(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn); +void nmod_poly_rem_xnmc(nmod_poly_t R, nmod_poly_t A, ulong n, ulong c); + /* Divisibility testing *****************************************************/ int _nmod_poly_divides_classical(nn_ptr Q, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nmod_t mod); diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c index de9612c475..138cf5af54 100644 --- a/src/nmod_poly/divrem_xnmc.c +++ b/src/nmod_poly/divrem_xnmc.c @@ -9,7 +9,6 @@ (at your option) any later version. See . */ -/* #include "flint.h" */ #include "nmod_poly.h" #include "nmod_vec.h" #include "ulong_extras.h" diff --git a/src/nmod_poly/profile/p-rem_xnmc.c b/src/nmod_poly/profile/p-rem_xnmc.c new file mode 100644 index 0000000000..c702ccce40 --- /dev/null +++ b/src/nmod_poly/profile/p-rem_xnmc.c @@ -0,0 +1,317 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "flint.h" +#include "nmod.h" +#include "profiler.h" +#include "ulong_extras.h" +#include "nmod_poly.h" +#include "nmod_vec.h" + +static inline ulong +_get_c_from_ctype(int ctype, flint_rand_t state, ulong modn) +{ + if (ctype == 0) + return modn - 1; + else if (ctype == 1) + return 1; + else + return n_randint(state, modn); +} + +typedef struct +{ + flint_bitcnt_t bits; + slong length; + ulong n; + int ctype; /* 0: c == -1; 1: c == 1; else: general*/ +} info_t; + +void sample_rem(void * arg, ulong count) +{ + ulong modn; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i, j; + + nmod_poly_t poly, div, rem; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + + nmod_poly_init2(poly, modn, length); + _nmod_poly_set_length(poly, length); + for (j = 0; j < length; j++) + poly->coeffs[j] = n_randint(state, modn); + _nmod_poly_normalise(poly); + + c = _get_c_from_ctype(info->ctype, state, modn); + + nmod_poly_init_mod(rem, poly->mod); + nmod_poly_init_mod(div, poly->mod); + + /* divisor xnmc: b == x**n - c */ + nmod_poly_set_coeff_ui(div, n, 1); + nmod_poly_set_coeff_ui(div, 0, n_negmod(c, modn)); + + prof_start(); + nmod_poly_rem(rem, poly, div); + prof_stop(); + + nmod_poly_clear(rem); + nmod_poly_clear(poly); + nmod_poly_clear(div); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_interface(void * arg, ulong count) +{ + ulong modn; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i, j; + + nmod_poly_t poly, rem; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + + nmod_poly_init2(poly, modn, length); + _nmod_poly_set_length(poly, length); + for (j = 0; j < length; j++) + poly->coeffs[j] = n_randint(state, modn); + _nmod_poly_normalise(poly); + + c = _get_c_from_ctype(info->ctype, state, modn); + + nmod_poly_init_mod(rem, poly->mod); + + prof_start(); + nmod_poly_rem_xnmc(rem, poly, n, c); + prof_stop(); + + nmod_poly_clear(rem); + nmod_poly_clear(poly); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_generic(void * arg, ulong count) +{ + ulong modn; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i; + + nn_ptr r, a; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + nmod_init(&mod, modn); + + r = _nmod_vec_init(n); + a = _nmod_vec_init(length); + _nmod_vec_rand(a, state, length, mod); + + c = _get_c_from_ctype(info->ctype, state, modn); + + prof_start(); + _nmod_poly_rem_xnmc(r, a, length, n, c, mod); + prof_stop(); + + _nmod_vec_clear(r); + _nmod_vec_clear(a); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_precomp(void * arg, ulong count) +{ + ulong modn; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i; + + nn_ptr r, a; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + nmod_init(&mod, modn); + + r = _nmod_vec_init(n); + a = _nmod_vec_init(length); + _nmod_vec_rand(a, state, length, mod); + + c = _get_c_from_ctype(info->ctype, state, modn); + + prof_start(); + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + _nmod_poly_rem_xnmc_precomp(r, a, length, n, c, c_precomp, modn); + prof_stop(); + + _nmod_vec_clear(a); + _nmod_vec_clear(r); + } + + FLINT_TEST_CLEAR(state); +} + +void sample_precomp_lazy(void * arg, ulong count) +{ + ulong modn; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i; + + nn_ptr r, a; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + nmod_init(&mod, modn); + + r = _nmod_vec_init(n); + a = _nmod_vec_init(length); + _nmod_vec_rand(a, state, length, mod); + + c = _get_c_from_ctype(info->ctype, state, modn); + + prof_start(); + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + _nmod_poly_rem_xnmc_precomp_lazy(r, a, length, n, c, c_precomp, modn); + prof_stop(); + + _nmod_vec_clear(r); + _nmod_vec_clear(a); + } + + FLINT_TEST_CLEAR(state); +} + +int main(void) +{ + const int nblengths = 15; + slong lengths[15] = {5, 40, 80, 160, 320, + 640, 1280, 2560, 5120, 10240, + 20480, 40960, 81920, 163840, 327680}; + + double min, max; + double mins_rem; + double mins_interface; + double mins_generic; + double mins_precomp; + double mins_precomp_lazy; + info_t info; + flint_bitcnt_t i; + + flint_printf("profiled:\n"); + flint_printf("\t1. rem\n\t2. poly interface\n\t3. vec general\n"); + flint_printf("\t4. vec precomp\n\t5. precomp_lazy (no excess correction)\n"); + flint_printf("column c: 0 -> c == -1, 1 -> c == 1, 2 -> c general\n"); + flint_printf("note: variant 2. expected to be faster than variants 3.4.5. when c == 1 or c == -1\n\n"); + flint_printf("bit c len n 1. 2. 3. 4. 5.\n"); + + for (i = 62; i <= FLINT_BITS; i++) + { + info.bits = i; + + for (int ct = 0; ct <= 2; ct++) + { + info.ctype = ct; + for (int len = 0; len < nblengths; ++len) + { + info.length = lengths[len]; + + info.n = 1; + while (info.n < (ulong)lengths[len]) + { + prof_repeat(&min, &max, sample_rem, (void *) &info); + mins_rem = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_interface, (void *) &info); + mins_interface = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_generic, (void *) &info); + mins_generic = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_precomp, (void *) &info); + mins_precomp = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_precomp_lazy, (void *) &info); + mins_precomp_lazy = min/(double)FLINT_CLOCK_SCALE_FACTOR; + + if (i < FLINT_BITS-1) + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e", + info.bits, info.ctype, info.length, info.n, + mins_rem, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy); + flint_printf("\n"); + } + else if (i < FLINT_BITS) + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na", + info.bits, info.ctype, info.length, info.n, + mins_rem, mins_interface, mins_generic, mins_precomp); + flint_printf("\n"); + } + else /* i == FLINT_BITS */ + { + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e na na", + info.bits, info.ctype, info.length, info.n, + mins_rem, mins_interface, mins_generic); + flint_printf("\n"); + } + + info.n = 5 * info.n; + } + } + } + } + + return 0; +} diff --git a/src/nmod_poly/rem_xnmc.c b/src/nmod_poly/rem_xnmc.c new file mode 100644 index 0000000000..c7e8439d3b --- /dev/null +++ b/src/nmod_poly/rem_xnmc.c @@ -0,0 +1,250 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "nmod_poly.h" +#include "nmod_vec.h" +#include "ulong_extras.h" + +/* division by x**n - 1 */ +void _nmod_poly_rem_xnm1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + R[j] = n_addmod(A[i+n+j], A[i+j], modn); + for (j = r; j < n; j++) + R[j] = A[i+j]; + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + R[j] = n_addmod(R[j], A[i+j], modn); + i -= n; + } +} + +/* division by x**n + 1 */ +void _nmod_poly_rem_xnp1(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + R[j] = n_submod(A[i+j], A[i+n+j], modn); + for (j = r; j < n; j++) + R[j] = A[i+j]; + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + R[j] = n_submod(A[i+j], R[j], modn); + i -= n; + } +} + +/* division by x**n - c, general variant */ +void _nmod_poly_rem_xnmc(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + val = nmod_mul(A[i+n+j], c, mod); + R[j] = n_addmod(val, A[i+j], mod.n); + } + for (j = r; j < n; j++) + R[j] = A[i+j]; + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + val = nmod_mul(R[j], c, mod); + R[j] = n_addmod(val, A[i+j], mod.n); + } + i -= n; + } +} + +/* division by x**n - c, with precomputation on c */ +/* constraint: modn < 2**(FLINT_BITS-1) */ +void _nmod_poly_rem_xnmc_precomp(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + val = n_mulmod_shoup(c, A[i+n+j], c_precomp, modn); + R[j] = n_addmod(val, A[i+j], modn); + } + for (j = r; j < n; j++) + R[j] = A[i+j]; + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + val = n_mulmod_shoup(c, R[j], c_precomp, modn); + R[j] = n_addmod(val, A[i+j], modn); + } + i -= n; + } +} + +/* division by x**n - c, lazy with precomputation */ +/* constraint: max(A) + 2*modn <= 2**FLINT_BITS */ +/* coeff bounds: in [0, max(A)] | out [0, max(A) + 2*modn) */ +void _nmod_poly_rem_xnmc_precomp_lazy(nn_ptr R, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn) +{ + /* assumes len >= n */ + slong i; + ulong j, r, val, p_hi, p_lo; + + r = len % n; + i = len - r - n; /* multiple of n, >= 0 by assumption */ + + for (j = 0; j < r; j++) + { + /* computes either val = (c*val mod n) or val = (c*val mod n) + n */ + val = A[i+n+j]; + umul_ppmm(p_hi, p_lo, c_precomp, val); + val = c * val - p_hi * modn; + /* lazy addition, yields R[i+j] in [0..k+2n), where max(R) <= k */ + R[j] = val + A[i+j]; + } + for (j = r; j < n; j++) + R[j] = A[i+j]; + + i -= n; + while (i >= 0) + { + for (j = 0; j < n; j++) + { + /* computes either val = (c*val mod n) or val = (c*val mod n) + n */ + val = R[j]; + umul_ppmm(p_hi, p_lo, c_precomp, val); + val = c * val - p_hi * modn; + /* lazy addition, yields R[i+j] in [0..k+2n), where max(R) <= k */ + R[j] = val + A[i+j]; + } + i -= n; + } +} + +void nmod_poly_rem_xnmc(nmod_poly_t R, nmod_poly_t A, ulong n, ulong c) +{ + const ulong len = A->length; + + if (len <= n) + { + nmod_poly_set(R, A); + return; + } + + if (c == 0) + { + nmod_poly_set_trunc(R, A, n); + return; + } + + int lazy = 0; + nn_ptr Rvec = _nmod_vec_init(n); + + /* perform division */ + if (c == 1) + _nmod_poly_rem_xnm1(Rvec, A->coeffs, len, n, A->mod.n); + + else if (c == A->mod.n - 1) + _nmod_poly_rem_xnp1(Rvec, A->coeffs, len, n, A->mod.n); + + /* if degree below the n_mulmod_shoup threshold, */ + /* or if modulus forbids n_mulmod_shoup usage, use general */ +#if FLINT_MULMOD_SHOUP_THRESHOLD <= 2 + else if (A->mod.norm == 0) /* here A->length >= threshold */ +#else + else if ((A->length < FLINT_MULMOD_SHOUP_THRESHOLD) + || (A->mod.norm == 0)) +#endif + { + _nmod_poly_rem_xnmc(Rvec, A->coeffs, len, n, c, A->mod); + } + + else + { + const ulong modn = A->mod.n; + const ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + + /* if 3*mod.n - 1 <= 2**FLINT_BITS, use precomp+lazy variant */ +#if FLINT_BITS == 64 + if (modn <= UWORD(6148914691236517205)) +#else /* FLINT_BITS == 32 */ + if (modn <= UWORD(1431655765)) +#endif + { + lazy = 1; + _nmod_poly_rem_xnmc_precomp_lazy(Rvec, A->coeffs, len, n, c, c_precomp, modn); + } + + /* use n_mulmod_shoup, non-lazy variant */ + else + { + _nmod_poly_rem_xnmc_precomp(Rvec, A->coeffs, len, n, c, c_precomp, modn); + } + } + + /* copy remainder R */ + nmod_poly_fit_length(R, n); + if (lazy) + { + /* correct excess */ + const ulong modn = A->mod.n; + for (ulong i = 0; i < n; i++) + { + ulong v = Rvec[i]; + if (v >= 2*modn) + v -= 2*modn; + else if (v >= modn) + v -= modn; + R->coeffs[i] = v; + } + } + else + { + _nmod_vec_set(R->coeffs, Rvec, n); + } + _nmod_poly_set_length(R, n); + _nmod_poly_normalise(R); + + _nmod_vec_clear(Rvec); +} diff --git a/src/nmod_poly/test/main.c b/src/nmod_poly/test/main.c index 5872f6b213..b6df1a5e2c 100644 --- a/src/nmod_poly/test/main.c +++ b/src/nmod_poly/test/main.c @@ -109,6 +109,7 @@ #include "t-pow_trunc.c" #include "t-product_roots_nmod_vec.c" #include "t-rem.c" +#include "t-rem_xnmc.c" #include "t-resultant.c" #include "t-resultant_euclidean.c" #include "t-resultant_hgcd.c" @@ -236,6 +237,7 @@ test_struct tests[] = TEST_FUNCTION(nmod_poly_pow_trunc), TEST_FUNCTION(nmod_poly_product_roots_nmod_vec), TEST_FUNCTION(nmod_poly_rem), + TEST_FUNCTION(nmod_poly_rem_xnmc), TEST_FUNCTION(nmod_poly_resultant), TEST_FUNCTION(nmod_poly_resultant_euclidean), TEST_FUNCTION(nmod_poly_resultant_hgcd), diff --git a/src/nmod_poly/test/t-divrem_xnmc.c b/src/nmod_poly/test/t-divrem_xnmc.c index 7b8e41dc2a..a3457be816 100644 --- a/src/nmod_poly/test/t-divrem_xnmc.c +++ b/src/nmod_poly/test/t-divrem_xnmc.c @@ -19,7 +19,7 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) { int i, result = 1; - for (i = 0; i < 1000 * flint_test_multiplier(); i++) + for (i = 0; i < 5000 * flint_test_multiplier(); i++) { nmod_poly_t a, b; nmod_poly_t qok, rok, q1, r1; @@ -69,7 +69,13 @@ TEST_FUNCTION_START(nmod_poly_divrem_xnmc, state) nmod_poly_set_coeff_ui(b, 0, n_negmod(c, modn)); // correct quotient/remainder - nmod_poly_divrem(qok, rok, a, b); + if (modn == 1) + { + nmod_poly_zero(rok); + nmod_poly_zero(qok); + } + else + nmod_poly_divrem(qok, rok, a, b); // general interface nmod_poly_divrem_xnmc(q1, r1, a, n, c); diff --git a/src/nmod_poly/test/t-rem_xnmc.c b/src/nmod_poly/test/t-rem_xnmc.c new file mode 100644 index 0000000000..99b26980c6 --- /dev/null +++ b/src/nmod_poly/test/t-rem_xnmc.c @@ -0,0 +1,174 @@ +/* + Copyright (C) 2025 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "nmod.h" +#include "nmod_vec.h" +#include "test_helpers.h" +#include "ulong_extras.h" +#include "nmod_poly.h" + +TEST_FUNCTION_START(nmod_poly_rem_xnmc, state) +{ + int i, result = 1; + + for (i = 0; i < 5000 * flint_test_multiplier(); i++) + { + nmod_poly_t a, b; + nmod_poly_t rok, r1; + nn_ptr rvec1; + ulong modn, len, n, c; + nmod_t mod; + + /* modulus */ + if (i < 128) + modn = n_randtest_bits(state, FLINT_BITS); + else if (i < 256) + modn = n_randtest_bits(state, FLINT_BITS-1); + else if (i < 384) + modn = n_randtest_bits(state, FLINT_BITS-2); + else + modn = n_randtest_not_zero(state); + nmod_init(&mod, modn); + + /* initialize polynomials */ + nmod_poly_init_mod(a, mod); + nmod_poly_init_mod(b, mod); + nmod_poly_init_mod(rok, mod); + nmod_poly_init_mod(r1, mod); + + /* pick parameters */ + len = 5 + n_randint(state, 100); /* poly length, [5, 105) */ + n = 1 + n_randint(state, 30); /* divisor degree, [1, 30) */ + + if (i % 16 == 0 && modn > 1) + c = 1; + else if (i % 16 == 1) + c = modn - 1; + else if (i % 16 == 2) + c = 0; + else + c = n_randint(state, modn); + + /* random a of length <= len; rvec1 of length == n */ + nmod_poly_randtest(a, state, len); + rvec1 = _nmod_vec_init(n); + + /* divisor xnmc: b == x**n - c */ + nmod_poly_set_coeff_ui(b, n, 1); + nmod_poly_set_coeff_ui(b, 0, n_negmod(c, modn)); + + // correct quotient/remainder + if (modn == 1) + nmod_poly_zero(rok); + else + nmod_poly_rem(rok, a, b); + + // general interface + nmod_poly_rem_xnmc(r1, a, n, c); + + result = (nmod_poly_equal(rok, r1)); + if (!result) + { + flint_printf("FAIL (poly 1):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, poly a :\n", n, c); + nmod_poly_print(a), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + + /* testing in-place underscore version, requires length >= n */ + if (a->length >= (slong)n) + { + /* general function */ + _nmod_poly_rem_xnmc(rvec1, a->coeffs, a->length, n, c, a->mod); + + result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length)); + if (!result) + { + flint_printf("FAIL (vec):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + + /* special cases 1, -1 */ + if (c == 1 || c == modn - 1) + { + if (c == 1) + _nmod_poly_rem_xnm1(rvec1, a->coeffs, a->length, n, a->mod.n); + else if (c == modn-1) + _nmod_poly_rem_xnp1(rvec1, a->coeffs, a->length, n, a->mod.n); + + result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length)); + if (!result) + { + flint_printf("FAIL (vec, +1 / - 1):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + } + + /* cases using precomp */ + if (NMOD_CAN_USE_SHOUP(mod)) + { + ulong c_precomp = n_mulmod_precomp_shoup(c, modn); + int lazy = 0; + +#if FLINT_BITS == 64 + if (modn <= UWORD(6148914691236517205)) +#else // FLINT_BITS == 32 + if (modn <= UWORD(1431655765)) +#endif + { + lazy = 1; + _nmod_poly_rem_xnmc_precomp_lazy(rvec1, a->coeffs, a->length, n, c, c_precomp, modn); + for (ulong ii = 0; ii < n; ii++) + { + if (rvec1[ii] >= 2*modn) + rvec1[ii] -= 2*modn; + else if (rvec1[ii] >= modn) + rvec1[ii] -= modn; + } + } + else + _nmod_poly_rem_xnmc_precomp(rvec1, a->coeffs, a->length, n, c, c_precomp, modn); + + result = (_nmod_vec_equal(rvec1, rok->coeffs, rok->length) && _nmod_vec_is_zero(rvec1 + rok->length, n - rok->length)); + if (!result) + { + if (lazy) + flint_printf("FAIL (vec, precomp lazy):\n"); + else + flint_printf("FAIL (vec, precomp):\n"); + flint_printf("a->length = %wd, len = %wu, modn = %wu\n", a->length, len, a->mod.n); + flint_printf("n = %wu, c = %wu, input vec rvec :\n", n, c); + _nmod_vec_print(a->coeffs, a->length, mod), flint_printf("\n\n"); + fflush(stdout); + flint_abort(); + } + } + } + + nmod_poly_clear(a); + nmod_poly_clear(b); + nmod_poly_clear(rok); + nmod_poly_clear(r1); + _nmod_vec_clear(rvec1); + } + + TEST_FUNCTION_END(state); +} From ed77123aaa27e19ade58ba91b89be87335098c4e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 3 Nov 2025 20:39:09 +0100 Subject: [PATCH 9/9] add try_sparse comparison --- src/nmod_poly/profile/p-divrem_xnmc.c | 69 +++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/src/nmod_poly/profile/p-divrem_xnmc.c b/src/nmod_poly/profile/p-divrem_xnmc.c index 6afde3a30f..b718557c6b 100644 --- a/src/nmod_poly/profile/p-divrem_xnmc.c +++ b/src/nmod_poly/profile/p-divrem_xnmc.c @@ -16,6 +16,11 @@ #include "nmod_poly.h" #include "nmod_vec.h" +int +_nmod_poly_divrem_try_sparse(nn_ptr Q, nn_ptr R, nn_srcptr A, + slong lenA, nn_srcptr B, slong lenB, + nn_srcptr Binv, slong FLINT_UNUSED(lenBinv), nmod_t mod); + static inline ulong _get_c_from_ctype(int ctype, flint_rand_t state, ulong modn) { @@ -83,6 +88,51 @@ void sample_divrem(void * arg, ulong count) FLINT_TEST_CLEAR(state); } +void sample_divrem_sparse(void * arg, ulong count) +{ + ulong modn; + nmod_t mod; + info_t * info = (info_t *) arg; + flint_bitcnt_t bits = info->bits; + slong length = info->length; + ulong n = info->n; + ulong c; + + slong i; + + nn_ptr Q, R, A, B; + + FLINT_TEST_INIT(state); + + for (i = 0; i < (slong)count; i++) + { + modn = n_randprime(state, bits, 1); + nmod_init(&mod, modn); + + Q = _nmod_vec_init(length - n); + R = _nmod_vec_init(n); + A = _nmod_vec_init(length); + _nmod_vec_rand(A, state, length, mod); + B = _nmod_vec_init(n+1); + _nmod_vec_zero(B, n+1); + + c = _get_c_from_ctype(info->ctype, state, modn); + B[0] = modn - c; + B[n] = 1; + + prof_start(); + _nmod_poly_divrem_try_sparse(Q, R, A, length, B, n+1, NULL, 0, mod); + prof_stop(); + + _nmod_vec_clear(Q); + _nmod_vec_clear(R); + _nmod_vec_clear(A); + _nmod_vec_clear(B); + } + + FLINT_TEST_CLEAR(state); +} + void sample_interface(void * arg, ulong count) { ulong modn; @@ -243,6 +293,7 @@ int main(void) double min, max; double mins_divrem; + double mins_divrem_sparse; double mins_interface; double mins_generic; double mins_precomp; @@ -251,11 +302,11 @@ int main(void) flint_bitcnt_t i; flint_printf("profiled:\n"); - flint_printf("\t1. divrem\n\t2. poly interface\n\t3. vec general\n"); + flint_printf("\t1. divrem\n\t1bis. divrem sparse\n\t2. poly interface\n\t3. vec general\n"); flint_printf("\t4. vec precomp\n\t5. precomp_lazy (no excess correction)\n"); flint_printf("column c: 0 -> c == -1, 1 -> c == 1, 2 -> c general\n"); flint_printf("note: variant 2. expected to be faster than variants 3.4.5. when c == 1 or c == -1\n\n"); - flint_printf("bit c len n 1. 2. 3. 4. 5.\n"); + flint_printf("bit c len n 1. 1bis. 2. 3. 4. 5.\n"); for (i = 62; i <= FLINT_BITS; i++) { @@ -273,6 +324,8 @@ int main(void) { prof_repeat(&min, &max, sample_divrem, (void *) &info); mins_divrem = min/(double)FLINT_CLOCK_SCALE_FACTOR; + prof_repeat(&min, &max, sample_divrem_sparse, (void *) &info); + mins_divrem_sparse = min/(double)FLINT_CLOCK_SCALE_FACTOR; prof_repeat(&min, &max, sample_interface, (void *) &info); mins_interface = min/(double)FLINT_CLOCK_SCALE_FACTOR; prof_repeat(&min, &max, sample_generic, (void *) &info); @@ -284,23 +337,23 @@ int main(void) if (i < FLINT_BITS-1) { - flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e", + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e %.1e", info.bits, info.ctype, info.length, info.n, - mins_divrem, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy); + mins_divrem, mins_divrem_sparse, mins_interface, mins_generic, mins_precomp, mins_precomp_lazy); flint_printf("\n"); } else if (i < FLINT_BITS) { - flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na", + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e %.1e na", info.bits, info.ctype, info.length, info.n, - mins_divrem, mins_interface, mins_generic, mins_precomp); + mins_divrem, mins_divrem_sparse, mins_interface, mins_generic, mins_precomp); flint_printf("\n"); } else /* i == FLINT_BITS */ { - flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e na na", + flint_printf("%-4d%-2d%-7d%-7d%.1e %.1e %.1e %.1e na na", info.bits, info.ctype, info.length, info.n, - mins_divrem, mins_interface, mins_generic); + mins_divrem, mins_divrem_sparse, mins_interface, mins_generic); flint_printf("\n"); }