#include "nmod.h"
#include "nmod_poly.h"
#include "fft_small.h"
#include "profiler.h"

/*

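Speedup (time1/time2, as printed by main below) of dividing a length 2n-3
polynomial by a length n polynomial with a precomputed divisor
(_nmod_poly_divrem_precomp) relative to _nmod_poly_divrem_newton_n_preinv: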

    ---> log2(n)
|
| nbits(mod.n) with ** denoting a special fft prime
v

1 thread:

     9.0  9.6 10.1 10.7 11.3 11.9 12.5 13.1 13.6 14.2 14.8 15.4 16.0 16.6 17.2 17.7 18.3 18.9 19.5
**: 15.3 15.1 14.9 20.3 19.9 28.3 24.5 19.0 25.0 25.5 30.2 28.7 29.9 31.2 28.6 32.7 28.9 36.0 29.9
 2:  1.0  1.0  1.0  1.5  1.5  1.9  2.1  2.1  2.6  2.9  3.6  3.3  4.1  4.2  3.8  4.3  3.6  5.1  5.5
 4:  1.2  1.4  1.4  1.8  2.0  2.7  2.6  2.7  3.7  3.8  4.4  4.3  5.4  4.3  3.9  4.8  5.0  5.9  6.1
 6:  1.6  1.8  1.8  2.4  2.4  3.1  3.3  3.1  4.4  4.2  5.2  4.7  5.8  5.5  5.0  6.8  5.3  6.6  6.1
 8:  2.0  2.0  2.0  2.8  2.9  3.9  3.9  3.8  5.2  5.0  6.2  5.3  7.4  6.8  5.6  6.1  6.1  8.7  7.4
10:  2.4  2.4  2.5  3.3  3.3  4.4  4.6  4.7  5.9  5.9  6.6  6.6  7.9  6.9  6.2  8.6  8.4  9.5  7.6
12:  2.9  2.7  2.9  3.9  3.9  5.1  5.3  5.3  6.5  6.0  7.7  7.4  9.4  6.7  6.4  9.4  8.3 10.5  8.0
14:  3.2  3.4  3.3  4.5  4.5  6.4  6.2  5.7  7.4  6.8  8.4  7.3  9.5  9.4  8.5  9.7  7.8 10.7 11.1
16:  3.8  3.5  3.6  4.9  5.1  6.6  6.2  6.7  8.3  7.5  9.5  9.2 11.7  6.1  3.6  4.9  5.3  6.0  5.3
18:  4.2  4.0  4.1  5.6  5.5  7.8  4.4  3.3  4.2  3.5  4.3  4.7  5.3  4.9  3.6  5.4  5.3  6.0  5.6
20:  2.5  2.0  2.1  2.7  2.8  3.7  3.7  3.6  4.7  4.3  5.1  4.4  5.4  5.0  5.2  6.8  6.0  6.0  6.3
22:  2.1  2.2  2.3  2.9  3.2  4.2  4.0  3.8  4.7  4.4  5.7  5.3  5.2  5.0  5.2  6.3  6.1  6.0  6.9
24:  2.3  2.4  2.5  3.3  3.5  4.4  4.5  4.2  5.5  4.7  5.7  5.3  6.9  6.9  5.6  6.9  6.2  7.3  8.0
26:  2.5  2.6  2.8  3.6  3.6  4.7  4.9  4.7  5.5  6.0  5.9  5.5  7.5  7.0  5.8  6.9  6.1  8.5  7.8
28:  3.0  3.0  3.1  4.1  4.2  5.3  5.3  4.8  6.1  5.6  7.2  6.8  7.8  6.6  6.7  6.6  6.5  9.4  8.6
30:  3.2  3.3  3.3  4.4  4.5  5.7  6.0  5.3  6.1  5.7  7.9  6.8  8.0  6.8  6.4  9.4  7.8 10.0  8.1
32:  3.6  3.6  3.6  4.7  4.7  6.2  5.9  5.7  6.3  6.4  7.8  7.1  8.9  6.8  6.4  9.3  7.8  9.9  8.8
34:  3.7  3.9  3.8  5.0  5.0  6.8  6.6  6.0  7.3  6.9  7.8  7.1  9.1  9.2  8.9 10.7  7.8 10.5  8.8
36:  3.8  4.1  4.1  5.4  5.5  7.3  6.6  6.0  7.5  7.2  8.1  7.2  9.0  9.9  8.5 10.5  7.6 11.1 11.0
38:  4.1  4.4  4.2  5.8  5.7  7.6  7.4  6.8  8.2  7.3  9.7  7.1  8.7  9.8  8.6 10.0  7.7 11.4 11.0
40:  4.4  4.7  4.5  6.1  6.2  8.1  7.5  6.8  8.4  7.4  9.4  9.3 11.2 10.3  8.4 11.0  6.1  9.1  6.8
42:  4.5  5.0  4.6  6.5  6.3  8.5  7.9  7.0  8.6  5.5  6.8  6.0  7.1  6.1  5.5  6.8  7.1  8.7  7.9
44:  4.9  5.2  3.8  4.9  4.2  5.4  5.2  4.4  5.7  4.8  5.8  6.1  7.0  6.4  5.1  7.0  7.2  8.7  8.0
46:  3.4  3.5  3.4  4.4  4.4  5.7  5.3  4.8  5.8  5.7  7.3  6.4  7.0  8.0  5.2  7.2  7.7  9.0  8.1
48:  3.6  3.6  3.5  4.7  4.5  6.3  5.3  4.8  6.1  5.8  7.2  6.7  7.2  7.6  7.3  8.6  8.2  9.3  8.6
50:  3.7  3.7  3.7  4.8  4.9  6.4  6.0  4.7  6.0  6.2  7.2  6.9  7.1  7.3  7.3  8.5  7.8  9.4  8.9
52:  3.7  3.7  3.8  4.8  5.1  6.2  6.1  5.0  6.4  6.1  7.5  7.0  7.1  7.7  7.7  8.6  8.0  8.8  9.2
54:  3.7  3.9  4.0  5.1  5.3  6.7  6.4  5.7  7.1  6.3  7.5  7.0  7.0  7.2  7.7  8.7  9.0  8.8  9.5
56:  3.9  4.0  4.1  5.3  5.4  6.7  6.4  5.8  7.1  6.2  7.5  7.0  9.7  9.9  8.0  8.8  9.0  8.9  9.6
58:  4.0  4.2  4.3  5.6  5.5  6.7  6.4  5.8  7.2  6.3  7.4  7.0 10.0 10.1  8.2  8.9  9.0 10.6 11.1
60:  4.3  4.3  4.6  5.7  5.8  7.3  7.0  6.4  8.0  7.6  7.5  7.1 10.4  9.5  8.8  8.7  9.5 11.7 11.0
62:  4.5  4.6  4.8  6.0  6.1  7.8  7.3  6.6  8.2  7.5  7.6  7.3 10.4  9.8  9.0  8.8  9.1 11.8 10.8
64:  4.7  4.8  5.0  6.4  6.4  8.0  7.3  6.5  7.8  7.6  9.9  9.2 10.6  9.2  8.9  8.7 10.1 12.4 12.0

8 threads:

     9.0  9.6 10.1 10.7 11.3 11.9 12.5 13.1 13.6 14.2 14.8 15.4 16.0 16.6 17.2 17.7 18.3 18.9 19.5
**: 15.4 16.1 14.9 20.2 20.1 26.4 25.8 19.0 26.4 25.5 29.6 26.8 30.1 29.3 30.2 32.8 28.8 37.1 29.9
 2:  1.0  1.1  1.1  1.5  1.5  1.9  2.0  2.1  2.8  3.0  3.5  3.3  4.4  4.0  4.0  4.5  3.6  4.9  5.8
 4:  1.4  1.4  1.3  1.8  2.0  2.7  2.6  2.6  3.4  3.7  4.4  4.2  5.3  4.6  3.9  4.8  5.2  6.2  5.9
 6:  1.6  1.7  1.7  2.3  2.3  3.1  3.4  3.1  4.4  4.4  5.5  4.7  5.5  5.5  5.2  6.8  5.2  6.6  6.3
 8:  2.0  2.1  2.1  2.7  2.8  4.0  3.9  3.6  5.2  4.7  5.8  5.6  7.5  7.1  5.6  6.1  6.2  8.7  7.4
10:  2.4  2.5  2.5  3.3  3.3  4.4  4.7  4.7  5.6  5.9  6.2  6.6  7.4  6.9  6.3  8.5  8.5  9.5  7.6
12:  2.8  2.9  2.7  3.8  3.9  5.1  5.3  5.3  6.4  5.8  7.6  7.5  8.8  6.9  6.4  9.0  8.2 10.4  7.6
14:  3.2  3.3  3.2  4.5  4.5  6.0  6.1  6.0  7.1  7.3  7.9  7.3  9.5  8.9  8.6  9.7  7.8 10.8 11.1
16:  3.6  3.8  3.5  4.9  5.1  6.6  6.6  6.4  8.2  7.5  9.6  9.3 11.7  9.7  6.7  9.0  9.7 10.6  9.2
18:  4.2  4.0  4.1  5.6  5.8  7.4  5.5  6.0  7.4  6.6  8.0  9.0 10.2  9.1  6.8  9.8  9.4 10.2  9.5
20:  1.9  1.9  2.5  3.4  4.1  5.7  6.4  6.3  8.4  7.9  9.6  8.5 10.2  9.7  9.5 12.5 10.8 10.6 10.6
22:  1.4  2.1  2.8  3.9  4.7  6.5  7.0  7.0  8.4  8.2 10.7 10.1  9.9  9.4  9.4 11.4 11.1 10.6 11.8
24:  1.5  2.3  3.0  4.2  5.3  6.8  7.7  7.7  9.9  8.9 10.4 10.1 13.5 12.8 10.3 13.0 11.0 12.9 13.8
26:  1.6  2.5  3.3  4.6  5.5  7.2  8.5  8.3 10.1 11.2 10.6 10.5 14.2 13.2 10.7 12.5 10.8 15.1 13.2
28:  1.9  2.9  3.8  5.4  6.1  8.2  9.2  8.7 11.0 10.5 13.0 12.8 15.3 12.3 12.5 11.8 11.2 17.1 14.3
30:  2.1  3.1  4.0  5.7  6.7  9.2 10.2  9.6 11.0 10.7 14.4 13.0 15.3 12.4 12.1 17.7 13.6 17.4 14.1
32:  2.3  3.4  4.3  5.9  6.9  9.5 10.2 10.2 11.8 11.6 14.2 13.6 16.9 12.4 12.1 16.7 13.8 17.5 14.6
34:  2.5  3.6  4.6  6.5  7.4 10.3 11.1 10.8 13.4 12.8 15.2 13.4 16.1 16.8 16.0 19.2 13.9 18.8 15.1
36:  2.5  3.9  4.9  6.8  8.0 11.4 11.1 10.8 13.5 13.6 15.4 14.1 17.1 18.5 15.0 19.5 14.2 19.5 19.0
38:  2.7  4.1  5.1  7.4  8.3 11.9 12.7 12.3 14.2 13.6 18.0 13.7 16.3 18.7 15.7 18.4 14.3 19.7 19.1
40:  2.8  4.4  5.5  8.1  9.4 12.6 12.9 12.2 15.4 13.8 17.0 17.7 22.1 18.9 15.8 20.1 12.3 18.9 14.9
42:  3.0  4.6  5.8  8.1  9.6 12.7 13.5 12.6 14.9 12.4 15.8 16.1 19.1 16.5 13.7 16.5 16.4 20.2 16.5
44:  3.1  4.9  5.8  7.8  9.3 12.1 12.8 11.7 15.6 14.0 15.8 17.2 18.8 16.5 12.9 17.7 16.0 20.1 16.8
46:  3.4  4.3  5.8  8.3  9.6 12.6 12.8 12.8 15.8 15.8 20.4 17.8 18.9 21.4 12.5 17.2 17.4 20.7 17.1
48:  3.6  4.5  6.1  8.4 10.0 14.0 12.8 12.9 15.5 16.4 20.4 18.9 18.9 21.0 19.0 20.6 18.7 21.4 18.0
50:  3.8  4.6  6.4  9.2 10.9 14.5 14.7 12.4 15.5 17.0 19.7 19.4 19.1 19.6 18.2 21.7 17.3 21.8 19.4
52:  4.0  4.7  6.4  9.3 11.5 14.0 15.1 13.7 17.6 17.3 20.7 19.9 18.3 21.2 19.9 21.0 17.4 20.4 20.4
54:  4.0  5.0  6.8 10.0 11.6 15.2 15.9 14.9 19.2 17.0 20.9 19.1 19.3 19.8 19.4 22.2 20.0 20.4 20.1
56:  4.3  5.0  7.1 10.2 12.0 14.8 15.4 15.6 18.4 17.4 20.7 19.1 26.5 27.0 20.1 21.5 19.4 20.5 20.3
58:  4.3  5.4  7.7 10.6 12.2 14.7 15.9 15.8 19.1 17.1 21.6 20.1 26.8 27.2 20.1 22.6 20.6 24.6 23.8
60:  4.5  5.5  7.9 10.9 13.3 16.3 17.3 16.8 20.9 19.8 20.3 20.1 27.6 25.5 22.7 22.0 21.0 26.9 23.3
62:  4.7  5.7  8.0 11.3 13.4 17.5 18.1 18.0 21.3 20.7 21.5 20.3 27.6 26.1 22.8 21.1 20.5 26.9 22.9
64:  4.9  5.9  8.5 11.8 14.3 17.8 18.1 17.4 20.2 20.8 28.1 25.9 28.5 24.5 21.9 21.2 22.6 29.2 25.8

*/

/*
    Profiling driver.

    For a sequence of moduli (one table row each) and a sequence of sizes n
    (one table column each), divide a random polynomial of length 2n-3 by a
    random polynomial of length n with two different routines, print the
    ratio time1/time2 of their normalized wall times, and check that both
    routines produced the same quotient and remainder.  On a mismatch the
    offending index and modulus are printed and flint_abort() is called.

    Row "**" uses the modulus stored in R->ffts[1]; every other row uses a
    modulus drawn with n_randbits(state, nbits) for nbits = 2, 4, ...,
    FLINT_BITS.

    Returns 0 after the whole table has been printed.
*/
int main(void)
{
    flint_bitcnt_t nbits;
    mpn_ctx_t R;
    nmod_t mod;
    flint_rand_t state;
    timeit_t timer;
    double time1, time2;
    ulong * a, * b, * q1, * q2, * r1, * r2;
    ulong an, bn, n, i, nreps;
    ulong nmax = 1000000;

    flint_randinit(state);
    /* NOTE(review): the constant is presumably the prime that mpn_ctx_init
       expects for the fft context -- confirm against fft_small.h */
    mpn_ctx_init(R, UWORD(0x0003f00000000001));

    flint_set_num_threads(8);

    /* The buffer sizes depend only on nmax, so allocate them once for the
       whole run instead of once per modulus.  Every entry that is read below
       is written first (by the random fill or by the division routines), so
       reusing the buffers across rows does not affect the results. */
    a = FLINT_ARRAY_ALLOC(2*nmax, ulong);
    b = FLINT_ARRAY_ALLOC(nmax, ulong);
    q1 = FLINT_ARRAY_ALLOC(nmax, ulong);
    q2 = FLINT_ARRAY_ALLOC(nmax, ulong);
    r1 = FLINT_ARRAY_ALLOC(nmax, ulong);
    r2 = FLINT_ARRAY_ALLOC(nmax, ulong);

    /* Header row: log2(n) for every benchmarked n.  The three leading
       spaces line up with the three-character row labels printed below. */
    flint_printf("   ");
    for (n = 500; n < nmax; n += 1 + n/2)
        flint_printf("%5.1f", log2(n));
    flint_printf("\n");

    for (nbits = 0; nbits <= FLINT_BITS; nbits += 2)
    {
        if (nbits == 0)
        {
            /* special row: reuse the modulus of one of the fft primes in R */
            nmod_init(&mod, R->ffts[1].mod.n);
            flint_printf("**:");
        }
        else
        {
            nmod_init(&mod, n_randbits(state, nbits));
            flint_printf("%2wu:", nbits);
        }

        /* n grows by roughly a factor 3/2 per column */
        for (n = 500; n < nmax; n += 1 + n/2)
        {
            an = 2*n-3;
            bn = n;

            /* fewer repetitions for larger n; always at least one */
            nreps = 1 + 10000000/(n*n_nbits(n));

            for (i = 0; i < an; i++)
                a[i] = n_randint(state, mod.n);

            for (i = 0; i < bn; i++)
                b[i] = n_randint(state, mod.n);
            /* resample the leading coefficient of the divisor until it is
               coprime to the modulus */
            while (n_gcd(b[bn-1], mod.n) != 1)
                b[bn-1] = n_randint(state, mod.n);

#if 1
            /* time1: Newton division with a precomputed inverse B */
            {
                ulong* B = FLINT_ARRAY_ALLOC(n-2, ulong);

                /* B = first n-2 coefficients of the power series inverse of
                   the reversal of b; b is reversed in place and restored */
                _nmod_poly_reverse(b, b, bn, bn);
                _nmod_poly_inv_series(B, b, bn, n-2, mod);
                _nmod_poly_reverse(b, b, bn, bn);

                timeit_start(timer);
                for (i = 0; i < nreps; i++)
                    _nmod_poly_divrem_newton_n_preinv(q1, r1, a, an, b, bn, B, n-2, mod);
                timeit_stop(timer);
                /* wall time normalized by nreps*n*log2(n); only the ratio of
                   the two normalized times is printed */
                time1 = timer->wall*1e6/(nreps*log2(n)*n);

                flint_free(B);
            }

            /* time2: fft_small division with divisor precomputation; the
               precomputation itself is outside the timed region */
            {
                nmod_poly_divrem_precomp_struct M[1];
                _nmod_poly_divrem_precomp_init(M, b, bn, n-2, mod, R);

                timeit_start(timer);
                for (i = 0; i < nreps; i++)
                    _nmod_poly_divrem_precomp(q2, r2, a, an, M, mod, R);
                timeit_stop(timer);
                time2 = timer->wall*1e6/(nreps*log2(n)*n);

                _nmod_poly_divrem_precomp_clear(M);
            }
#else
            /* alternative comparison without precomputation */
            timeit_start(timer);
            for (i = 0; i < nreps; i++)
                _nmod_poly_divrem(q1, r1, a, an, b, bn, mod);
            timeit_stop(timer);
            time1 = timer->wall*1e6/(nreps*log2(n)*n);

            timeit_start(timer);
            for (i = 0; i < nreps; i++)
                _nmod_poly_divrem_mpn_ctx(q2, r2, a, an, b, bn, mod, R);
            timeit_stop(timer);
            time2 = timer->wall*1e6/(nreps*log2(n)*n);
#endif

            /* speedup of the second routine over the first; flush so that
               progress is visible while the row is still being computed */
            flint_printf("%5.1f", time1/time2);
            fflush(stdout);

            /* the quotients have an-bn+1 coefficients */
            for (i = 0; i < an-bn+1; i++)
            {
                if (q1[i] != q2[i])
                {
                    flint_printf("error at index %wu\n", i);
                    flint_printf("mod: %wu\n", mod.n);
                    flint_abort();
                }
            }

            /* the remainders have bn-1 coefficients */
            for (i = 0; i < bn-1; i++)
            {
                if (r1[i] != r2[i])
                {
                    flint_printf("error at index %wu\n", i);
                    flint_printf("mod: %wu\n", mod.n);
                    flint_abort();
                }
            }
        }

        flint_printf("\n");
    }

    flint_free(a);
    flint_free(b);
    flint_free(q1);
    flint_free(q2);
    flint_free(r1);
    flint_free(r2);

    mpn_ctx_clear(R);
    flint_randclear(state);

    return 0;
}

