/*
Program to check status of the line conjecture for cubic extensions.
Written by Geoff Bailey, Magma group, Feb 2017.

Sample compile line (Linux):

    gcc -O4 -o L3check L3check.c -lpthread

For more explanation, see

    http://magma.maths.usyd.edu.au/~geoff/L3/
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <pthread.h>

#define	USE_ASSERTIONS		1
#define	USE_MOD_BY_MULT		1
#define	QLIMIT			5792


typedef	uint32_t	t_uint32;
typedef	uint64_t	t_uint64;

typedef	int		t_boolean;

#define	FALSE		0
#define	TRUE		1


/* global options */
t_boolean		opt_verbose;
t_boolean		opt_antisort;
t_uint32		opt_nthreads;
t_boolean		opt_find_all;


/*************** Miscellaneous utility stuff ***************/

#if USE_ASSERTIONS
#define	ASSERT(expr)							\
    do {								\
	if (!(expr))							\
	{								\
	    fprintf(stderr, "Assertion failed: %s\n", #expr);		\
	    exit(1);							\
	}								\
    } while (0)
#else
#define	ASSERT(expr)	do { /* nothing */ } while (0)
#endif

#define	IF_VERBOSE(code)						\
    if (opt_verbose) {							\
	code;								\
	fflush(stdout);							\
    }

void *checked_calloc(size_t nitems, size_t itemsize)
{
    void	*mem;

    mem = calloc(nitems, itemsize);
    if (mem == NULL)
    {
	fprintf(stderr, "Calloc(%zu, %zu) failed\n", nitems, itemsize);
	exit(1);
    }
    return mem;
}

#define	ARRAY_ALLOC(n, t)	checked_calloc((n), sizeof(t))
#define	ARRAY_FREE(a)		free(a)

#define	ALLOC(t)		ARRAY_ALLOC(1, t)

#define	ONES32(n)	(~((~(t_uint32)0) << (n)))
#define	ONES64(n)	(~((~(t_uint64)0) << (n)))

t_uint32 highbitpos32(t_uint32 x)
/*
Returns the index of the highest set bit in x.  x must not be zero.
If k is the returned index, then 2^k <= x < 2^(k + 1).
*/
{
    t_uint32	k;

    for (k = 31; k > 0; k--)
	if ((x >> k) != 0)
	    break;
    return k;
}

double time_current()
{
    struct timeval	tv;

    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + ((double)tv.tv_usec) / 1000000.0;
}

double time_since(double start)
{
    return time_current() - start;
}

#define	START_TIMER()		double tm = time_current()
#define	RESTART_TIMER()		tm = time_current()
#define	STOP_TIMER(desc)						\
    IF_VERBOSE( printf("Time for %s: %.2f s\n", desc, time_since(tm)) )


/*
As should be well-known, division and modulus are slow operations.
Provided certain conditions are met, we can replace division by a
multiply and shift.

Suppose that x < 2^A and m < 2^B, and set M = Ceiling(2^(A + B)/m).
Then Floor(x / m) = Floor((M*x) / 2^(A + B)).  This is the promised
multiply and shift, and is fine provided that M*x does not overflow.
Although better versions are possible, for our needs it is enough to
take the simple restriction 2*A + B <= 64 which will guarantee that.

Of course, once we have cheap division, then we can get modulus by
a further multiply and subtract.
*/
#if MOD_BY_MULT
#define	MOD_INIT(modulus, shiftvalue)					\
    t_uint32	mm_modulus;						\
    int		mm_shift;						\
    t_uint64	mm_mult;						\
									\
    mm_modulus = modulus;						\
    mm_shift = shiftvalue;						\
    mm_mult = 1 + (ONES64(Mshift) / (t_uint64)mm_modulus)

#define	MOD_REDUCE(x)							\
    x -= mm_modulus*(t_uint32)((mm_mult*(t_uint64)x) >> mm_shift)
#else
#define	MOD_INIT(modulus, shiftvalue)	t_uint32 mm_modulus = modulus
#define	MOD_REDUCE(x)			x %= mm_modulus;
#endif


/*************** Multi-iterator ***************/

typedef struct
{
    t_uint64	lbit, hbit, multicounter, moduli;
    t_uint32	shift, largeprime, largecounter;
    t_boolean	uselarge;
} str_multi_iterator;

void compute_multi_iterator(str_multi_iterator *mi, t_uint32 nprimes, t_uint32 *primes)
{
    t_uint64	lbit, hbit, multicounter, moduli;
    t_uint32	shift, largeprime, largecounter;
    t_boolean	uselarge;
    t_uint32	np, W, i;
    t_uint32	*pptr;

    /* skip the prime two; it will be handled differently by other code */
    pptr = primes;
    np = nprimes;
    if (pptr[0] == 2)
    {
	pptr++;
	np--;
    }

    largeprime = pptr[np - 1];
    W = 2 + highbitpos32(largeprime);
    uselarge = (np*W > 64);
    if (uselarge)	/* largest prime does not fit into the multicounter */
    {
	np--;
	W = 2 + highbitpos32(pptr[np - 1]);
	/*
	The remaining primes must fit, since the first case where they
	would not is q = 12391, which is much large than we allow.
	*/
	ASSERT(np*W <= 64);
    }

    lbit = 0;
    moduli = 0;
    for (i = 0; i < np; i++)
    {
	lbit |= ((t_uint64)1) << (W*i);
	moduli |= ((t_uint64)pptr[i]) << (W*i);
    }
    shift = W - 1;
    hbit = lbit << shift;

    multicounter = hbit - moduli;
    largecounter = 0;

    IF_VERBOSE(
	printf("Multicounter:\n");
	printf("  %u small primes (%u", np, pptr[0]);
	for (i = 1; i < np; i++)
	    printf(" %u", pptr[i]);
	printf("), %u bits per prime\n", W);
	if (uselarge)
	    printf("  large prime: %u\n", largeprime);
    )

    mi->lbit = lbit;
    mi->hbit = hbit;
    mi->multicounter = multicounter;
    mi->moduli = moduli;
    mi->shift = shift;
    mi->largeprime = largeprime;
    mi->largecounter = largecounter;
    mi->uselarge = uselarge;
}

#define	INIT_MULTI_ITERATOR(mi)						\
    t_uint64		lbit, hbit, multicounter, moduli;		\
    t_uint32		shift, largeprime, largecounter;		\
    t_boolean		uselarge;					\
    t_uint64		H, haszero;					\
    									\
    lbit = mi.lbit;							\
    hbit = mi.hbit;							\
    multicounter = mi.multicounter;					\
    moduli = mi.moduli;							\
    shift = mi.shift;							\
    largeprime = mi.largeprime;						\
    largecounter = mi.largecounter;					\
    uselarge = mi.uselarge;						\
    haszero = TRUE

#define	ADVANCE_MULTI_ITERATOR()					\
    do {								\
	multicounter += lbit;						\
	H = multicounter & hbit;					\
	multicounter -= (moduli & (H - (H >> shift)));			\
	haszero = H;							\
	if (uselarge)							\
	{								\
	    largecounter++;						\
	    if (largecounter == largeprime)				\
	    {								\
		largecounter = 0;					\
		haszero = 1;						\
	    }								\
	}								\
    } while (0)

#define	ADVANCE_MULTI_ITERATOR_SMALL()					\
    do {								\
	multicounter += lbit;						\
	H = multicounter & hbit;					\
	multicounter -= (moduli & (H - (H >> shift)));			\
	haszero = H;							\
    } while (0)

#define	ADVANCE_MULTI_ITERATOR_LARGE()					\
    do {								\
	multicounter += lbit;						\
	H = multicounter & hbit;					\
	multicounter -= (moduli & (H - (H >> shift)));			\
	haszero = H;							\
	largecounter++;							\
	if (largecounter == largeprime)					\
	{								\
	    largecounter = 0;						\
	    haszero = 1;						\
	}								\
    } while (0)


/*************** Finite field poly rep (over F_p) ***************/

/*
This is for field elements represented as polynomials over GF(p).
Essentially the only arithmetic operation we need is to multiply
by the primitive element.
*/

typedef struct
{
    t_uint32	p, n, q;	/* q = p^n */
    t_uint32	*f;		/* primitive elt z has z^n = Sum(f[i]*z^i) */
} str_ffpoly;

typedef	str_ffpoly	*t_ffpoly;
typedef	t_uint32	*t_ffpoly_elt;


t_ffpoly ffpoly_create(t_uint32 p, t_uint32 n, t_uint32 *poly)
/*
Returns the ffpoly structure F for the given arguments.
IMPORTANT: Must have p^n < 2^32.
*/
{
    t_ffpoly	F;
    t_uint32	q, i, x;
    t_uint32	*f;

    q = 1;
    for (i = 0; i < n; i++)
	q *= p;

    f = ARRAY_ALLOC(n, t_uint32);
    for (i = 0; i < n; i++)
    {
	x = p - poly[i];
	if (x == p)
	    x = 0;
	f[i] = x;
    }

    F = ALLOC(str_ffpoly);
    F->p = p;
    F->n = n;
    F->q = q;
    F->f = f;

    return F;
}

void ffpoly_free(t_ffpoly F)
{
    ARRAY_FREE(F->f);
    free(F);
}

t_ffpoly_elt ffpoly_zero(t_ffpoly F)
/*
Returns an element of F equal to 0.
*/
{
    return ARRAY_ALLOC(F->n, t_uint32);
}

t_ffpoly_elt ffpoly_one(t_ffpoly F)
/*
Returns an element of F equal to 1.
*/
{
    t_ffpoly_elt	one;

    one = ffpoly_zero(F);
    one[0] = 1;
    return one;
}

void ffpoly_elt_free(t_ffpoly_elt a)
{
    ARRAY_FREE(a);
}

void ffpoly_elt_mult_prim_into(t_ffpoly F, t_ffpoly_elt a, t_ffpoly_elt b)
/*
Sets b to be a*z, where z is the primitive element of F.
*/
{
    t_uint32	p, n, i, x;
    t_uint32	*f;

    p = F->p;
    n = F->n;
    f = F->f;

    x = a[n - 1];
    for (i = n - 1; i > 0; i--)
	b[i] = (a[i - 1] + x*f[i]) % p;
    b[0] = (x*f[0]) % p;
}

t_uint32 ffpoly_elt_index(t_ffpoly F, t_ffpoly_elt a)
/*
Returns a unique integer in [0, p^n) corresponding to a, by treating
the coefficients in a as a base-p representation of an integer.
*/
{
    t_uint32	p, n, index, i;

    p = F->p;
    n = F->n;
    index = a[n - 1];
    for (i = n - 1; i > 0; i--)
	index = index*p + a[i - 1];
    return index;
}


/*************** Finite field zech rep ***************/

/*
This is for field elements represented by their logs with respect
to a primitive element.  All arithmetic operations are carried out
by use of the Zech table.
*/

typedef struct
{
    t_ffpoly	F;
    t_uint32	q,	/* size of field */
    		zero,	/* value representing zero (in practice: q - 1) */
		logm1;	/* log of minus one (0 or (q - 1)/2) */
    t_uint32	*zech;	/* zech table mapping log(x) to log(x + 1) */
} str_ffzech;

typedef	str_ffzech	*t_ffzech;
typedef	t_uint32	t_ffzech_elt;


t_ffzech ffzech_create(t_ffpoly F)
/*
Creates the Zech representation for the ffpoly F.
*/
{
    t_ffzech		Z;
    t_uint32		p, n, q, logm1, log, index, k;
    t_uint32		*itol, *zech;
    t_ffpoly_elt	zk;

    START_TIMER();

    Z = ALLOC(str_ffzech);
    p = F->p;
    n = F->n;
    q = F->q;

    itol = ARRAY_ALLOC(q, t_uint32);
    zech = ARRAY_ALLOC(q, t_uint32);

    zk = ffpoly_one(F);
    for (log = 0; log < q - 1; log++)
    {
	index = ffpoly_elt_index(F, zk);
	itol[index] = log;
	ffpoly_elt_mult_prim_into(F, zk, zk);
    }
    ffpoly_elt_free(zk);

    /* We use q - 1 for the log of zero, completing the permutations */
    log = q - 1;
    index = 0;
    itol[index] = log;

    for (index = 0; index < q; index += p)
    {
	for (k = 0; k < p - 1; k++)
	    zech[itol[index + k]] = itol[index + k + 1];
	zech[itol[index + k]] = itol[index];
    }
    ARRAY_FREE(itol);

    if (p == 2)
	logm1 = 0;
    else
	logm1 = (q - 1) / 2;
    ASSERT(zech[logm1] == q - 1);

    Z->F = F;
    Z->q = q;
    Z->zero = q - 1;
    Z->logm1 = logm1;
    Z->zech = zech;

    STOP_TIMER("zech creation");

    return Z;
}

#define	ZECH_ORDER(Z)	((Z)->q - 1)
#define	ZECH_ZERO(Z)	((Z)->q - 1)
#define	ZECH_ONE	0
#define	ZECH_PRIM	1

void ffzech_free(t_ffzech Z)
{
    ARRAY_FREE(Z->zech);
    ffpoly_free(Z->F);
    free(Z);
}

t_ffzech_elt ffzech_elt_add(t_ffzech Z, t_ffzech_elt a, t_ffzech_elt b)
/*
Returns a + b.
*/
{
    t_uint32	zero, order, c;
    t_uint32	*zech;

    zero = ZECH_ZERO(Z);
    if (a == zero)
	return b;
    if (b == zero)
	return a;

    order = ZECH_ORDER(Z);
    zech = Z->zech;
    if (a <= b)
    {
	c = zech[b - a];
	if (c == zero)
	    return zero;

	c += a;
	if (c >= order)
	    c -= order;
	return c;
    }
    else
    {
	c = zech[a - b];
	if (c == zero)
	    return zero;

	c += b;
	if (c >= order)
	    c -= order;
	return c;
    }
}

t_ffzech_elt ffzech_elt_mult(t_ffzech Z, t_ffzech_elt a, t_ffzech_elt b)
/*
Returns a*b.
*/
{
    t_uint32	zero, order, c;

    zero = ZECH_ZERO(Z);
    if (a == zero)
	return zero;
    if (b == zero)
	return zero;

    order = ZECH_ORDER(Z);
    c = a + b;
    if (c >= order)
	c -= order;
    return c;
}

t_ffzech_elt ffzech_elt_add_mult(t_ffzech Z, t_ffzech_elt a, t_ffzech_elt b, t_ffzech_elt c)
/*
Returns a + b*c.
*/
{
    t_ffzech_elt	d;

    d = ffzech_elt_mult(Z, b, c);
    return ffzech_elt_add(Z, a, d);
}

t_ffzech_elt ffzech_elt_negate(t_ffzech Z, t_ffzech_elt a)
/*
Returns -a.
*/
{
    return ffzech_elt_mult(Z, a, Z->logm1);
}

t_uint32 prime_zech_to_integer(t_ffzech Z, t_ffzech_elt a)
/*
When Z is a prime field, return the integer corresponding to a.
*/
{
    t_ffpoly	F;
    t_uint32	p, z, m, zpow, bitpos;

    if (a == Z->zero)
	return 0;

    if (a == ZECH_ONE)
	return 1;

    F = Z->F;
    ASSERT(F->n == 1);

    p = F->p;
    z = F->f[0];
    m = (t_uint32)a;

    zpow = z;
    bitpos = highbitpos32(m);
    while (bitpos > 0)
    {
	zpow = (zpow*zpow) % p;
	bitpos--;
	if ((m >> bitpos) & 1)
	    zpow = (zpow*z) % p;
    }

    return zpow;
}


/*************** Finite field poly-over-zech rep ***************/

/*
This is for field elements represented as polynomials over a zech-rep
field.  Essentially the only arithmetic operation we need is to
multiply by the primitive element.

We store a few extra items in the field that don't strictly belong
there but are convenient to pass around in this way.
*/

typedef enum
{
    FFPZ_CHAR_2,
    FFPZ_P3_F20,
    FFPZ_CHAR_ODD
} t_ffpz_type;

typedef struct
{
    t_ffzech		Z;
    t_uint32		deg;	/* deg = deg(f), f is over Z */
    t_uint64		q;	/* (Z->q)^deg */
    t_ffzech_elt	*f;	/* primitive elt w has w^n = Sum(f[i]*w^i) */
    t_ffpz_type		type;
    t_uint32		nprimes;
    t_uint32		*primes;
    t_uint64		rad;
    str_multi_iterator	mi;
} str_ffpz;

typedef	str_ffpz	*t_ffpz;
typedef	t_ffzech_elt	*t_ffpz_elt;


t_ffpz ffpz_create(t_ffzech Z, t_uint32 deg, t_ffzech_elt *poly)
/*
Returns the ffpz structure for the given arguments.
*/
{
    t_ffpz		K;
    t_ffzech_elt	*f;
    t_uint32		i;
    t_uint64		Zq, q;
    t_ffpz_type		type;

    f = ARRAY_ALLOC(deg, t_ffzech_elt);
    for (i = 0; i < deg; i++)
	f[i] = ffzech_elt_negate(Z, poly[i]);

    Zq = Z->q;
    q = 1;
    for (i = 0; i < deg; i++)
	q *= Zq;

    if ((Zq & 1) == 0)
	type = FFPZ_CHAR_2;
    else if (Z->F->n == 1 && f[2] == Z->zero)
	type = FFPZ_P3_F20;
    else
	type = FFPZ_CHAR_ODD;

    K = ALLOC(str_ffpz);
    K->Z = Z;
    K->deg = deg;
    K->q = q;
    K->f = f;
    K->type = type;

    return K;
}

void ffpz_store_primes(t_ffpz K, t_uint32 nprimes, t_uint32 *primes)
{
    t_uint64	rad;
    t_uint32	i;

    rad = 1;
    for (i = 0; i < nprimes; i++)
	rad *= (t_uint64)primes[i];

    compute_multi_iterator(&(K->mi), nprimes, primes);

    K->nprimes = nprimes;
    K->primes = primes;
    K->rad = rad;
}

void ffpz_free(t_ffpz K)
{
    if (K->nprimes > 0)
	ARRAY_FREE(K->primes);
    ARRAY_FREE(K->f);
    ffzech_free(K->Z);
    free(K);
}

t_ffpz_elt ffpz_zero(t_ffpz K)
{
    t_uint32		deg, i;
    t_ffzech_elt	zero;
    t_ffpz_elt		A;

    deg = K->deg;
    zero = ZECH_ZERO(K->Z);
    A = ARRAY_ALLOC(deg, t_ffzech_elt);
    for (i = 0; i < deg; i++)
	A[i] = zero;
    return A;
}

t_ffpz_elt ffpz_one(t_ffpz K)
{
    t_ffpz_elt	one;

    one = ffpz_zero(K);
    one[0] = ZECH_ONE;
    return one;
}

void ffpz_elt_free(t_ffpz_elt a)
{
    ARRAY_FREE(a);
}

void ffpz_elt_mult_prim_into(t_ffpz K, t_ffpz_elt a, t_ffpz_elt b)
/*
Sets b to be a*w, where w is the primitive element of K.
*/
{
    t_ffzech		Z;
    t_uint32		deg, i;
    t_ffzech_elt	*f;
    t_ffzech_elt	x;

    Z = K->Z;
    deg = K->deg;
    f = K->f;

    x = a[deg - 1];
    for (i = deg - 1; i > 0; i--)
	b[i] = ffzech_elt_add_mult(Z, a[i - 1], x, f[i]);
    b[0] = ffzech_elt_mult(Z, x, f[0]);
}


/*************** Gamma-alpha table ***************/

t_uint64	*ga_table;
t_uint32	*ga_table_large;
t_uint32	*ga_counts;

void build_gamma_alpha_table_char2(t_ffpz K)
{
    t_ffzech		Z;
    t_uint32		q, q2, index;
    t_uint64		Korder, k;
    t_ffzech_elt	zero;
    t_ffpz_elt		wk;
    t_ffzech_elt	a, g;
    t_uint32		largeadj;
    t_uint64		adjustment;

    START_TIMER();

    IF_VERBOSE(
	printf("Gamma-alpha table: Characteristic two (general case)\n");
    )

    Z = K->Z;
    q = Z->q;
    q2 = q*q;
    Korder = K->q - 1;
    zero = ZECH_ZERO(Z);

    ga_table = ARRAY_ALLOC(q2 + q, t_uint64);
    ga_table_large = ARRAY_ALLOC(q2 + q, t_uint32);

    INIT_MULTI_ITERATOR(K->mi);

    /*
    Note that our multi-iterator lags one step behind k in the loop
    below.  This is intentional!  It produces the right values to make
    the complement adjustment simpler.
    */

    /* adjustment hack to shift iterator to easily-complemented form */
    adjustment = hbit - moduli;
    largeadj = largeprime - 1;

    wk = ffpz_one(K);
    for (k = 1; k < Korder; k++)
    {
	ffpz_elt_mult_prim_into(K, wk, wk);
	if (
	    (wk[2] == ZECH_ONE) ||
	    (wk[2] == zero && wk[1] == ZECH_ONE)
	)
	{
	    if (wk[2] == ZECH_ONE)
		g = wk[1];
	    else
		g = q;
	    a = wk[0];
	    index = q*g + a;

	    ga_table[index] = multicounter - adjustment;
	    if (uselarge)
		ga_table_large[index] = largeadj - largecounter;
	}

	ADVANCE_MULTI_ITERATOR();
    }
    ffpz_elt_free(wk);

    STOP_TIMER("gamma-alpha table");
}

void build_gamma_alpha_table_oddchar(t_ffpz K)
{
    t_ffzech		Z;
    t_uint32		q, q2, base, index;
    t_uint64		Korder, k;
    t_ffzech_elt	zero;
    t_ffzech_elt	a, g;
    t_uint32		largeadj;
    t_uint64		adjustment;
    t_ffzech_elt	wk0, wk1, wk2, f0, f1, f2, u;

    START_TIMER();

    IF_VERBOSE(
	printf("Gamma-alpha table: Odd characteristic (general case)\n");
    )

    Z = K->Z;
    q = Z->q;
    q2 = q*q;
    Korder = K->q - 1;
    zero = ZECH_ZERO(Z);

    ga_table = ARRAY_ALLOC(q2 + q, t_uint64);
    ga_table_large = ARRAY_ALLOC(q2 + q, t_uint32);
    ga_counts = ARRAY_ALLOC(2*(q + 1), t_uint32);

    INIT_MULTI_ITERATOR(K->mi);

    /*
    Note that our multi-iterator lags one step behind k in the loop
    below.  This is intentional!  It produces the right values to make
    the complement adjustment simpler.
    */

    /* adjustment hack to shift iterator to easily-complemented form */
    adjustment = hbit - moduli;
    largeadj = largeprime - 1;

    /*
    We loop through w^k, but we explicitly represent it by its coefficients
    (w^k = wk2*w^2 + wk1*w + wk0) instead of using the ffpz_elt.  This
    means we also have to inline the multiplication by w.  The formulae
    for this are

	u = wk2
	wk2 = u*f2 + wk1
	wk1 = u*f1 + wk0
	wk0 = u*f0
    
    This turns out to save considerable time, although arguably it's not
    worth it compared to the time taken by the beta loop later.
    */
    wk2 = zero;
    wk1 = zero;
    wk0 = ZECH_ONE;
    f2 = K->f[2];
    f1 = K->f[1];
    f0 = K->f[0];
    for (k = 1; k < Korder; k++)
    {
	u = wk2;
	wk2 = ffzech_elt_add_mult(Z, wk1, u, f2);
	wk1 = ffzech_elt_add_mult(Z, wk0, u, f1);
	wk0 = ffzech_elt_mult(Z, u, f0);
	if (wk2 == ZECH_ONE || (wk2 == zero && wk1 == ZECH_ONE))
	{
	    if (wk2 == ZECH_ONE)
		g = wk1;
	    else
		g = q;
	    a = wk0;
	    base = q*g;

	    if (k & 1)
	    {
		index = base + ga_counts[2*g];
		ga_counts[2*g]++;
	    }
	    else
	    {
		ga_counts[2*g + 1]++;
		index = base + q - ga_counts[2*g + 1];
	    }

	    ga_table[index] = multicounter - adjustment;
	    if (uselarge)
		ga_table_large[index] = largeadj - largecounter;
	}
	ADVANCE_MULTI_ITERATOR();
    }

    STOP_TIMER("gamma-alpha table");
}

void build_gamma_alpha_table_p3_f20(t_ffpz K)
{
    t_ffzech		Z;
    t_uint32		q, q2, base, index;
    t_uint64		Korder, k;
    t_ffzech_elt	zero;
    t_ffzech_elt	a, g;
    t_uint32		largeadj;
    t_uint64		adjustment;
    t_uint32		wk0, wk1, wk2, f0, f1, f2, u;
    t_uint64		M;
    t_uint32		Mshift;

    START_TIMER();

    IF_VERBOSE(
	printf("Gamma-alpha table: Odd characteristic (prime case, f2 = 0)\n");
    )

    Z = K->Z;
    q = Z->q;
    q2 = q*q;
    Korder = K->q - 1;
    zero = ZECH_ZERO(Z);

    ga_table = ARRAY_ALLOC(q2 + q, t_uint64);
    ga_table_large = ARRAY_ALLOC(q2 + q, t_uint32);
    ga_counts = ARRAY_ALLOC(2*(q + 1), t_uint32);

    INIT_MULTI_ITERATOR(K->mi);

    /*
    Note that our multi-iterator lags one step behind k in the loop
    below.  This is intentional!  It produces the right values to make
    the complement adjustment simpler.
    */

    /* adjustment hack to shift iterator to easily-complemented form */
    adjustment = hbit - moduli;
    largeadj = largeprime - 1;

    /*
    We loop through w^k, but we explicitly represent it by its coefficients
    (w^k = wk2*w^2 + wk1*w + wk0) instead of using the ffpz_elt.  This
    means we also have to inline the multiplication by w.  The formulae
    for this are

	u = wk2
	wk2 = u*f2 + wk1
	wk1 = u*f1 + wk0
	wk0 = u*f0

    But here we are specialising for the case where f2 = 0 (since this
    pretty much always happens when q is prime), so it simplifies to:

	u = wk2
	wk2 = wk1
	wk1 = u*f1 + wk0
	wk0 = u*f0

    This is the case where q is prime, so we can bypass the zech
    machinery and go straight to machine integers.  However, we have
    to reconstruct the polynomial coefficients, since they were given
    as zech values.

    This all turns out to save considerable time, although arguably
    it's not worth it compared to the time taken by the beta loop later.
    */
    f2 = prime_zech_to_integer(Z, K->f[2]);
    f1 = prime_zech_to_integer(Z, K->f[1]);
    f0 = prime_zech_to_integer(Z, K->f[0]);

    /*
    MOD_BY_MULT: We do reductions mod q < 2^13, and the values being
    reduced are less than q^2 < 2^25 (due to QLIMIT: Sqrt(2^25) = 5792+).
    There will not be overflow since 2*25 + 13 <= 64.
    */
    MOD_INIT(q, 25 + 13);

    wk2 = 0;
    wk1 = 0;
    wk0 = 1;
    for (k = 1; k < Korder; k++)
    {
	u = wk2;
	wk2 = wk1;
	wk1 = u*f1 + wk0;
	MOD_REDUCE(wk1);
	wk0 = u*f0;
	MOD_REDUCE(wk0);

	if (wk2 == 1 || (wk2 == 0 && wk1 == 1))
	{
	    if (wk2 == 1)
		g = wk1;
	    else
		g = q;
	    a = wk0;
	    base = q*g;

	    if (k & 1)
	    {
		index = base + ga_counts[2*g];
		ga_counts[2*g]++;
	    }
	    else
	    {
		ga_counts[2*g + 1]++;
		index = base + q - ga_counts[2*g + 1];
	    }

	    ga_table[index] = multicounter - adjustment;
	    if (uselarge)
		ga_table_large[index] = largeadj - largecounter;
	}
	ADVANCE_MULTI_ITERATOR();
    }

    STOP_TIMER("gamma-alpha table");
}


typedef struct
{
    t_uint64	*B1, *T1;	/* ga_table, and temporary */
    t_uint32	*B2, *T2;	/* ga_table_large, and temporary */
    t_uint32	*counts,
    		*pos;
    t_uint32	p;
    t_uint64	pmask;		/* for extracting values mod p from B1[] */
    t_boolean	uselarge;	/* whether to apply antisort to B2[] */
} str_antisort_data;

int uint32_revcmp(const void *ap, const void *bp)
{
    t_uint32	a, b;

    a = *(t_uint32 *)ap;
    b = *(t_uint32 *)bp;
    if (a > b)
	return -1;
    if (a < b)
	return 1;
    return 0;
}

void antisort_range(str_antisort_data *ad, t_uint32 start, t_uint32 end)
{
    t_uint64	*B1, *T1;
    t_uint32	*B2, *T2, *counts, *pos;
    t_uint32	p;
    t_uint64	pmask;
    t_boolean	uselarge;
    t_uint32	cshift, cmask, chigh, i, vb, pstart, vc, vpos, vcount, next;

    B1 = ad->B1;
    T1 = ad->T1;
    B2 = ad->B2;
    T2 = ad->T2;
    counts = ad->counts;
    pos = ad->pos;
    p = ad->p;
    pmask = ad->pmask;
    uselarge = ad->uselarge;

    /*
    Store packed tuples in counts[], giving a value mod p and the
    number of items of B1 with value.  (Note that these values are
    not the same as the corresponding value of the multicounter;
    all we need is that they are distinct and in [0..p-1].)

    By putting the number of items in the high component, we can
    sort easily to get a (reverse) sort on count.
    */
    cshift = 16;
    cmask = ONES32(cshift);
    chigh = (1 << cshift);

    for (i = 0; i < p; i++)
	counts[i] = i;
    for (i = start; i < end; i++)
    {
	vb = B1[i] & pmask;
	ASSERT(0 <= vb && vb < p);
	counts[vb] += chigh;
    }

    qsort(counts, p, sizeof(t_uint32), uint32_revcmp);

    /*
    Now re-bucket the B[] arrays into T[], using pos[] to determine
    the positions of each bucket.
    */
    pstart = start;
    for (i = 0; i < p; i++)
    {
	vc = counts[i];
	vpos = vc & cmask;
	vcount = vc >> cshift;
	pos[vpos] = pstart;
	pstart += vcount;
    }
    ASSERT(pstart == end);
    for (i = start; i < end; i++)
    {
	vb = B1[i] & pmask;
	vpos = pos[vb];
	T1[vpos] = B1[i];
	if (uselarge)
	    T2[vpos] = B2[i];
	pos[vb] = vpos + 1;
    }

    /*
    And redistribute the T[] arrays back into B[], using one entry
    from each bucket at a time.
    */
    pstart = start;
    for (i = 0; i < p; i++)
    {
	vcount = counts[i] >> cshift;
	pos[i] = pstart;
	pstart += vcount;
	counts[i] = vcount;
    }
    ASSERT(pstart == end);

    next = 0;
    for (i = start; i < end; i++)
    {
	vpos = pos[next];
	B1[i] = T1[vpos];
	if (uselarge)
	    B2[i] = T2[vpos];
	pos[next] = vpos + 1;
	counts[next]--;
	next++;
	if (counts[next] == 0)
	    next = 0;
    }
}

void antisort_ga_tables(t_ffpz K)
{
    t_uint32		q, p, g;
    t_uint64		pmask;
    t_boolean		uselarge, char2;
    str_antisort_data	ad;

    START_TIMER();

    q = K->Z->q;
    p = K->primes[0];
    if (p == 2)
	p = K->primes[1];
    if (2*p > q)
    {
	IF_VERBOSE(
	    printf("Smallest odd prime (%u) is too large -- skipping antisort\n", p);
	)
	return;
    }

    pmask = ONES64(K->mi.shift);
    uselarge = K->mi.uselarge;

    ad.B1 = ga_table;
    ad.T1 = ARRAY_ALLOC(q*q + q, t_uint64);
    if (uselarge)
    {
	ad.B2 = ga_table_large;
	ad.T2 = ARRAY_ALLOC(q*q + q, t_uint32);
    }
    ad.counts = ARRAY_ALLOC(p, t_uint32);
    ad.pos = ARRAY_ALLOC(p, t_uint32);
    ad.p = p;
    ad.pmask = pmask;
    ad.uselarge = uselarge;

    char2 = ((q & 1) == 0);
    if (char2)
    {
	for (g = 0; g < q + 1; g++)
	    antisort_range(&ad, q*g, q*g + q);
    }
    else
    {
	for (g = 0; g < q + 1; g++)
	{
	    t_uint32	oddstart, oddend, evenstart, evenend;

	    oddstart = q*g;
	    oddend = oddstart + ga_counts[2*g];
	    evenstart = oddend;
	    evenend = evenstart + ga_counts[2*g + 1];
	    ASSERT(evenend == q*g + q);

	    antisort_range(&ad, oddstart, oddend);
	    antisort_range(&ad, evenstart, evenend);
	}
    }

    ARRAY_FREE(ad.pos);
    ARRAY_FREE(ad.counts);
    if (uselarge)
	ARRAY_FREE(ad.T2);
    ARRAY_FREE(ad.T1);

    STOP_TIMER("antisort");
}

void build_gamma_alpha_table(t_ffpz K)
{
    switch (K->type)
    {
    case FFPZ_CHAR_2:
	build_gamma_alpha_table_char2(K);
	break;

    case FFPZ_P3_F20:
	build_gamma_alpha_table_p3_f20(K);
	break;

    case FFPZ_CHAR_ODD:
	build_gamma_alpha_table_oddchar(K);
	break;
    }

    if (opt_antisort)
	antisort_ga_tables(K);
}

void print_failure(t_ffpz K, t_uint32 q, t_uint64 logbeta, t_uint32 g)
{
    printf("q = %u:  FAIL for beta = ", q);

    if (logbeta == 0)
	printf("1");
    else if (logbeta == 1)
	printf("w");
    else
	printf("w^%lu", logbeta);

    printf(", gamma = ");

    if (K->type == FFPZ_P3_F20)
    {
	if (g == q)
	    printf("w");
	else if (g == 0)
	    printf("w^2");
	else if (g == 1)
	    printf("w^2 + w");
	else
	    printf("w^2 + %u*w", g);
    }
    else
    {
	if (g == q)
	    printf("w");
	else if (g == ZECH_ZERO(K->Z))
	    printf("w^2");
	else if (g == ZECH_ONE)
	    printf("w^2 + w");
	else if (g == 1)
	    printf("w^2 + z*w");
	else
	    printf("w^2 + z^%u*w", g);
    }

    printf("\n");
}

t_boolean gamma_beta_loop_char2(t_ffpz K, t_uint32 gstart, t_uint32 gend)
{
    t_uint32	q, q2, g, start, end, index, largebit;
    t_uint64	logbeta, rad, zero, x, M0;
    t_boolean	allgood;

    allgood = TRUE;

    q = K->Z->q;
    q2 = q*q;

    rad = K->rad;
    if (rad == 0)
	rad = K->q - 1;

    INIT_MULTI_ITERATOR(K->mi);

    largebit = 0;
    if (uselarge)
	largebit = ((t_uint32)1) << (1 + highbitpos32(largeprime));

    M0 = multicounter;
    for (g = gstart; g < gend; g++)
    {
	ASSERT(multicounter == M0);
	if (uselarge)
	    ASSERT(largecounter == 0);

	start = q*g;
	end = start + q;

	for (logbeta = 0; logbeta < rad; logbeta++)
	{
	    for (index = start; index < end; index++)
	    {
		x = ga_table[index];
		zero = ((x ^ multicounter) + lbit) & hbit;
		if (!zero)
		{
		    if (uselarge)
		    {
			if (ga_table_large[index] == largecounter)
			    continue;
		    }
		    goto NEXT_BETA;
		}
	    }

	    print_failure(K, q, logbeta, g);
	    allgood = FALSE;
	    if (!opt_find_all)
		exit(2);

	NEXT_BETA:
	    ADVANCE_MULTI_ITERATOR();
	}
    }

    return allgood;
}

t_boolean gamma_beta_loop_oddchar_small(t_ffpz K, t_uint32 gstart, t_uint32 gend)
{
    t_uint32	q, q2, g, oddstart, oddend, evenstart, evenend, index;
    t_uint64	logbeta, rad, halfrad, zero, x, M0;
    t_boolean	allgood;

    allgood = TRUE;

    q = K->Z->q;
    q2 = q*q;

    rad = K->rad;
    if (rad == 0)
	rad = K->q - 1;
    ASSERT((rad % 2) == 0);

    halfrad = rad / 2;
    ASSERT((halfrad % 2) == 1);

    INIT_MULTI_ITERATOR(K->mi);
    ASSERT(!uselarge);

    M0 = multicounter;
    for (g = gstart; g < gend; g++)
    {
	ASSERT(haszero);
	ASSERT(multicounter == M0);

	oddstart = q*g;
	oddend = oddstart + ga_counts[2*g];
	evenstart = oddend;
	evenend = evenstart + ga_counts[2*g + 1];

	for (logbeta = 0; logbeta < halfrad; logbeta++)
	{
	    for (index = oddstart; index < oddend; index++)
	    {
		x = ga_table[index];
		zero = ((x ^ multicounter) + lbit) & hbit;
		if (!zero)
		    goto EVEN_ALPHA;
	    }

	    if (logbeta & 1)
		print_failure(K, q, logbeta + halfrad, g);
	    else
		print_failure(K, q, logbeta, g);
	    allgood = FALSE;
	    if (!opt_find_all)
		exit(2);

	EVEN_ALPHA:
	    for (index = evenstart; index < evenend; index++)
	    {
		x = ga_table[index];
		zero = ((x ^ multicounter) + lbit) & hbit;
		if (!zero)
		    goto NEXT_BETA;
	    }

	    if (logbeta & 1)
		print_failure(K, q, logbeta, g);
	    else
		print_failure(K, q, logbeta + halfrad, g);
	    allgood = FALSE;
	    if (!opt_find_all)
		exit(2);

	NEXT_BETA:
	    ADVANCE_MULTI_ITERATOR_SMALL();
	}
    }

    return allgood;
}

t_boolean gamma_beta_loop_oddchar_large(t_ffpz K, t_uint32 gstart, t_uint32 gend)
{
    t_uint32	q, q2, g, oddstart, oddend, evenstart, evenend, index;
    t_uint64	logbeta, rad, halfrad, zero, x, M0;
    t_boolean	allgood;

    allgood = TRUE;

    q = K->Z->q;
    q2 = q*q;

    rad = K->rad;
    if (rad == 0)
	rad = K->q - 1;
    ASSERT((rad % 2) == 0);

    halfrad = rad / 2;
    ASSERT((halfrad % 2) == 1);

    INIT_MULTI_ITERATOR(K->mi);
    ASSERT(uselarge);

    M0 = multicounter;
    for (g = gstart; g < gend; g++)
    {
	ASSERT(haszero);
	ASSERT(multicounter == M0);
	ASSERT(largecounter == 0);

	oddstart = q*g;
	oddend = oddstart + ga_counts[2*g];
	evenstart = oddend;
	evenend = evenstart + ga_counts[2*g + 1];

	for (logbeta = 0; logbeta < halfrad; logbeta++)
	{
	    for (index = oddstart; index < oddend; index++)
	    {
		x = ga_table[index];
		zero = ((x ^ multicounter) + lbit) & hbit;
		if (!zero)
		{
		    if (ga_table_large[index] == largecounter)
			continue;
		    goto EVEN_ALPHA;
		}
	    }

	    if (logbeta & 1)
		print_failure(K, q, logbeta + halfrad, g);
	    else
		print_failure(K, q, logbeta, g);
	    allgood = FALSE;
	    if (!opt_find_all)
		exit(2);

	EVEN_ALPHA:
	    for (index = evenstart; index < evenend; index++)
	    {
		x = ga_table[index];
		zero = ((x ^ multicounter) + lbit) & hbit;
		if (!zero)
		{
		    if (ga_table_large[index] == largecounter)
			continue;
		    goto NEXT_BETA;
		}
	    }

	    if (logbeta & 1)
		print_failure(K, q, logbeta, g);
	    else
		print_failure(K, q, logbeta + halfrad, g);
	    allgood = FALSE;
	    if (!opt_find_all)
		exit(2);

	NEXT_BETA:
	    ADVANCE_MULTI_ITERATOR_LARGE();
	}
    }

    return allgood;
}

t_boolean gamma_beta_loop_range(t_ffpz K, t_uint32 gstart, t_uint32 gend)
{
    switch (K->type)
    {
    case FFPZ_CHAR_2:
	return gamma_beta_loop_char2(K, gstart, gend);
	break;

    case FFPZ_P3_F20:
    case FFPZ_CHAR_ODD:
	if (K->mi.uselarge)
	    return gamma_beta_loop_oddchar_large(K, gstart, gend);
	else
	    return gamma_beta_loop_oddchar_small(K, gstart, gend);
	break;
    }
}

typedef struct
{
    t_ffpz	K;
    t_uint32	gstart, gend;
    t_boolean	allgood;
} str_thread_info;

void *gamma_beta_loop_worker(void *vinfo)
{
    str_thread_info	*info;

    info = (str_thread_info *)vinfo;
    info->allgood = gamma_beta_loop_range(info->K, info->gstart, info->gend);
    return NULL;
}

t_boolean gamma_beta_loop(t_ffpz K)
{
    t_uint32		q, i, A, B, gstart, gend;
    pthread_t		*threads;
    str_thread_info	*args;
    t_boolean		allgood;

    ASSERT(opt_nthreads >= 1);

    START_TIMER();

    q = K->Z->q;

    IF_VERBOSE( 
	switch (K->type)
	{
	case FFPZ_CHAR_2:
	    printf("Gamma-beta loop: Characteristic two (general case)\n");
	    break;

	case FFPZ_P3_F20:
	case FFPZ_CHAR_ODD:
	    if (K->mi.uselarge)
		printf("Gamma-beta loop: Odd characteristic (large case)\n");
	    else
		printf("Gamma-beta loop: Odd characteristic (small case)\n");
	    break;
	}
	if (opt_nthreads == 1)
	    printf("Using 1 thread\n");
	else
	    printf("Using %u threads\n", opt_nthreads);
    )

    threads = ARRAY_ALLOC(opt_nthreads, pthread_t);
    args = ARRAY_ALLOC(opt_nthreads, str_thread_info);

    A = (q + 1) / opt_nthreads;
    B = (q + 1) % opt_nthreads;
    gend = 0;
    for (i = 0; i < opt_nthreads; i++)
    {
	gstart = gend;
	gend = gstart + A;
	if (i < B)
	    gend++;

	args[i].K = K;
	args[i].gstart = gstart;
	args[i].gend = gend;
    }
    ASSERT(gend == q + 1);

    /* thread 0 is this main thread */
    for (i = 1; i < opt_nthreads; i++)
    {
	if (pthread_create(&(threads[i]), NULL, gamma_beta_loop_worker, &(args[i])))
	{
	    fprintf(stderr, "Error launching thread %u\n", i);
	    exit(1);
	}
    }

    gamma_beta_loop_worker(&(args[0]));

    for (i = 1; i < opt_nthreads; i++)
	pthread_join(threads[i], NULL);

    allgood = TRUE;
    for (i = 0; i < opt_nthreads; i++)
	if (!args[i].allgood)
	    allgood = FALSE;

    ARRAY_FREE(args);
    ARRAY_FREE(threads);

    STOP_TIMER("gamma-beta loop");

    return allgood;
}


/*************** Main driver routine ***************/

void usage(t_boolean extended)
{
    fprintf(stderr, "Usage: L3check [-h] [-m] [-v] [-na] [-t nthreads] [-all] p <Fq-poly> <Fq^3-poly> <prime list>\n");
    fprintf(stderr, "    -h      extended usage message with (long) explanation of argument format\n");
    fprintf(stderr, "    -m      show Magma code to generate arguments\n");
    fprintf(stderr, "    -v      verbose printing\n");
    fprintf(stderr, "    -na     disable the antisort step\n");
    fprintf(stderr, "    -t <n>  use <n> threads for the main search\n");
    fprintf(stderr, "    -all    continue searching even if a counterexample is found\n");

    if (!extended)
	return;

    fprintf(stderr, "\n");
    fprintf(stderr, "The Fq-poly gives the coefficients of a polynomial over GF(p).  The first item is the degree, n, then the next n + 1 items are the polynomial coefficients starting with the constant term.  This polynomial is required to be monic and primitive, so that its root is a primitive element of GF(q) (where q = p^n).\n\n");
    fprintf(stderr, "In the common case where q is prime, the polynomial will be x - z where z is some primitive element mod p.  The coefficients will then be p-z 1, and the full argument list for the polynomial would be 1 p-z 1.\n\n");
    fprintf(stderr, "Examples: For q = 7 where 3 is a primitive element, the Fq-poly part of the argument list would be 1 4 1.  For q = 8, where the appropriate minimal polynomial is x^3 + x + 1, we would have 3 1 1 0 1.\n\n");

    fprintf(stderr, "\n");
    fprintf(stderr, "The Fq^3-poly gives the coefficients of a polynomial over GF(q).  Again, the first item is the degree (which will always be 3), and the ensuing values are the coefficients starting with the constant term.  Again, this polynomial is required to be monic and primitive, so that its root is a primitive element of GF(q^3).\n\n");
    fprintf(stderr, "The coefficients are specified in log form with respect to the primitive element of GF(q).  That is, if the coefficient c = z^e, then it is e that is used as the argument.  To represent 0, the special value q - 1 is used.\n\n");
    fprintf(stderr, "Examples: For q = 7, a suitable minimal polynomial is x^3 + 6*x^2 + 4.  In GF(7), 1 is 3^0, 6 is 3^3, and 4 is 3^4; 0 is represented by 7 - 1 = 6.  The appropriate coefficients are thus 4 6 3 0, and the Fq^3-poly part of the argument list would be 3 4 6 3 0.\n\n");
    fprintf(stderr, "For q = 8, a minimal polynomial is x^3 + z*x^2 + z^5*x + z, where z is a root of t^3 + t + 1 = 0 in GF(8).  The argument list component is thus 3 1 5 1 0.\n\n");

    fprintf(stderr, "\n");
    fprintf(stderr, "The prime list gives the primes that divide q^3 - 1.  It consists simply of the number of such primes, followed by those primes in ascending order.\n\n");
    fprintf(stderr, "Examples: For q = 7, 7^3 - 1 = 342 = 2 * 3^2 * 19, and so the prime list would be 3 2 3 19.  For q = 8, 8^3 - 1 = 511 = 7*73, so the prime list would be 2 7 73.\n\n");

    fprintf(stderr, "\n");
    fprintf(stderr, "Putting all of the above together, we can see that the appropriate invocations for the example values are:\n\n");
    fprintf(stderr, "    q = 7:   L3check 7 1 4 1 3 4 6 3 0 3 2 3 19\n");
    fprintf(stderr, "    q = 8:   L3check 2 3 1 1 0 1 3 1 5 1 0 2 7 73\n\n");

    fprintf(stderr, "\n");
    fprintf(stderr, "Magma code to automate the generation of these arguments can be displayed by using the '-m' argument\n");
    fprintf(stderr, "\n");
}

void usage_exit()
{
    usage(FALSE);
    exit(1);
}

void show_magma_code()
{
    printf("\
L3args := function(q)\n\
    F := GF(q);\n\
    p := Characteristic(F);\n\
    n := Degree(F);\n\
    a := PrimitiveElement(F);\n\
    pol1 := MinimalPolynomial(a);\n\
    C1 := ChangeUniverse(Eltseq(pol1), Integers());\n\
    K := ext<F | 3>;\n\
    w := PrimitiveElement(K);\n\
    pol2 := MinimalPolynomial(w, F);\n\
    C2 := [ IsZero(c) select q-1 else Log(a, c) : c in Eltseq(pol2) ];\n\
    primes := [ t[1] : t in Factorisation(q^3 - 1) ];\n\
    args := [p,n] cat C1 cat [3] cat C2 cat [#primes] cat primes;\n\
    return Prune(&cat [ IntegerToString(x) cat \" \" : x in args ]);\n\
end function;\n\
");
}

void init_options()
{
    opt_verbose = FALSE;
    opt_antisort = TRUE;
    opt_nthreads = 1;
    opt_find_all = FALSE;
}

#define	REQUIRE_ARGS(n, code)						\
    {									\
	if (argc < n)							\
	    usage_exit();						\
	{ code }							\
	argc -= n;							\
	argv += n;							\
    }

void process_options(int *argcp, char ***argvp)
{
    int		argc;
    char	**argv;

    argc = *argcp;
    argv = *argvp;

    while (argc > 0 && argv[0][0] == '-')
    {
	if (strcmp(argv[0], "-h") == 0)
	{
	    usage(TRUE);
	    exit(0);
	}
	else if (strcmp(argv[0], "-m") == 0)
	{
	    show_magma_code();
	    exit(0);
	}
	else if (strcmp(argv[0], "-v") == 0)
	{
	    REQUIRE_ARGS(1,
		opt_verbose = TRUE;
	    )
	}
	else if (strcmp(argv[0], "-na") == 0)
	{
	    REQUIRE_ARGS(1,
		opt_antisort = FALSE;
	    )
	}
	else if (strcmp(argv[0], "-t") == 0)
	{
	    REQUIRE_ARGS(2,
		opt_nthreads = atoi(argv[1]);
	    )
	    if (opt_nthreads < 1)
	    {
		fprintf(stderr, "Invalid number of threads specified\n");
		usage_exit();
	    }
	}
	else if (strcmp(argv[0], "-all") == 0)
	{
	    REQUIRE_ARGS(1,
		opt_find_all = TRUE;
	    )
	}
	else
	{
	    fprintf(stderr, "Unrecognised option '%s'\n", argv[0]);
	    usage_exit();
	}
    }

    *argcp = argc;
    *argvp = argv;
}

t_uint32 *args_to_array(char *argv[], t_uint32 n)
{
    t_uint32	*arr;
    t_uint32	i;

    arr = ARRAY_ALLOC(n, t_uint32);
    for (i = 0; i < n; i++)
	arr[i] = atoi(argv[i]);
    return arr;
}

t_ffpz args_to_field(int argc, char *argv[])
{
    t_uint32		p, n, deg;
    t_uint32		*Fqpoly;
    t_ffpoly		F;
    t_ffzech		Z;
    t_ffzech_elt	*Fq3poly;
    t_ffpz		K;
    t_uint32		nprimes;
    t_uint32		*primes;

    REQUIRE_ARGS(1, /* skip past program name */)
    process_options(&argc, &argv);

    REQUIRE_ARGS(2,
	p = atoi(argv[0]);
	n = atoi(argv[1]);
    )

    REQUIRE_ARGS(n + 1,
	Fqpoly = args_to_array(argv, n + 1);
	if (Fqpoly[n] != 1)
	    usage_exit();
    )

    F = ffpoly_create(p, n, Fqpoly);
    ARRAY_FREE(Fqpoly);
    if (F->q > QLIMIT)
    {
	fprintf(stderr, "q = p^n must be at most %u\n", QLIMIT);
	usage_exit();
    }

    Z = ffzech_create(F);

    REQUIRE_ARGS(1,
	deg = atoi(argv[0]);
    )
    REQUIRE_ARGS(deg + 1,
	Fq3poly = args_to_array(argv, deg + 1);
	if (Fq3poly[deg] != 0)
	    usage_exit();
    )

    K = ffpz_create(Z, deg, Fq3poly);
    ARRAY_FREE(Fq3poly);

    REQUIRE_ARGS(1,
	nprimes = atoi(argv[0]);
    )
    REQUIRE_ARGS(nprimes,
	primes = args_to_array(argv, nprimes);
    )

    ffpz_store_primes(K, nprimes, primes);

    return K;
}

int main(int argc, char *argv[])
{
    t_ffpz		K;

    init_options();
    K = args_to_field(argc, argv);
    build_gamma_alpha_table(K);
    if (gamma_beta_loop(K))
	printf("q = %u:  SUCCESS\n", K->Z->q);

    return 0;
}