Macros
#define	CLEAR_FLAGS(empty_reg) "xorq " empty_reg ", " empty_reg " \n\t"

#define	LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi)

#define	STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi)

#define	ADD(b)

#define	SUB(b)

#define	ADD_REDUCE(b, twice_not_modulus_0, twice_not_modulus_1, twice_not_modulus_2, twice_not_modulus_3)

#define	CONDITIONAL_ADD(b_0, b_1, b_2, b_3)

#define	MUL(a1, a2, a3, a4, b)

Macro Definition Documentation

◆ ADD

#define ADD ( b )

Value:

        "addq 0(" b "), %%r12                   \n\t"                                                                   \
        "adcq 8(" b "), %%r13                   \n\t"                                                                   \
        "adcq 16(" b "), %%r14                  \n\t"                                                                   \
        "adcq 24(" b "), %%r15                  \n\t"

Take a 4-limb field element, in (r12, r13, r14, r15), and add 4-limb field element pointed to by b

Definition at line 42 of file asm_macros.hpp.

◆ ADD_REDUCE

#define ADD_REDUCE	(	b,
		twice_not_modulus_0,
		twice_not_modulus_1,
		twice_not_modulus_2,
		twice_not_modulus_3
	)

Value:

        "addq 0(" b "), %%r12                   \n\t"                                                                   \
        "adcq 8(" b "), %%r13                   \n\t"                                                                   \
        "adcq 16(" b "), %%r14                  \n\t"                                                                   \
        "adcq 24(" b "), %%r15                  \n\t"                                                                   \
        "movq  %%r12, %%r8                      \n\t"                                                                   \
        "movq %%r13, %%r9                       \n\t"                                                                   \
        "movq %%r14, %%r10                      \n\t"                                                                   \
        "movq %%r15, %%r11                      \n\t"                                                                   \
        "addq " twice_not_modulus_0 ", %%r12    \n\t" /* r'[0] += ~(2p)[0]+1  (subtract 2p via two's complement) */     \
        "adcq " twice_not_modulus_1 ", %%r13    \n\t" /* r'[1] += ~(2p)[1] */                                           \
        "adcq " twice_not_modulus_2 ", %%r14    \n\t" /* r'[2] += ~(2p)[2] */                                           \
        "adcq " twice_not_modulus_3 ", %%r15    \n\t" /* r'[3] += ~(2p)[3] */                                           \
        "cmovncq %%r8, %%r12                    \n\t"                                                                   \
        "cmovncq %%r9, %%r13                    \n\t"                                                                   \
        "cmovncq %%r10, %%r14                   \n\t"                                                                   \
        "cmovncq %%r11, %%r15                   \n\t"

Take a 4-limb field element, in (r12, r13, r14, r15), add 4-limb field element pointed to by b, and reduce modulo 2p Works as follows: adds the number, moves the result, adds (twos complement of 2p), conditionally restores.

Note: the 2p is because for 254 bit fields, we use the coarse representation.

Definition at line 66 of file asm_macros.hpp.

◆ CLEAR_FLAGS

#define CLEAR_FLAGS ( empty_reg ) "xorq " empty_reg ", " empty_reg " \n\t"

Definition at line 13 of file asm_macros.hpp.

◆ CONDITIONAL_ADD

#define CONDITIONAL_ADD	(	b_0,
		b_1,
		b_2,
		b_3
	)

Value:

        /* Duplicate `r` */                                                                                             \
        "movq %%r12, %%r8                       \n\t"                                                                   \
        "movq %%r13, %%r9                       \n\t"                                                                   \
        "movq %%r14, %%r10                      \n\t"                                                                   \
        "movq %%r15, %%r11                      \n\t"                                                                   \
        "addq " b_0 ", %%r12                    \n\t" /* r'[0] += b[0] */                                               \
        "adcq " b_1 ", %%r13                    \n\t" /* r'[1] += b[1] */                                               \
        "adcq " b_2 ", %%r14                    \n\t" /* r'[2] += b[2] */                                               \
        "adcq " b_3 ", %%r15                    \n\t" /* r'[3] += b[3] */                                               \
                                                                                                                        \
        /* if the addition did not carry, restore the original r */                                                     \
        "cmovncq %%r8, %%r12                    \n\t"                                                                   \
        "cmovncq %%r9, %%r13                    \n\t"                                                                   \
        "cmovncq %%r10, %%r14                   \n\t"                                                                   \
        "cmovncq %%r11, %%r15                   \n\t"

Takes a 4-limb integer, r in (r12, r13, r14, r15), and conditionally adds (b_0, b_1, b_2, b_3). Computes r' = r + b. If there is a carry, replace r with r'; otherwise leave r untouched. Two use cases:

asm_reduce_once: b = -p. If r >= p, return r - p.
asm_sub_with_coarse_reduction: b = 2p. If r wrapped negative, return r + 2p.

Definition at line 95 of file asm_macros.hpp.

◆ LOAD_FIELD_ELEMENT

#define LOAD_FIELD_ELEMENT	(	a,
		lolo,
		lohi,
		hilo,
		hihi
	)

Value:

        "movq 0(" a "), " lolo "                \n\t"                                                                   \
        "movq 8(" a "), " lohi "                \n\t"                                                                   \
        "movq 16(" a "), " hilo "               \n\t"                                                                   \
        "movq 24(" a "), " hihi "               \n\t"

Load 4-limb field element, pointed to by a, into registers (lolo, lohi, hilo, hihi)

Definition at line 20 of file asm_macros.hpp.

◆ MUL

#define MUL	(	a1,
		a2,
		a3,
		a4,
		b
	)

We don't have a bespoke non-ADX SQR implementation. The former implementation had a rare bug, so in practice we call MUL(a, a) instead. Montgomery multiplication (non-ADX): computes a * b * R^{-1} mod p, with output in [0, 2p).

Algorithm: CIOS (Coarsely Integrated Operand Scanning). Set S = 0. For i = 0..3: S += a[i] * b (multiply-accumulate) k = S[0] * (-p^{-1}) mod 2^64 (Montgomery quotient) S += k * p (cancel lowest limb: S[0] + k*p[0] = 0 mod 2^64) S >>= 64 (shift out the zeroed limb) Output S = (a*b + K*p) / R, where K = sum(k_i * 2^{64i}), R = 2^{256}. Since a, b < 2p < 2^{255} and 4p < R, the output satisfies S < 2p.

Modulus assumption: p < 2^{254}, so p[3] < 2^{62}. All bit-width bounds below rely on this. (For BN254 and grumpkin, p[3] ~ 0x30644e72... < 2^{62}.)

Registers: rdx = multiplier (a[i] during multiply, k during reduce; implicit source for mulxq) r8, r9, rdi, r11 = scratch (partial products from mulxq) r[0..4+] = accumulator limbs, rotating each round (see per-round maps below)

Carry chain structure (same pattern each round): The reduction of k*p uses THREE separate addq/adcq chains (A, B, C) to add the 8 partial products (lo/hi of k*p[0..3]) into 5 limb positions. Each addq kills the previous CF, so the killed chain's terminal CF must be 0. This holds because the top limb stays < 2^64.

mulxq is flag-preserving: it does NOT modify CF. This is critical because mulxq instructions appear between addq (which starts a carry chain) and subsequent adcq instructions.

Bit-width invariant: entering each round i >= 1, all limbs are < 2^{64} and the top limb is <= 2^{63} + 2^{62} + 5. This ensures every chain-terminal CF = 0 (no carry is lost).

Result is stored in (%r12, %r13, %r14, %r15) for STORE_FIELD_ELEMENT.

Definition at line 150 of file asm_macros.hpp.

◆ STORE_FIELD_ELEMENT

#define STORE_FIELD_ELEMENT	(	r,
		lolo,
		lohi,
		hilo,
		hihi
	)

Value:

        "movq " lolo ", 0(" r ")                \n\t"                                                                   \
        "movq " lohi ", 8(" r ")                \n\t"                                                                   \
        "movq " hilo ", 16(" r ")               \n\t"                                                                   \
        "movq " hihi ", 24(" r ")               \n\t"

Store 4-limb field element located in registers (lolo, lohi, hilo, hihi), into memory pointed to by r

Definition at line 31 of file asm_macros.hpp.

◆ SUB

#define SUB ( b )

Value:

        "subq 0(" b "), %%r12                   \n\t"                                                                   \
        "sbbq 8(" b "), %%r13                   \n\t"                                                                   \
        "sbbq 16(" b "), %%r14                  \n\t"                                                                   \
        "sbbq 24(" b "), %%r15                  \n\t"

Take a 4-limb field element, in (r12, r13, r14, r15), and subtract 4-limb field element pointed to by b

Definition at line 52 of file asm_macros.hpp.

Macros

Macro Definition Documentation

◆ ADD

◆ ADD_REDUCE

◆ CLEAR_FLAGS

◆ CONDITIONAL_ADD

◆ LOAD_FIELD_ELEMENT

◆ MUL

◆ STORE_FIELD_ELEMENT

◆ SUB