Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
asm_macros.hpp
Go to the documentation of this file.
// === AUDIT STATUS ===
// internal: { status: Completed, auditors: [Raju], commit: }
// external_1: { status: not started, auditors: [], commit: }
// external_2: { status: not started, auditors: [], commit: }
// =====================
6
7#pragma once
8// clang-format off
9
/**
 * CLEAR_FLAGS(empty_reg): emit `xorq reg, reg`.
 *
 * XOR-ing a register with itself zeroes the register and, as defined for the XOR instruction,
 * clears both CF and OF. (ZF/PF/SF are set from the zero result, so this is "clear the carry and
 * overflow flags", not literally every flag.)
 *
 * @param empty_reg  string literal naming a scratch register (e.g. "%%r8"). Its previous value is
 *                   destroyed. The argument is pasted next to the surrounding string literals, so
 *                   it must itself be a string literal.
 **/
#define CLEAR_FLAGS(empty_reg) \
    "xorq " empty_reg ", " empty_reg " \n\t"
15
/**
 * LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi): load four consecutive 64-bit words from memory.
 *
 * Emits four `movq` loads from byte offsets 0, 8, 16 and 24 relative to the address held in `a`.
 *
 * @param a     string literal: register operand holding the base address
 * @param lolo  string literal: destination register for the word at offset 0
 * @param lohi  string literal: destination register for the word at offset 8
 * @param hilo  string literal: destination register for the word at offset 16
 * @param hihi  string literal: destination register for the word at offset 24
 *
 * movq does not modify flags, so this can be placed inside a carry chain.
 */
#define LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi) \
    "movq 0(" a "), " lolo " \n\t" \
    "movq 8(" a "), " lohi " \n\t" \
    "movq 16(" a "), " hilo " \n\t" \
    "movq 24(" a "), " hihi " \n\t"
25
/**
 * STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi): store four 64-bit registers to memory.
 *
 * Emits four `movq` stores to byte offsets 0, 8, 16 and 24 relative to the address held in `r`.
 *
 * @param r     string literal: register operand holding the destination base address
 * @param lolo  string literal: source register written to offset 0
 * @param lohi  string literal: source register written to offset 8
 * @param hilo  string literal: source register written to offset 16
 * @param hihi  string literal: source register written to offset 24
 *
 * movq does not modify flags.
 */
#define STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi) \
    "movq " lolo ", 0(" r ") \n\t" \
    "movq " lohi ", 8(" r ") \n\t" \
    "movq " hilo ", 16(" r ") \n\t" \
    "movq " hihi ", 24(" r ") \n\t"
36
37#if !defined(__ADX__) || defined(DISABLE_ADX)
/**
 * ADD(b): 4-limb addition, (r12, r13, r14, r15) += b[0..3]   (addq/adcq variant).
 *
 * The accumulator lives in r12 (least significant limb) .. r15 (most significant limb).
 * The first limb uses addq, which ignores any incoming carry; the remaining limbs use adcq to
 * propagate the carry. The carry out of the top limb is left in CF.
 *
 * @param b  string literal: register operand holding the address of the 4-limb addend
 *           (limbs read at byte offsets 0, 8, 16, 24).
 */
#define ADD(b) \
    "addq 0(" b "), %%r12 \n\t" \
    "adcq 8(" b "), %%r13 \n\t" \
    "adcq 16(" b "), %%r14 \n\t" \
    "adcq 24(" b "), %%r15 \n\t"
47
/**
 * SUB(b): 4-limb subtraction, (r12, r13, r14, r15) -= b[0..3].
 *
 * The first limb uses subq (no incoming borrow); the remaining limbs use sbbq to propagate the
 * borrow. The borrow out of the top limb is left in CF.
 *
 * @param b  string literal: register operand holding the address of the 4-limb subtrahend
 *           (limbs read at byte offsets 0, 8, 16, 24).
 */
#define SUB(b) \
    "subq 0(" b "), %%r12 \n\t" \
    "sbbq 8(" b "), %%r13 \n\t" \
    "sbbq 16(" b "), %%r14 \n\t" \
    "sbbq 24(" b "), %%r15 \n\t"
57
58
/**
 * ADD_REDUCE(b, twice_not_modulus_0..3): add b to the accumulator, then conditionally subtract 2p
 * (addq/adcq variant).
 *
 * Steps:
 *   1. (r12..r15) += b[0..3]
 *   2. copy the sum into (r8..r11)
 *   3. (r12..r15) += twice_not_modulus[0..3]; per the inline comments this constant is the two's
 *      complement of 2p, so the addition is a subtraction of 2p
 *   4. if step 3 produced no carry out (CF = 0), restore the unreduced sum from (r8..r11)
 *
 * @param b                      string literal: register operand holding the address of the addend
 * @param twice_not_modulus_0..3 string literals: operands for limbs 0..3 of the reduction constant
 *
 * Clobbers r8, r9, r10, r11 (left holding the unreduced sum) and the flags.
 */
#define ADD_REDUCE(b, twice_not_modulus_0, twice_not_modulus_1, twice_not_modulus_2, twice_not_modulus_3) \
    /* r += b */ \
    "addq 0(" b "), %%r12 \n\t" \
    "adcq 8(" b "), %%r13 \n\t" \
    "adcq 16(" b "), %%r14 \n\t" \
    "adcq 24(" b "), %%r15 \n\t" \
    /* keep a copy of the unreduced sum; movq leaves the flags untouched */ \
    "movq %%r12, %%r8 \n\t" \
    "movq %%r13, %%r9 \n\t" \
    "movq %%r14, %%r10 \n\t" \
    "movq %%r15, %%r11 \n\t" \
    /* r' = r + (two's complement of 2p) */ \
    "addq " twice_not_modulus_0 ", %%r12 \n\t" /* r'[0] += ~(2p)[0]+1 (subtract 2p via two's complement) */ \
    "adcq " twice_not_modulus_1 ", %%r13 \n\t" /* r'[1] += ~(2p)[1] */ \
    "adcq " twice_not_modulus_2 ", %%r14 \n\t" /* r'[2] += ~(2p)[2] */ \
    "adcq " twice_not_modulus_3 ", %%r15 \n\t" /* r'[3] += ~(2p)[3] */ \
    /* no carry out => the subtraction went negative => restore the unreduced sum */ \
    "cmovncq %%r8, %%r12 \n\t" \
    "cmovncq %%r9, %%r13 \n\t" \
    "cmovncq %%r10, %%r14 \n\t" \
    "cmovncq %%r11, %%r15 \n\t"
83
84
85
/**
 * CONDITIONAL_ADD(b_0, b_1, b_2, b_3): compute r + b and keep it only if the addition carries out
 * of the top limb (addq/adcq variant).
 *
 * The accumulator (r12..r15) is copied to (r8..r11), the four operands are added with carry
 * propagation, and if CF = 0 afterwards the original value is restored with cmovncq.
 *
 * @param b_0..b_3  string literals: operands for limbs 0..3 of the value to add
 *
 * Clobbers r8, r9, r10, r11 (left holding the original accumulator) and the flags.
 */
#define CONDITIONAL_ADD(b_0, b_1, b_2, b_3) \
    /* Duplicate `r` */ \
    "movq %%r12, %%r8 \n\t" \
    "movq %%r13, %%r9 \n\t" \
    "movq %%r14, %%r10 \n\t" \
    "movq %%r15, %%r11 \n\t" \
    "addq " b_0 ", %%r12 \n\t" /* r'[0] += b[0] */ \
    "adcq " b_1 ", %%r13 \n\t" /* r'[1] += b[1] */ \
    "adcq " b_2 ", %%r14 \n\t" /* r'[2] += b[2] */ \
    "adcq " b_3 ", %%r15 \n\t" /* r'[3] += b[3] */ \
    \
    /* if the addition did not carry, restore the original r */ \
    "cmovncq %%r8, %%r12 \n\t" \
    "cmovncq %%r9, %%r13 \n\t" \
    "cmovncq %%r10, %%r14 \n\t" \
    "cmovncq %%r11, %%r15 \n\t"
111
/**
 * MUL(a1, a2, a3, a4, b): 4-limb Montgomery multiplication with the reduction interleaved into
 * the multiplication, using mulxq for the products and addq/adcq for the carry chains.
 *
 * Structure: four rounds. Round i accumulates a[i] * b into the running result, derives
 * k_i = r[i] * r_inv mod 2^64, adds k_i * p so that limb r[i] becomes 0 mod 2^64, and then
 * drops that limb (its register is reused for the new top limb).
 *
 * @param a1..a4  string literals: operands for a[0]..a[3]; each is moved into rdx once
 * @param b       string literal: register operand holding the address of b
 *                (limbs read at byte offsets 0, 8, 16, 24)
 *
 * Named asm operands the enclosing asm statement must provide:
 *   %[r_inv], %[modulus_0], %[modulus_1], %[modulus_2], %[modulus_3]
 *
 * Output:   (r12, r13, r14, r15) = result limbs 0..3.
 * Clobbers: rdx, rdi, r8, r9, r10, r11, r12, r13, r14, r15 and the flags.
 *
 * Every carry chain below starts with addq, which discards the incoming CF. The "Killed CF"
 * notes record why the discarded carry is known to be zero; those bounds rely on the top limbs
 * a[3] and b[3] being < 2^63 and on the top limb of p being < 2^62, as stated in the inline
 * comments.
 */
#define MUL(a1, a2, a3, a4, b) \
    /* ===================================================================================== */ \
    /* ROUND 0: accumulate a[0]*b, then reduce by k0*p. Shift out r[0]. */ \
    /* Register map: r13=r[0] r14=r[1] r15=r[2] r10=r[3] r12=r[4] */ \
    /* ===================================================================================== */ \
    "movq " a1 ", %%rdx \n\t" /* rdx = a[0] */ \
    "xorq %%r8, %%r8 \n\t" /* zero r8 (sets CF=0, OF=0). r8 is overwritten by the next mulxq and the first addq ignores incoming CF. */ \
    \
    /* --- a[0] * b: four independent multiplies ------------------------------------------- */ \
    "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[0] * b[1] */ \
    "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (rdi, r12) = a[0] * b[3] */ \
    "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r13, r14) = a[0] * b[0] -> (r[0], r[1]) */ \
    "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r15, r10) = a[0] * b[2] -> (r[2], r[3]) */ \
    \
    /* --- k0 computation (before addition chain so mulxq can overlap) --------------------- */ \
    "movq %%r13, %%rdx \n\t" /* rdx = r[0] */ \
    "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* rdx = k0 = r[0] * r_inv mod 2^64 */ \
    \
    /* --- Chain 0A: assemble a[0]*b cross-terms into r[1..4] ------------------------------ */ \
    /* Killed CF: from xorq (= 0). Safe. */ \
    "addq %%r8, %%r14 \n\t" /* r[1] += lo(a0*b1) */ \
    "adcq %%r9, %%r15 \n\t" /* r[2] += hi(a0*b1) + CF */ \
    "adcq %%rdi, %%r10 \n\t" /* r[3] += lo(a0*b3) + CF */ \
    "adcq $0, %%r12 \n\t" /* r[4] += CF [max: < 2^63 + 1 (b[3]<2^63)] */ \
    \
    /* --- k0 * p reduction ---------------------------------------------------------------- */ \
    /* Per-limb totals of k0*p added to S (verified across chains B, C, D below): */ \
    /* r[0] += lo(k0*p0) -> zeroed mod 2^64 */ \
    /* r[1] += hi(k0*p0) + lo(k0*p1) */ \
    /* r[2] += hi(k0*p1) + lo(k0*p2) */ \
    /* r[3] += hi(k0*p2) + lo(k0*p3) */ \
    /* r[4] += hi(k0*p3) */ \
    "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k0 * p[0] */ \
    "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k0 * p[1] */ \
    /* Chain 0B: lo(k0*p0), lo(k0*p1), hi(k0*p1), 0, 0 */ \
    /* Killed CF: terminal of chain 0A (r[4] < 2^63 + 1 < 2^64). Safe. */ \
    "addq %%r8, %%r13 \n\t" /* r[0] += lo(k0*p0) -> 0 mod 2^64 */ \
    "adcq %%rdi, %%r14 \n\t" /* r[1] += lo(k0*p1) + CF */ \
    "adcq %%r11, %%r15 \n\t" /* r[2] += hi(k0*p1) + CF */ \
    "adcq $0, %%r10 \n\t" /* r[3] += CF */ \
    "adcq $0, %%r12 \n\t" /* r[4] += CF [max: <= 2^63 + 1] */ \
    /* Chain 0C: hi(k0*p0), lo(k0*p2), lo(k0*p3), hi(k0*p3) */ \
    /* Killed CF: terminal of chain 0B (r[4] <= 2^63 + 1 < 2^64). Safe. */ \
    "addq %%r9, %%r14 \n\t" /* r[1] += hi(k0*p0) */ \
    "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k0 * p[2] */ \
    "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k0 * p[3] */ \
    "adcq %%r8, %%r15 \n\t" /* r[2] += lo(k0*p2) + CF */ \
    "adcq %%rdi, %%r10 \n\t" /* r[3] += lo(k0*p3) + CF */ \
    "adcq %%r11, %%r12 \n\t" /* r[4] += hi(k0*p3) + CF [max: <= 2^63 + 2^62 + 2] */ \
    /* Chain 0D: hi(k0*p2), 0 */ \
    /* Killed CF: terminal of chain 0C (r[4] <= 2^63 + 2^62 + 2 < 2^64). Safe. */ \
    "addq %%r9, %%r10 \n\t" /* r[3] += hi(k0*p2) */ \
    "adcq $0, %%r12 \n\t" /* r[4] += CF [max: <= 2^63 + 2^62 + 3] */ \
    \
    /* Post-round 0: r[4] <= 2^63 + 2^62 + 3 < 2^64. No 5th limb needed. */ \
    \
    /* ===================================================================================== */ \
    /* ROUND 1: accumulate a[1]*b, then reduce by k1*p. Shift out r[1]. */ \
    /* Register map: r14=r[1] r15=r[2] r10=r[3] r12=r[4] r13=r[5] */ \
    /* ===================================================================================== */ \
    "movq " a2 ", %%rdx \n\t" /* rdx = a[1] */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[1] * b[0] */ \
    "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[1] * b[1] */ \
    /* Chain 1A: lo(a1*b0), lo(a1*b1), hi(a1*b1), 0 */ \
    /* Killed CF: terminal of chain 0D (r[4] <= 2^63 + 2^62 + 3 < 2^64). Safe. */ \
    "addq %%r8, %%r14 \n\t" /* r[1] += lo(a1*b0) */ \
    "adcq %%rdi, %%r15 \n\t" /* r[2] += lo(a1*b1) + CF */ \
    "adcq %%r11, %%r10 \n\t" /* r[3] += hi(a1*b1) + CF */ \
    "adcq $0, %%r12 \n\t" /* r[4] += CF [max: <= 2^63 + 2^62 + 4] */ \
    /* Chain 1B: hi(a1*b0), lo(a1*b2), lo(a1*b3), 0 */ \
    /* Killed CF: terminal of chain 1A (r[4] <= 2^63 + 2^62 + 4 < 2^64). Safe. */ \
    "addq %%r9, %%r15 \n\t" /* r[2] += hi(a1*b0) */ \
    "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[1] * b[2] */ \
    "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (rdi, r13) = a[1] * b[3] -> r13 = r[5] = hi(a1*b3) */ \
    "adcq %%r8, %%r10 \n\t" /* r[3] += lo(a1*b2) + CF */ \
    "adcq %%rdi, %%r12 \n\t" /* r[4] += lo(a1*b3) + CF */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF [max: < 2^63 + 1] */ \
    /* Chain 1C: hi(a1*b2), 0 */ \
    /* Killed CF: terminal of chain 1B (r[5] < 2^63 + 1). Safe. */ \
    "addq %%r9, %%r12 \n\t" /* r[4] += hi(a1*b2) */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF [max: < 2^63 + 2] */ \
    \
    /* --- k1 * p reduction ---------------------------------------------------------------- */ \
    /* Per-limb totals of k1*p added (same decomposition as round 0, shifted by one): */ \
    /* r[1] += lo(k1*p0) -> zeroed mod 2^64 */ \
    /* r[2] += hi(k1*p0) + lo(k1*p1) */ \
    /* r[3] += hi(k1*p1) + lo(k1*p2) */ \
    /* r[4] += hi(k1*p2) + lo(k1*p3) */ \
    /* r[5] += hi(k1*p3) */ \
    "movq %%r14, %%rdx \n\t" /* rdx = r[1] */ \
    "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k1 = r[1] * r_inv mod 2^64 */ \
    "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k1 * p[0] */ \
    "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k1 * p[1] */ \
    /* Chain 1D: lo(k1*p0), lo(k1*p1), hi(k1*p1), 0, 0 */ \
    /* Killed CF: terminal of chain 1C (r[5] < 2^63 + 2 < 2^64). Safe. */ \
    "addq %%r8, %%r14 \n\t" /* r[1] += lo(k1*p0) -> 0 mod 2^64 */ \
    "adcq %%rdi, %%r15 \n\t" /* r[2] += lo(k1*p1) + CF */ \
    "adcq %%r11, %%r10 \n\t" /* r[3] += hi(k1*p1) + CF */ \
    "adcq $0, %%r12 \n\t" /* r[4] += CF */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF [max: < 2^63 + 3] */ \
    /* Chain 1E: hi(k1*p0), lo(k1*p2), hi(k1*p2), hi(k1*p3) */ \
    /* Killed CF: terminal of chain 1D (r[5] < 2^63 + 3 < 2^64). Safe. */ \
    "addq %%r9, %%r15 \n\t" /* r[2] += hi(k1*p0) */ \
    "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k1 * p[2] */ \
    "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k1 * p[3] */ \
    "adcq %%r8, %%r10 \n\t" /* r[3] += lo(k1*p2) + CF */ \
    "adcq %%r9, %%r12 \n\t" /* r[4] += hi(k1*p2) + CF */ \
    "adcq %%r11, %%r13 \n\t" /* r[5] += hi(k1*p3) + CF [max: <= 2^63 + 2^62 + 5] */ \
    /* Chain 1F: lo(k1*p3), 0 */ \
    /* Killed CF: terminal of chain 1E (r[5] <= 2^63 + 2^62 + 5 < 2^64). Safe. */ \
    "addq %%rdi, %%r12 \n\t" /* r[4] += lo(k1*p3) */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF [max: <= 2^63 + 2^62 + 6] */ \
    \
    /* Post-round 1: r[5] <= 2^63 + 2^62 + 6 < 2^64. Invariant holds. */ \
    \
    /* ===================================================================================== */ \
    /* ROUND 2: accumulate a[2]*b, then reduce by k2*p. Shift out r[2]. */ \
    /* Register map: r15=r[2] r10=r[3] r12=r[4] r13=r[5] r14=r[6] */ \
    /* ===================================================================================== */ \
    "movq " a3 ", %%rdx \n\t" /* rdx = a[2] */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[2] * b[0] */ \
    "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[2] * b[1] */ \
    /* Chain 2A: lo(a2*b0), hi(a2*b0), hi(a2*b1), 0 */ \
    /* Killed CF: terminal of chain 1F (r[5] <= 2^63 + 2^62 + 6 < 2^64). Safe. */ \
    "addq %%r8, %%r15 \n\t" /* r[2] += lo(a2*b0) */ \
    "adcq %%r9, %%r10 \n\t" /* r[3] += hi(a2*b0) + CF */ \
    "adcq %%r11, %%r12 \n\t" /* r[4] += hi(a2*b1) + CF */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF */ \
    /* Chain 2B: lo(a2*b1), lo(a2*b2), hi(a2*b2), 0 */ \
    /* Killed CF: terminal of chain 2A (r[5] < 2^64). Safe. */ \
    "addq %%rdi, %%r10 \n\t" /* r[3] += lo(a2*b1) */ \
    "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[2] * b[2] */ \
    "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (rdi, r14) = a[2] * b[3] -> r14 = r[6] = hi(a2*b3) */ \
    "adcq %%r8, %%r12 \n\t" /* r[4] += lo(a2*b2) + CF */ \
    "adcq %%r9, %%r13 \n\t" /* r[5] += hi(a2*b2) + CF */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF [max: < 2^63 + 1] */ \
    /* Chain 2C: lo(a2*b3), 0 */ \
    "addq %%rdi, %%r13 \n\t" /* r[5] += lo(a2*b3) */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF [max: < 2^63 + 2] */ \
    \
    /* --- k2 * p reduction ---------------------------------------------------------------- */ \
    /* Per-limb totals of k2*p added: */ \
    /* r[2] += lo(k2*p0) -> zeroed mod 2^64 */ \
    /* r[3] += hi(k2*p0) + lo(k2*p1) */ \
    /* r[4] += hi(k2*p1) + lo(k2*p2) */ \
    /* r[5] += hi(k2*p2) + lo(k2*p3) */ \
    /* r[6] += hi(k2*p3) */ \
    "movq %%r15, %%rdx \n\t" /* rdx = r[2] */ \
    "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k2 = r[2] * r_inv mod 2^64 */ \
    "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[0] */ \
    "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k2 * p[1] */ \
    /* Chain 2D: lo(k2*p0), hi(k2*p0), hi(k2*p1), 0, 0 */ \
    /* Note: chain structure differs from rounds 0-1! Here adcq carries hi(k2*p0) at r[3], */ \
    /* not lo(k2*p1). Both partial products reach the correct limb across chains D+E. */ \
    /* Killed CF: terminal of chain 2C (r[6] < 2^63 + 2 < 2^64). Safe. */ \
    "addq %%r8, %%r15 \n\t" /* r[2] += lo(k2*p0) -> 0 mod 2^64 */ \
    "adcq %%r9, %%r10 \n\t" /* r[3] += hi(k2*p0) + CF */ \
    "adcq %%r11, %%r12 \n\t" /* r[4] += hi(k2*p1) + CF */ \
    "adcq $0, %%r13 \n\t" /* r[5] += CF */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF [max: < 2^63 + 3] */ \
    /* Chain 2E: lo(k2*p1), lo(k2*p2), hi(k2*p2), hi(k2*p3) */ \
    /* Killed CF: terminal of chain 2D (r[6] < 2^63 + 3 < 2^64). Safe. */ \
    "addq %%rdi, %%r10 \n\t" /* r[3] += lo(k2*p1) */ \
    "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[2] */ \
    "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k2 * p[3] */ \
    "adcq %%r8, %%r12 \n\t" /* r[4] += lo(k2*p2) + CF */ \
    "adcq %%r9, %%r13 \n\t" /* r[5] += hi(k2*p2) + CF */ \
    "adcq %%r11, %%r14 \n\t" /* r[6] += hi(k2*p3) + CF [max: <= 2^63 + 2^62 + 5] */ \
    /* Chain 2F: lo(k2*p3), 0 */ \
    /* Killed CF: terminal of chain 2E (r[6] <= 2^63 + 2^62 + 5 < 2^64). Safe. */ \
    "addq %%rdi, %%r13 \n\t" /* r[5] += lo(k2*p3) */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF [max: <= 2^63 + 2^62 + 6] */ \
    \
    /* Post-round 2: r[6] <= 2^63 + 2^62 + 6 < 2^64. Invariant holds. */ \
    \
    /* ===================================================================================== */ \
    /* ROUND 3: accumulate a[3]*b, then reduce by k3*p. Shift out r[3]. */ \
    /* Register map: r10=r[3] r12=r[4] r13=r[5] r14=r[6] r15=r[7] */ \
    /* Tighter bound: a[3] < 2^63 (not 2^64), so hi(a3*b3) < 2^62. */ \
    /* ===================================================================================== */ \
    "movq " a4 ", %%rdx \n\t" /* rdx = a[3] (< 2^63) */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[3] * b[0] */ \
    "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[3] * b[1] */ \
    /* Chain 3A: lo(a3*b0), hi(a3*b0), hi(a3*b1), 0 */ \
    /* Killed CF: terminal of chain 2F (r[6] <= 2^63 + 2^62 + 6 < 2^64). Safe. */ \
    "addq %%r8, %%r10 \n\t" /* r[3] += lo(a3*b0) */ \
    "adcq %%r9, %%r12 \n\t" /* r[4] += hi(a3*b0) + CF */ \
    "adcq %%r11, %%r13 \n\t" /* r[5] += hi(a3*b1) + CF */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF */ \
    /* Chain 3B: lo(a3*b1), lo(a3*b2), hi(a3*b2), 0 */ \
    /* Killed CF: terminal of chain 3A (r[6] < 2^64). Safe. */ \
    "addq %%rdi, %%r12 \n\t" /* r[4] += lo(a3*b1) */ \
    "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[3] * b[2] */ \
    "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (rdi, r15) = a[3] * b[3] -> r15 = r[7] = hi(a3*b3) */ \
    "adcq %%r8, %%r13 \n\t" /* r[5] += lo(a3*b2) + CF */ \
    "adcq %%r9, %%r14 \n\t" /* r[6] += hi(a3*b2) + CF */ \
    "adcq $0, %%r15 \n\t" /* r[7] += CF [max: < 2^62 + 1] */ \
    /* Chain 3C: lo(a3*b3), 0 */ \
    "addq %%rdi, %%r14 \n\t" /* r[6] += lo(a3*b3) */ \
    "adcq $0, %%r15 \n\t" /* r[7] += CF [max: < 2^62 + 2] */ \
    \
    /* --- k3 * p reduction ---------------------------------------------------------------- */ \
    /* Per-limb totals of k3*p added: */ \
    /* r[3] += lo(k3*p0) -> zeroed mod 2^64 */ \
    /* r[4] += hi(k3*p0) + lo(k3*p1) */ \
    /* r[5] += hi(k3*p1) + lo(k3*p2) */ \
    /* r[6] += hi(k3*p2) + lo(k3*p3) */ \
    /* r[7] += hi(k3*p3) */ \
    "movq %%r10, %%rdx \n\t" /* rdx = r[3] */ \
    "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k3 = r[3] * r_inv mod 2^64 */ \
    "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[0] */ \
    "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k3 * p[1] */ \
    /* Chain 3D: lo(k3*p0), hi(k3*p0), hi(k3*p1), 0, 0 */ \
    /* Killed CF: terminal of chain 3C (r[7] < 2^62 + 2 < 2^64). Safe. */ \
    "addq %%r8, %%r10 \n\t" /* r[3] += lo(k3*p0) -> 0 mod 2^64 */ \
    "adcq %%r9, %%r12 \n\t" /* r[4] += hi(k3*p0) + CF */ \
    "adcq %%r11, %%r13 \n\t" /* r[5] += hi(k3*p1) + CF */ \
    "adcq $0, %%r14 \n\t" /* r[6] += CF */ \
    "adcq $0, %%r15 \n\t" /* r[7] += CF [max: < 2^62 + 3] */ \
    /* Chain 3E: lo(k3*p1), lo(k3*p2), hi(k3*p2), hi(k3*p3) */ \
    /* Killed CF: terminal of chain 3D (r[7] < 2^62 + 3 < 2^64). Safe. */ \
    "addq %%rdi, %%r12 \n\t" /* r[4] += lo(k3*p1) */ \
    "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[2] */ \
    "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (rdi, rdx) = k3 * p[3] (overwrites rdx; last mulxq) */ \
    "adcq %%r8, %%r13 \n\t" /* r[5] += lo(k3*p2) + CF */ \
    "adcq %%r9, %%r14 \n\t" /* r[6] += hi(k3*p2) + CF */ \
    "adcq %%rdx, %%r15 \n\t" /* r[7] += hi(k3*p3) + CF [max: < 2^63 + 5] */ \
    /* Chain 3F: lo(k3*p3), 0 */ \
    /* Killed CF: terminal of chain 3E (r[7] < 2^63 + 5 < 2^64). Safe. */ \
    "addq %%rdi, %%r14 \n\t" /* r[6] += lo(k3*p3) */ \
    "adcq $0, %%r15 \n\t" /* r[7] += CF [max: < 2^63 + 6] */ \
    \
    /* Output in (r12, r13, r14, r15) = (r[4], r[5], r[6], r[7]). */ \
    /* Since S < 2p < 2^255, the top limb r[7] < 2^63. Valid input for another MUL. */
384
385
386#else // 6047895us
/**
 * ADD(b): 4-limb addition, (r12, r13, r14, r15) += b[0..3]   (ADX variant).
 *
 * All four limbs, including the first, use adcxq. adcxq always adds CF as carry-in and writes
 * only CF, leaving OF and the other flags unchanged. Consequently CF must be 0 when this macro
 * starts, otherwise a stray carry is added into limb 0.
 * NOTE(review): callers presumably clear CF beforehand (e.g. via CLEAR_FLAGS) — confirm at the
 * call sites.
 *
 * The carry out of the top limb is left in CF.
 *
 * @param b  string literal: register operand holding the address of the 4-limb addend
 *           (limbs read at byte offsets 0, 8, 16, 24).
 */
#define ADD(b) \
    "adcxq 0(" b "), %%r12 \n\t" \
    "adcxq 8(" b "), %%r13 \n\t" \
    "adcxq 16(" b "), %%r14 \n\t" \
    "adcxq 24(" b "), %%r15 \n\t"
396
/**
 * SUB(b): 4-limb subtraction, (r12, r13, r14, r15) -= b[0..3].
 *
 * Same instruction sequence as the non-ADX build: subq for the first limb (no incoming borrow),
 * sbbq for the rest. The borrow out of the top limb is left in CF. Unlike adcxq/adoxq, subq and
 * sbbq rewrite all arithmetic flags, including OF.
 *
 * @param b  string literal: register operand holding the address of the 4-limb subtrahend
 *           (limbs read at byte offsets 0, 8, 16, 24).
 */
#define SUB(b) \
    "subq 0(" b "), %%r12 \n\t" \
    "sbbq 8(" b "), %%r13 \n\t" \
    "sbbq 16(" b "), %%r14 \n\t" \
    "sbbq 24(" b "), %%r15 \n\t"
406
/**
 * ADD_REDUCE(b, twice_not_modulus_0..3): add b to the accumulator, then conditionally apply the
 * reduction constant (ADX variant).
 *
 * Two carry chains run interleaved, limb by limb:
 *   - adcxq chain (carry in CF): r[i] += b[i]
 *   - adoxq chain (carry in OF): r[i] += twice_not_modulus[i]
 * Between the two additions of each limb, the intermediate sum r + b is copied to r8..r11
 * (movq does not touch the flags). After the last limb, if the adoxq chain produced no carry
 * out (OF = 0), cmovnoq restores r + b from r8..r11; otherwise the reduced value is kept.
 *
 * Both chains consume their flag on the very first instruction, so CF and OF must both be 0 on
 * entry. NOTE(review): callers presumably clear them beforehand (e.g. via CLEAR_FLAGS) — confirm
 * at the call sites.
 *
 * @param b                      string literal: register operand holding the address of the addend
 * @param twice_not_modulus_0..3 string literals: operands for limbs 0..3 of the reduction constant
 *
 * Clobbers r8, r9, r10, r11 (left holding the unreduced sum), CF and OF.
 */
#define ADD_REDUCE(b, twice_not_modulus_0, twice_not_modulus_1, twice_not_modulus_2, twice_not_modulus_3) \
    /* limb 0 */ \
    "adcxq 0(" b "), %%r12 \n\t" \
    "movq %%r12, %%r8 \n\t" \
    "adoxq " twice_not_modulus_0 ", %%r12 \n\t" \
    /* limb 1 */ \
    "adcxq 8(" b "), %%r13 \n\t" \
    "movq %%r13, %%r9 \n\t" \
    "adoxq " twice_not_modulus_1 ", %%r13 \n\t" \
    /* limb 2 */ \
    "adcxq 16(" b "), %%r14 \n\t" \
    "movq %%r14, %%r10 \n\t" \
    "adoxq " twice_not_modulus_2 ", %%r14 \n\t" \
    /* limb 3 */ \
    "adcxq 24(" b "), %%r15 \n\t" \
    "movq %%r15, %%r11 \n\t" \
    "adoxq " twice_not_modulus_3 ", %%r15 \n\t" \
    /* OF = 0: the adox chain did not carry out, so restore the unreduced sum */ \
    "cmovnoq %%r8, %%r12 \n\t" \
    "cmovnoq %%r9, %%r13 \n\t" \
    "cmovnoq %%r10, %%r14 \n\t" \
    "cmovnoq %%r11, %%r15 \n\t"
428
429
/**
 * CONDITIONAL_ADD(b_0, b_1, b_2, b_3): compute r + b and keep it only if the addition carries out
 * of the top limb (ADX variant).
 *
 * The accumulator (r12..r15) is copied to (r8..r11), then the four operands are added with an
 * adoxq chain, which carries through OF rather than CF. If OF = 0 afterwards, cmovnoq restores
 * the original value.
 *
 * The first adoxq consumes OF as carry-in, so OF must be 0 on entry.
 * NOTE(review): callers presumably guarantee this (e.g. via CLEAR_FLAGS) — confirm at the call
 * sites. CF is neither read nor written by this macro.
 *
 * @param b_0..b_3  string literals: operands for limbs 0..3 of the value to add
 *
 * Clobbers r8, r9, r10, r11 (left holding the original accumulator) and OF.
 */
#define CONDITIONAL_ADD(b_0, b_1, b_2, b_3) \
    /* Duplicate `r` */ \
    "movq %%r12, %%r8 \n\t" \
    "movq %%r13, %%r9 \n\t" \
    "movq %%r14, %%r10 \n\t" \
    "movq %%r15, %%r11 \n\t" \
    "adoxq " b_0 ", %%r12 \n\t" /* r'[0] += b[0] */ \
    "adoxq " b_1 ", %%r13 \n\t" /* r'[1] += b[1] */ \
    "adoxq " b_2 ", %%r14 \n\t" /* r'[2] += b[2] */ \
    "adoxq " b_3 ", %%r15 \n\t" /* r'[3] += b[3] */ \
    \
    /* if the addition did not overflow, restore the original r */ \
    "cmovnoq %%r8, %%r12 \n\t" \
    "cmovnoq %%r9, %%r13 \n\t" \
    "cmovnoq %%r10, %%r14 \n\t" \
    "cmovnoq %%r11, %%r15 \n\t"
454
455
476#define SQR(a) \
477 /* ===================================================================================== */ \
478 /* PHASE 1: Cross-products. Compute sum_{i<j} a[i]*a[j] into r[1..6]. */ \
479 /* */ \
480 /* The 6 products and their limb positions (lo, hi): */ \
481 /* a[0]*a[1] -> (r[1], r[2]) a[0]*a[2] -> (r[2], r[3]) a[0]*a[3] -> (r[3], r[4]) */ \
482 /* a[1]*a[2] -> (r[3], r[4]) a[1]*a[3] -> (r[4], r[5]) a[2]*a[3] -> (r[5], r[6]) */ \
483 /* */ \
484 /* Per-limb totals (cross-products only, before doubling): */ \
485 /* r[1] = lo(a0*a1) */ \
486 /* r[2] = hi(a0*a1) + lo(a0*a2) */ \
487 /* r[3] = hi(a0*a2) + lo(a0*a3) + lo(a1*a2) */ \
488 /* r[4] = hi(a0*a3) + hi(a1*a2) + lo(a1*a3) */ \
489 /* r[5] = hi(a1*a3) + lo(a2*a3) */ \
490 /* r[6] = hi(a2*a3) */ \
491 /* */ \
492 /* adcx sequence (CF): adds into r[3], r[4], r[5], r[6] (hi cross-terms + flushes) */ \
493 /* adox sequence (OF): adds into r[2], r[3], r[4], r[5], r[6] (lo cross-terms + flushes) */ \
494 /* ===================================================================================== */ \
495 "movq 0(" a "), %%rdx \n\t" /* rdx = a[0] */ \
496 "xorq %%r8, %%r8 \n\t" /* clear r8; sets CF=0, OF=0 */ \
497 \
498 /* --- a[0] * a[1..3] ------------------------------------------------------------------ */ \
499 "mulxq 8(" a "), %%r9, %%r10 \n\t" /* (r9, r10) = a[0]*a[1] -> (r[1], r[2]) */ \
500 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (r8, r15) = a[0]*a[2] -> needs r[2], r[3] */ \
501 "mulxq 24(" a "), %%r11, %%r12 \n\t" /* (r11, r12) = a[0]*a[3] -> (r[3], r[4]) */ \
502 \
503 /* {CF=0, OF=0} */ \
504 "adoxq %%r8, %%r10 \n\t" /* r[2] += lo(a0*a2) {OF->OF1} */ \
505 "adcxq %%r15, %%r11 \n\t" /* r[3] += hi(a0*a2) + CF=0 {CF->CF1} */ \
506 \
507 /* --- a[1] * a[2..3] ------------------------------------------------------------------ */ \
508 "movq 8(" a "), %%rdx \n\t" /* rdx = a[1] */ \
509 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (r8, r15) = a[1]*a[2] */ \
510 "mulxq 24(" a "), %%rdi, %%rcx \n\t" /* (rdi, rcx) = a[1]*a[3] */ \
511 \
512 /* --- a[2] * a[3] -------------------------------------------------------------------- */ \
513 "movq 24(" a "), %%rdx \n\t" /* rdx = a[3] */ \
514 "mulxq 16(" a "), %%r13, %%r14 \n\t" /* (r13, r14) = a[2]*a[3] -> (r[5], r[6]) */ \
515 \
516 /* --- Accumulate remaining cross-terms ------------------------------------------------ */ \
517 "adoxq %%r8, %%r11 \n\t" /* r[3] += lo(a1*a2) + OF1 {OF->OF2} */ \
518 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(a1*a3) + CF1 {CF->CF2} */ \
519 "adoxq %%r15, %%r12 \n\t" /* r[4] += hi(a1*a2) + OF2 {OF->OF3} */ \
520 "adcxq %%rcx, %%r13 \n\t" /* r[5] += hi(a1*a3) + CF2 {CF->CF3} */ \
521 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += OF3 {OF->OF4} */ \
522 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF3 {CF->CF4} */ \
523 /* r[6] = hi(a2*a3) + CF3. Since a[2],a[3] < 2^{63}: hi(a2*a3) < 2^{62}, */ \
524 /* so r[6] < 2^{62} + 1 < 2^{64}. CF4 = 0. */ \
525 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += OF4 {OF->OF5} */ \
526 /* r[6] < 2^{62} + 2 < 2^{64}. OF5 = 0. */ \
527 \
528 /* Post-phase 1: r[1..6] hold cross-products. */ \
529 /* After flushing both chains into r[6], no further carry is possible because */ \
530 /* r[6] = hi(a2*a3) + CF3 + OF4 < 2^{62} + 2 < 2^{64}. So CF4 = OF5 = 0, */ \
531 /* meaning Phase 2 doubling starts with zero incoming carry on both chains. */ \
532 \
533 /* ===================================================================================== */ \
534 /* PHASE 2: Double r[1..6] via self-addition. */ \
535 /* */ \
536 /* Two independent 3-limb self-additions (doubling): */ \
537 /* adox: r[1], r[2], r[3] (low half, entering with OF5=0) */ \
538 /* adcx: r[4], r[5], r[6] (high half, entering with CF4=0) */ \
539 /* */ \
540 /* OF=0 and CF=0 entering are critical: doubling via r += r requires no incoming carry. */ \
541 /* ===================================================================================== */ \
542 "adoxq %%r9, %%r9 \n\t" /* r[1] = 2*r[1] {OF->OF6} */ \
543 "adcxq %%r12, %%r12 \n\t" /* r[4] = 2*r[4] {CF->CF5} */ \
544 "adoxq %%r10, %%r10 \n\t" /* r[2] = 2*r[2] + OF6 {OF->OF7} */ \
545 "adcxq %%r13, %%r13 \n\t" /* r[5] = 2*r[5] + CF5 {CF->CF6} */ \
546 "adoxq %%r11, %%r11 \n\t" /* r[3] = 2*r[3] + OF7 {OF->OF8} */ \
547 "adcxq %%r14, %%r14 \n\t" /* r[6] = 2*r[6] + CF6 {CF->CF7} */ \
548 /* r[6]_old < 2^{62}+2, so 2*r[6]+CF6 < 2^{63}+5 < 2^{64}. CF7 = 0. */ \
549 \
550 /* Post-phase 2: r[1..6] = 2 * cross_products. */ \
551 /* OF8 (from low-half doubling) is pending — consumed by the first adox in Phase 3. */ \
552 /* CF7 (from high-half doubling) is pending — consumed by the first adcx in Phase 3. */ \
553 \
554 /* ===================================================================================== */ \
555 /* PHASE 3: Add squared terms a[i]^2 to complete a^2 = 2*cross + squares. */ \
556 /* */ \
557 /* The pending CF7 from Phase 2's high-half doubling is consumed first when adding */ \
558 /* hi(a0^2) into r[1] via adcx. The pending OF8 from low-half doubling is consumed */ \
559 /* first when adding lo(a2^2) into r[4] via adox. */ \
560 /* */ \
561 /* Each a[i]^2 splits into (lo, hi) at positions (r[2i], r[2i+1]): */ \
562 /* a[0]^2 -> (r[0], r[1]), a[1]^2 -> (r[2], r[3]) */ \
563 /* a[2]^2 -> (r[4], r[5]), a[3]^2 -> (r[6], r[7]) */ \
564 /* ===================================================================================== */ \
565 "movq 0(" a "), %%rdx \n\t" /* rdx = a[0] */ \
566 "mulxq %%rdx, %%r8, %%rcx \n\t" /* (r8, rcx) = a[0]^2 -> r[0] = r8, hi goes to r[1] */ \
567 "movq 16(" a "), %%rdx \n\t" /* rdx = a[2] */ \
568 "mulxq %%rdx, %%rdx, %%rdi \n\t" /* (rdx, rdi) = a[2]^2 -> lo to r[4], hi to r[5] */ \
569 \
570 /* {CF=CF7 from Phase 2, OF=OF8 from Phase 2} */ \
571 "adcxq %%rcx, %%r9 \n\t" /* r[1] += hi(a0^2) + CF7 {CF->CF8} (consumes Phase 2 CF) */\
572 "adoxq %%rdx, %%r12 \n\t" /* r[4] += lo(a2^2) + OF8 {OF->OF9} (consumes Phase 2 OF) */\
573 "adoxq %%rdi, %%r13 \n\t" /* r[5] += hi(a2^2) + OF9 {OF->OF10} */ \
574 "movq 24(" a "), %%rdx \n\t" /* rdx = a[3] */ \
575 "mulxq %%rdx, %%rcx, %%r15 \n\t" /* (rcx, r15) = a[3]^2 -> lo to r[6], r[7] = hi(a3^2) */ \
576 "movq 8(" a "), %%rdx \n\t" /* rdx = a[1] */ \
577 "mulxq %%rdx, %%rdi, %%rdx \n\t" /* (rdi, rdx) = a[1]^2 -> lo to r[2], hi to r[3] */ \
578 "adcxq %%rdi, %%r10 \n\t" /* r[2] += lo(a1^2) + CF8 {CF->CF9} */ \
579 "adcxq %%rdx, %%r11 \n\t" /* r[3] += hi(a1^2) + CF9 {CF->CF10} */ \
580 "adoxq %%rcx, %%r14 \n\t" /* r[6] += lo(a3^2) + OF10 {OF->OF11} */ \
581 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF11 {OF->OF12} */ \
582 /* r[7] = hi(a3^2) + OF11. Since a[3] < 2^{63}: hi(a3^2) < 2^{62}. */ \
583 /* So r[7] < 2^{62} + 1 < 2^{64}. OF12 = 0. */ \
584 \
585 /* Post-phase 3: (r8,r9,..,r15) = a^2 as 8-limb number. OF12 = 0. CF10 pending. */ \
586 \
587 /* ===================================================================================== */ \
588 /* PHASE 4: Montgomery reduction — 4 rounds, identical structure to MUL. */ \
589 /* */ \
590 /* Each round i: k_i = r[i] * (-p^{-1}) mod 2^{64}, add k_i*p, shift out r[i]. */ \
591 /* The total is (a^2 + K*p) / R where K = sum(k_i * 2^{64i}). */ \
592 /* Since a < 2p: a^2 < 4p^2, K < R, and 4p < R, so output < 2p. */ \
593 /* */ \
594 /* CF and OF from Phase 3 are reused (not reset). Each round's adcx/adox */ \
595 /* instructions consume and produce flags in interleaved order — understand the */ \
596 /* flag state instruction-by-instruction. Per-limb totals for k_i*p: */ \
597 /* r[i] += lo(ki*p0) -> zeroed mod 2^{64} */ \
598 /* r[i+1] += hi(ki*p0) + lo(ki*p1) */ \
599 /* r[i+2] += hi(ki*p1) + lo(ki*p2) */ \
600 /* r[i+3] += hi(ki*p2) + lo(ki*p3) */ \
601 /* r[i+4] += hi(ki*p3) */ \
602 /* ===================================================================================== */ \
603 \
604 /* --- Reduction round 0: reduce r[0] (r8) -------------------------------------------- */ \
605 /* Register map: r8=r[0] r9=r[1] r10=r[2] r11=r[3] r12=r[4] r13=r[5] r14=r[6] r15=r[7] */ \
606 /* */ \
607 /* Per-limb totals for k0*p: */ \
608 /* r[0] += lo(k0*p0) -> zeroed mod 2^{64} */ \
609 /* r[1] += hi(k0*p0) + lo(k0*p1) */ \
610 /* r[2] += hi(k0*p1) + lo(k0*p2) */ \
611 /* r[3] += hi(k0*p2) + lo(k0*p3) */ \
612 /* r[4] += hi(k0*p3) */ \
613 /* */ \
614 /* adcx sequence (CF): r[4], r[5], r[6], r[7], then r[1], r[2], r[3] */ \
615 /* adox sequence (OF): r[0], r[1], r[2], r[3] */ \
616 /* {CF=CF10 from Phase 3, OF=OF12=0} */ \
617 "movq %%r8, %%rdx \n\t" /* rdx = r[0] */ \
618 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* rdx = k0 = r[0] * r_inv */ \
619 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k0 * p[0] */ \
620 "adoxq %%rdi, %%r8 \n\t" /* r[0] += lo(k0*p0) + OF12=0 -> 0 mod 2^64 {OF->OF1} */ \
621 "mulxq %[modulus_3], %%r8, %%rdi \n\t" /* (r8, rdi) = k0 * p[3] */ \
622 "adcxq %%rdi, %%r12 \n\t" /* r[4] += hi(k0*p3) + CF10 {CF->CF1} */ \
623 "adoxq %%rcx, %%r9 \n\t" /* r[1] += hi(k0*p0) + OF1 {OF->OF2} */ \
624 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF1 {CF->CF2} */ \
625 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF2 {CF->CF3} */ \
626 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k0 * p[1] */ \
627 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += CF3 {CF->CF4} */ \
628 /* adcx flushes CF through r[5..7]; CF4=0 since r[7] < 2^{62}+2+1 < 2^{64}. */ \
629 "adoxq %%rcx, %%r10 \n\t" /* r[2] += hi(k0*p1) + OF2 {OF->OF3} */ \
630 "adcxq %%rdi, %%r9 \n\t" /* r[1] += lo(k0*p1) + CF4=0 {CF->CF5} (CF4=0: safe) */ \
631 "adoxq %%r8, %%r11 \n\t" /* r[3] += lo(k0*p3) + OF3 {OF->OF4} */ \
632 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k0 * p[2] */ \
633 "adcxq %%rdi, %%r10 \n\t" /* r[2] += lo(k0*p2) + CF5 {CF->CF6} */ \
634 "adcxq %%rcx, %%r11 \n\t" /* r[3] += hi(k0*p2) + CF6 {CF->CF7} */ \
635 \
636 /* --- Reduction round 1: reduce r[1] (r9) -------------------------------------------- */ \
637 /* Per-limb totals for k1*p: */ \
638 /* r[1] += lo(k1*p0) -> zeroed mod 2^{64} */ \
639 /* r[2] += hi(k1*p0) + lo(k1*p1) */ \
640 /* r[3] += hi(k1*p1) + lo(k1*p2) */ \
641 /* r[4] += hi(k1*p2) + lo(k1*p3) */ \
642 /* r[5] += hi(k1*p3) */ \
643 /* */ \
644 /* adcx sequence (CF): r[4], r[5], r[6], r[7], then r[1], r[2], r[3] */ \
645 /* adox sequence (OF): r[4], r[5], r[6], r[7], then r[2], r[3] */ \
646 /* {CF=CF7, OF=OF4} */ \
647 "movq %%r9, %%rdx \n\t" /* rdx = r[1] */ \
648 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* rdx = k1 = r[1] * r_inv */ \
649 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k1 * p[2] */ \
650 "adoxq %%rcx, %%r12 \n\t" /* r[4] += hi(k1*p2) + OF4 {OF->OF5} */ \
651 "mulxq %[modulus_3], %%r8, %%rcx \n\t" /* (r8, rcx) = k1 * p[3] */ \
652 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k1*p3) + CF7 {CF->CF8} */ \
653 "adoxq %%rcx, %%r13 \n\t" /* r[5] += hi(k1*p3) + OF5 {OF->OF6} */ \
654 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF8 {CF->CF9} */ \
655 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += OF6 {OF->OF7} */ \
656 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF9 {CF->CF10} */ \
657 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF7 {OF->OF8} */ \
658 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += CF10 {CF->CF11} */ \
659 /* adcx/adox flush CF/OF through r[5..7]. CF11=0 and OF8=0 because each */ \
660 /* flush adds at most 1 to r[5..7], and r[7] < 2^{62}+2+4 < 2^{64}. */ \
661 "mulxq %[modulus_0], %%r8, %%rcx \n\t" /* (r8, rcx) = k1 * p[0] */ \
662 "adcxq %%r8, %%r9 \n\t" /* r[1] += lo(k1*p0) + CF11=0 -> 0 mod 2^64 {CF->CF12} */ \
663 "adoxq %%rcx, %%r10 \n\t" /* r[2] += hi(k1*p0) + OF8=0 {OF->OF9} (OF8=0: safe) */ \
664 "mulxq %[modulus_1], %%r8, %%rcx \n\t" /* (r8, rcx) = k1 * p[1] */ \
665 "adcxq %%r8, %%r10 \n\t" /* r[2] += lo(k1*p1) + CF12 {CF->CF13} */ \
666 "adoxq %%rcx, %%r11 \n\t" /* r[3] += hi(k1*p1) + OF9 {OF->OF10} */ \
667 "adcxq %%rdi, %%r11 \n\t" /* r[3] += lo(k1*p2) + CF13 {CF->CF14} */ \
668 \
669 /* --- Reduction round 2: reduce r[2] (r10) ------------------------------------------- */ \
670 /* Per-limb totals for k2*p: */ \
671 /* r[2] += lo(k2*p0) -> zeroed mod 2^{64} */ \
672 /* r[3] += hi(k2*p0) + lo(k2*p1) */ \
673 /* r[4] += hi(k2*p1) + lo(k2*p2) */ \
674 /* r[5] += hi(k2*p2) + lo(k2*p3) */ \
675 /* r[6] += hi(k2*p3) */ \
676 /* */ \
677 /* adcx sequence (CF): r[4], r[5], r[6], r[7], then r[2], r[3] */ \
678 /* adox sequence (OF): r[4], r[5], r[6], r[7], then r[3], r[4], r[5] */ \
679 /* {CF=CF14, OF=OF10} */ \
680 "movq %%r10, %%rdx \n\t" /* rdx = r[2] */ \
681 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* rdx = k2 = r[2] * r_inv */ \
682 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k2 * p[1] */ \
683 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[2] */ \
684 "adoxq %%rcx, %%r12 \n\t" /* r[4] += hi(k2*p1) + OF10 {OF->OF11} */ \
685 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k2*p2) + CF14 {CF->CF15} */ \
686 "adoxq %%r9, %%r13 \n\t" /* r[5] += hi(k2*p2) + OF11 {OF->OF12} */ \
687 "mulxq %[modulus_3], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[3] */ \
688 "adcxq %%r8, %%r13 \n\t" /* r[5] += lo(k2*p3) + CF15 {CF->CF16} */ \
689 "adoxq %%r9, %%r14 \n\t" /* r[6] += hi(k2*p3) + OF12 {OF->OF13} */ \
690 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF16 {CF->CF17} */ \
691 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF13 {OF->OF14} */ \
692 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += CF17 {CF->CF18} */ \
693 /* adcx/adox flush CF/OF through r[6..7]. CF18=0 and OF14=0 because each */ \
694 /* flush adds at most 1, and r[7] < 2^{62}+2+4+2 < 2^{64}. */ \
695 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[0] */ \
696 "adcxq %%r8, %%r10 \n\t" /* r[2] += lo(k2*p0) + CF18=0 -> 0 mod 2^64 {CF->CF19} */ \
697 "adoxq %%r9, %%r11 \n\t" /* r[3] += hi(k2*p0) + OF14=0 {OF->OF15} (OF14=0: safe) */ \
698 "adcxq %%rdi, %%r11 \n\t" /* r[3] += lo(k2*p1) + CF19 {CF->CF20} */ \
699 "adoxq %[zero_reference], %%r12 \n\t" /* r[4] += OF15 {OF->OF16} */ \
700 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += OF16 {OF->OF17} */ \
701 \
702 /* --- Reduction round 3: reduce r[3] (r11) ------------------------------------------- */ \
703 /* Per-limb totals for k3*p: */ \
704 /* r[3] += lo(k3*p0) -> zeroed mod 2^{64} */ \
705 /* r[4] += hi(k3*p0) + lo(k3*p1) */ \
706 /* r[5] += hi(k3*p1) + lo(k3*p2) */ \
707 /* r[6] += hi(k3*p2) + lo(k3*p3) */ \
708 /* r[7] += hi(k3*p3) */ \
709 /* */ \
710 /* adcx sequence (CF): r[4], r[5], r[6], r[7] */ \
711 /* adox sequence (OF): r[3], r[4], r[5], r[6], r[7] */ \
712 /* {CF=CF20, OF=OF17} */ \
713 /* OF17=0: OF15 (from r[3] += hi(k2*p0)) flushes through r[4] and r[5] at lines above, */ \
714 /* dissipating to 0 since neither limb is near 2^64. */ \
715 "movq %%r11, %%rdx \n\t" /* rdx = r[3] */ \
716 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* rdx = k3 = r[3] * r_inv */ \
717 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (rdi, rcx) = k3 * p[0] */ \
718 "mulxq %[modulus_1], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[1] */ \
719 "adoxq %%rdi, %%r11 \n\t" /* r[3] += lo(k3*p0) + OF17 -> 0 mod 2^64 {OF->OF18} */ \
720 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k3*p1) + CF20 {CF->CF21} */ \
721 "adoxq %%rcx, %%r12 \n\t" /* r[4] += hi(k3*p0) + OF18 {OF->OF19} */ \
722 "adcxq %%r9, %%r13 \n\t" /* r[5] += hi(k3*p1) + CF21 {CF->CF22} */ \
723 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[2] */ \
724 "mulxq %[modulus_3], %%r10, %%r11 \n\t" /* (r10, r11) = k3 * p[3] */ \
725 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(k3*p2) + OF19 {OF->OF20} */ \
726 "adcxq %%r10, %%r14 \n\t" /* r[6] += lo(k3*p3) + CF22 {CF->CF23} */ \
727 "adoxq %%r9, %%r14 \n\t" /* r[6] += hi(k3*p2) + OF20 {OF->OF21} */ \
728 "adcxq %%r11, %%r15 \n\t" /* r[7] += hi(k3*p3) + CF23 {CF->CF24} */ \
729 /* Result = (a^2 + K*p)/R < 2p < 2^{255}, so r[7] < 2^{63}. */ \
730 /* Since r[7] + hi(k3*p3) + CF23 < 2^{63} + 2^{62} + 1 < 2^{64}, CF24 = 0. */ \
731 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF21 {OF->OF22} */ \
732 /* r[7] < 2^{63} + 1 < 2^{64}, so OF22 = 0. Both terminal flags are zero. */ \
733 \
734 /* Output in (r12, r13, r14, r15) = (r[4], r[5], r[6], r[7]). */ \
735 /* Since S < 2p < 2^{255}, the top limb r[7] < 2^{63}. Valid input for another SQR/MUL. */
736
/*
 * MUL(a1, a2, a3, a4, b): inline-asm fragment for a 4-limb (4 x 64-bit) multiplication a * b with a
 * word-by-word (Montgomery-style) reduction by p interleaved into each round: k_i = r[i] * r_inv mod 2^64,
 * then r += k_i * p so that limb r[i] becomes 0 mod 2^64 and is shifted out.
 *
 * Parameters (all are string fragments spliced into the asm template):
 *   a1..a4  operands holding limbs a[0]..a[3]; each one is copied into rdx with movq.
 *   b       register holding the base address of b; limbs are read from 0/8/16/24(b).
 *
 * Named asm operands the enclosing asm statement must supply:
 *   %[r_inv], %[modulus_0] .. %[modulus_3], %[zero_reference]
 *   NOTE(review): zero_reference is only ever added via adcxq/adoxq to fold a pending carry flag into a
 *   limb, so it is presumably a location holding 0 -- confirm at the call site.
 *
 * Registers written: rdx, rdi, r8-r15, plus the CF and OF flags.
 * Result: (r12, r13, r14, r15) = (r[4], r[5], r[6], r[7]).
 *
 * The carry-bound arguments in the comments below rely on b[3] < 2^63, a[3] < 2^63 and p[3] < 2^62.
 * Flag labels (CFn / OFn) restart from 1 in every round.
 */
770#define MUL(a1, a2, a3, a4, b) \
771 /* ===================================================================================== */ \
772 /* ROUND 0: accumulate a[0]*b, then reduce by k0*p. Shift out r[0]. */ \
773 /* Register map: r13=r[0] r14=r[1] r15=r[2] r10=r[3] r12=r[4] */ \
774 /* ===================================================================================== */ \
775 "movq " a1 ", %%rdx \n\t" /* rdx = a[0] */ \
776 "xorq %%r8, %%r8 \n\t" /* clear r8; sets CF=0, OF=0 so both carry chains start from zero */ \
777 \
778 /* --- a[0] * b: four independent multiplies ------------------------------------------- */ \
779 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r13, r14) = a[0] * b[0] -> (r[0], r[1]) */ \
780 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[0] * b[1] */ \
781 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r15, r10) = a[0] * b[2] -> (r[2], r[3]) */ \
782 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (rdi, r12) = a[0] * b[3] -> r12 = r[4] = hi(a0*b3) */ \
783 \
784 /* --- k0 computation ------------------------------------------------------------------ */ \
785 "movq %%r13, %%rdx \n\t" /* rdx = r[0] */ \
786 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* rdx = k0 = r[0] * r_inv mod 2^64 (high half in r11 is unused) */ \
787 \
788 /* --- Assemble a[0]*b cross-terms + k0*p reduction ------------------------------------ */ \
789 /* */ \
790 /* Per-limb totals for a[0]*b (from cross-terms added by adcxq/adoxq): */ \
791 /* r[1] += lo(a0*b1), r[2] += hi(a0*b1), r[3] += lo(a0*b3) */ \
792 /* (hi(a0*b0), lo(a0*b2), hi(a0*b2), hi(a0*b3) already in r[1..4] from mulxq outputs) */ \
793 /* */ \
794 /* Per-limb totals for k0*p: */ \
795 /* r[0] += lo(k0*p0) -> zeroed mod 2^64 */ \
796 /* r[1] += hi(k0*p0) + lo(k0*p1) */ \
797 /* r[2] += hi(k0*p1) + lo(k0*p2) */ \
798 /* r[3] += hi(k0*p2) + lo(k0*p3) */ \
799 /* r[4] += hi(k0*p3) */ \
800 /* */ \
801 /* adcx sequence (CF): r[1], r[2], r[3], r[4], then r[1], r[2], r[3] */ \
802 /* adox sequence (OF): r[3], r[4], r[0], r[1], r[2] */ \
803 /* */ \
804 /* {CF=0, OF=0} */ \
805 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a0*b1) {CF->CF1} */ \
806 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a0*b3) {OF->OF1} */ \
807 "adcxq %%r9, %%r15 \n\t" /* r[2] += hi(a0*b1) + CF1 {CF->CF2} */ \
808 \
809 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k0 * p[3] */ \
810 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k0 * p[0] */ \
811 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k0*p3) + CF2 {CF->CF3} */ \
812 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k0*p3) + OF1 {OF->OF2} */ \
813 /* r[4] = hi(a0*b3) + hi(k0*p3) + OF1 < 2^63 + 2^62 + 1 < 2^64 (b[3]<2^63, p[3]<2^62) */ \
814 /* OF2 = 0. */ \
815 "adcxq %[zero_reference], %%r12 \n\t" /* r[4] += CF3 {CF->CF4} */ \
816 /* r[4] < 2^63 + 2^62 + 2 < 2^64, so CF4 = 0 */ \
817 "adoxq %%r8, %%r13 \n\t" /* r[0] += lo(k0*p0) + OF2=0 -> 0 mod 2^64 {OF->OF3} */ \
818 "adcxq %%r9, %%r14 \n\t" /* r[1] += hi(k0*p0) + CF4=0 {CF->CF5} (CF4=0: safe) */ \
819 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k0 * p[1] */ \
820 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k0 * p[2] */ \
821 "adoxq %%rdi, %%r14 \n\t" /* r[1] += lo(k0*p1) + OF3 {OF->OF4} */ \
822 "adcxq %%r11, %%r15 \n\t" /* r[2] += hi(k0*p1) + CF5 {CF->CF6} */ \
823 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k0*p2) + OF4 {OF->OF5} */ \
824 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k0*p2) + CF6 {CF->CF7} */ \
825 \
826 /* Post-round 0: terminal flags (CF7 = carry out of r[3], OF5 = carry out of r[2]) flow into round 1. */ \
827 /* Top limb r[4] <= 2^63 + 2^62 + 2 < 2^64 (same as non-ADX). */ \
828 \
829 /* ===================================================================================== */ \
830 /* ROUND 1: accumulate a[1]*b, then reduce by k1*p. Shift out r[1]. */ \
831 /* Register map: r14=r[1] r15=r[2] r10=r[3] r12=r[4] r13=r[5] */ \
832 /* */ \
833 /* Per-limb totals for a[1]*b: */ \
834 /* r[1] += lo(a1*b0), r[2] += hi(a1*b0) + lo(a1*b1) */ \
835 /* r[3] += hi(a1*b1) + lo(a1*b2), r[4] += hi(a1*b2) + lo(a1*b3) */ \
836 /* r[5] = hi(a1*b3) */ \
837 /* */ \
838 /* Per-limb totals for k1*p (same decomposition, shifted by one): */ \
839 /* r[1] += lo(k1*p0) -> zeroed mod 2^64 */ \
840 /* r[2] += hi(k1*p0) + lo(k1*p1) */ \
841 /* r[3] += hi(k1*p1) + lo(k1*p2) */ \
842 /* r[4] += hi(k1*p2) + lo(k1*p3) */ \
843 /* r[5] += hi(k1*p3) */ \
844 /* */ \
845 /* adcx sequence (CF): r[4], r[5], r[1], r[2], r[3], r[4], r[5], r[2], r[3] */ \
846 /* adox sequence (OF): r[3], r[4], r[5], r[2], r[3], r[4], r[5], r[1], r[2] */ \
847 /* ===================================================================================== */ \
848 "movq " a2 ", %%rdx \n\t" /* rdx = a[1] */ \
849 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[1] * b[2] */ \
850 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (rdi, r13) = a[1] * b[3] -> r13 = r[5] = hi(a1*b3) */ \
851 /* {CF=CF7 from R0, OF=OF5 from R0}; flag labels restart at CF1/OF1 within this round */ \
852 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a1*b2) + OF {OF->OF1} */ \
853 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(a1*b3) + CF {CF->CF1} */ \
854 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(a1*b2) + OF1 {OF->OF2} */ \
855 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF1 {CF->CF2} */ \
856 /* r[5] = hi(a1*b3) + CF1 < 2^63 + 1 < 2^64, so CF2 = 0 */ \
857 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += OF2 {OF->OF3} */ \
858 /* r[5] < 2^63 + 2 < 2^64, so OF3 = 0. Wrap-around is safe. */ \
859 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[1] * b[0] */ \
860 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[1] * b[1] */ \
861 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a1*b0) + CF2=0 {CF->CF3} (CF2=0: safe) */ \
862 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(a1*b0) + OF3=0 {OF->OF4} (OF3=0: safe) */ \
863 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(a1*b1) + CF3 {CF->CF4} */ \
864 "adoxq %%r11, %%r10 \n\t" /* r[3] += hi(a1*b1) + OF4 {OF->OF5} */ \
865 \
866 /* --- k1 * p reduction ---------------------------------------------------------------- */ \
867 "movq %%r14, %%rdx \n\t" /* rdx = r[1] */ \
868 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k1 = r[1] * r_inv mod 2^64 (high half in r8 is unused) */ \
869 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k1 * p[2] */ \
870 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k1 * p[3] */ \
871 "adcxq %%r8, %%r10 \n\t" /* r[3] += lo(k1*p2) + CF4 {CF->CF5} */ \
872 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(k1*p2) + OF5 {OF->OF6} */ \
873 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(k1*p3) + CF5 {CF->CF6} */ \
874 "adoxq %%r11, %%r13 \n\t" /* r[5] += hi(k1*p3) + OF6 {OF->OF7} */ \
875 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF6 {CF->CF7} */ \
876 /* r[5] <= 2^63 + 2^62 + 4 < 2^64, so OF7 = 0 and CF7 = 0. Wrap safe. */ \
877 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k1 * p[0] */ \
878 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k1 * p[1] */ \
879 "adoxq %%r8, %%r14 \n\t" /* r[1] += lo(k1*p0) + OF7=0 {OF->OF8} -> 0 mod 2^64 */ \
880 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(k1*p1) + CF7=0 {CF->CF8} (CF7=0: safe) */ \
881 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(k1*p0) + OF8 {OF->OF9} */ \
882 "adcxq %%r11, %%r10 \n\t" /* r[3] += hi(k1*p1) + CF8 {CF->CF9} */ \
883 \
884 /* Post-round 1: terminal flags (CF9 = carry out of r[3], OF9 = carry out of r[2]) flow into round 2. */ \
885 /* Top limb r[5] <= 2^63 + 2^62 + 4 < 2^64. Invariant holds. */ \
886 \
887 /* ===================================================================================== */ \
888 /* ROUND 2: accumulate a[2]*b, then reduce by k2*p. Shift out r[2]. */ \
889 /* Register map: r15=r[2] r10=r[3] r12=r[4] r13=r[5] r14=r[6] */ \
890 /* */ \
891 /* Per-limb totals for a[2]*b: */ \
892 /* r[2] += lo(a2*b0), r[3] += hi(a2*b0) + lo(a2*b1) */ \
893 /* r[4] += hi(a2*b1) + lo(a2*b2), r[5] += hi(a2*b2) + lo(a2*b3) */ \
894 /* r[6] = hi(a2*b3) */ \
895 /* */ \
896 /* Per-limb totals for k2*p: */ \
897 /* r[2] += lo(k2*p0) -> zeroed mod 2^64 */ \
898 /* r[3] += hi(k2*p0) + lo(k2*p1) */ \
899 /* r[4] += hi(k2*p1) + lo(k2*p2) */ \
900 /* r[5] += hi(k2*p2) + lo(k2*p3) */ \
901 /* r[6] += hi(k2*p3) */ \
902 /* */ \
903 /* adcx sequence (CF): r[4], r[5], r[6], r[2], r[3], r[4], r[5], r[6], r[3] */ \
904 /* adox sequence (OF): r[3], r[4], r[5], r[6], r[3], r[4], r[5], r[6], r[2] */ \
905 /* ===================================================================================== */ \
906 "movq " a3 ", %%rdx \n\t" /* rdx = a[2] */ \
907 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[2] * b[1] */ \
908 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[2] * b[2] */ \
909 /* {CF=CF9 from R1, OF=OF9 from R1} */ \
910 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a2*b1) + OF {OF->OF1} */ \
911 "adcxq %%r11, %%r12 \n\t" /* r[4] += hi(a2*b1) + CF {CF->CF1} */ \
912 "adoxq %%r8, %%r12 \n\t" /* r[4] += lo(a2*b2) + OF1 {OF->OF2} */ \
913 "adcxq %%r9, %%r13 \n\t" /* r[5] += hi(a2*b2) + CF1 {CF->CF2} */ \
914 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (rdi, r14) = a[2] * b[3] -> r14 = r[6] = hi(a2*b3) */ \
915 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[2] * b[0] */ \
916 "adoxq %%rdi, %%r13 \n\t" /* r[5] += lo(a2*b3) + OF2 {OF->OF3} */ \
917 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF2 {CF->CF3} */ \
918 /* r[6] = hi(a2*b3) + CF2 < 2^63 + 1 < 2^64, so CF3 = 0 */ \
919 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += OF3 {OF->OF4} */ \
920 /* r[6] < 2^63 + 2 < 2^64, so OF4 = 0. Wrap-around is safe. */ \
921 "adcxq %%r8, %%r15 \n\t" /* r[2] += lo(a2*b0) + CF3=0 {CF->CF4} (CF3=0: safe) */ \
922 "adoxq %%r9, %%r10 \n\t" /* r[3] += hi(a2*b0) + OF4=0 {OF->OF5} (OF4=0: safe) */ \
923 \
924 /* --- k2 * p reduction ---------------------------------------------------------------- */ \
925 "movq %%r15, %%rdx \n\t" /* rdx = r[2] */ \
926 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k2 = r[2] * r_inv mod 2^64 (high half in r8 is unused) */ \
927 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k2 * p[1] */ \
928 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[2] */ \
929 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k2*p1) + CF4 {CF->CF5} */ \
930 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k2*p1) + OF5 {OF->OF6} */ \
931 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k2*p2) + CF5 {CF->CF6} */ \
932 "adoxq %%r9, %%r13 \n\t" /* r[5] += hi(k2*p2) + OF6 {OF->OF7} */ \
933 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (rdi, r11) = k2 * p[3] */ \
934 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k2 * p[0] */ \
935 "adcxq %%rdi, %%r13 \n\t" /* r[5] += lo(k2*p3) + CF6 {CF->CF7} */ \
936 "adoxq %%r11, %%r14 \n\t" /* r[6] += hi(k2*p3) + OF7 {OF->OF8} */ \
937 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF7 {CF->CF8} */ \
938 /* r[6] <= 2^63 + 2^62 + 4 < 2^64, so OF8 = 0 and CF8 = 0. Wrap safe. */ \
939 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k2*p0) + OF8=0 {OF->OF9} -> 0 mod 2^64 */ \
940 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k2*p0) + CF8=0 {CF->CF9} (CF8=0: safe) */ \
941 \
942 /* Post-round 2: terminal flags (CF9 = carry out of r[3], OF9 = carry out of r[2]) flow into round 3. */ \
943 /* Top limb r[6] <= 2^63 + 2^62 + 4 < 2^64. Invariant holds. */ \
944 \
945 /* ===================================================================================== */ \
946 /* ROUND 3: accumulate a[3]*b, then reduce by k3*p. Shift out r[3]. */ \
947 /* Register map: r10=r[3] r12=r[4] r13=r[5] r14=r[6] r15=r[7] */ \
948 /* Tighter bound: a[3] < 2^63 (not 2^64), so hi(a3*b3) < 2^62. */ \
949 /* ===================================================================================== */ \
950 "movq " a4 ", %%rdx \n\t" /* rdx = a[3] (< 2^63) */ \
951 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[3] * b[0] */ \
952 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (rdi, r11) = a[3] * b[1] */ \
953 /* {CF=CF9 from R2, OF=OF9 from R2} */ \
954 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a3*b0) + OF {OF->OF1} */ \
955 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(a3*b0) + CF {CF->CF1} */ \
956 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(a3*b1) + OF1 {OF->OF2} */ \
957 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(a3*b1) + CF1 {CF->CF2} */ \
958 \
959 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) = a[3] * b[2] */ \
960 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (rdi, r15) = a[3] * b[3] -> r15 = r[7] = hi(a3*b3) */ \
961 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(a3*b2) + OF2 {OF->OF3} */ \
962 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(a3*b2) + CF2 {CF->CF3} */ \
963 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(a3*b3) + OF3 {OF->OF4} */ \
964 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += CF3 {CF->CF4} */ \
965 /* r[7] = hi(a3*b3) + CF3 < 2^62 + 1 < 2^64, so CF4 = 0 */ \
966 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF4 {OF->OF5} */ \
967 /* r[7] < 2^62 + 2 < 2^64, so OF5 = 0. Wrap-around is safe. */ \
968 \
969 /* --- k3 * p reduction ---------------------------------------------------------------- */ \
970 /* */ \
971 /* Per-limb totals for k3*p: */ \
972 /* r[3] += lo(k3*p0) -> zeroed mod 2^64 */ \
973 /* r[4] += hi(k3*p0) + lo(k3*p1) */ \
974 /* r[5] += hi(k3*p1) + lo(k3*p2) */ \
975 /* r[6] += hi(k3*p2) + lo(k3*p3) */ \
976 /* r[7] += hi(k3*p3) */ \
977 "movq %%r10, %%rdx \n\t" /* rdx = r[3] */ \
978 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* rdx = k3 = r[3] * r_inv mod 2^64 (high half in r8 is unused) */ \
979 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[0] */ \
980 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (rdi, r11) = k3 * p[1] */ \
981 /* {CF=CF4=0, OF=OF5=0} (both zero — see bounds on r[7] above) */ \
982 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(k3*p0) + OF5=0 {OF->OF6} -> 0 mod 2^64 */ \
983 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(k3*p0) + CF4=0 {CF->CF5} */ \
984 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(k3*p1) + OF6 {OF->OF7} */ \
985 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(k3*p1) + CF5 {CF->CF6} */ \
986 \
987 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (r8, r9) = k3 * p[2] */ \
988 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (rdi, rdx) = k3 * p[3] (overwrites rdx; last mulxq) */ \
989 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(k3*p2) + OF7 {OF->OF8} */ \
990 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(k3*p2) + CF6 {CF->CF7} */ \
991 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(k3*p3) + OF8 {OF->OF9} */ \
992 "adcxq %%rdx, %%r15 \n\t" /* r[7] += hi(k3*p3) + CF7 {CF->CF8} */ \
993 /* r[7] < (2^62 + 2) + 2^62 + 1 = 2^63 + 3 < 2^64, so CF8 = 0 (p[3]<2^62) */ \
994 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF9 {OF->OF10} */ \
995 /* r[7] < 2^63 + 4 < 2^64, so OF10 = 0. Terminal flags are both 0. */ \
996 \
997 /* Output in (r12, r13, r14, r15) = (r[4], r[5], r[6], r[7]). */ \
998 /* Since S < 2p < 2^255, the top limb r[7] < 2^63. Valid input for another MUL. */
999
1000#endif