712 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
713 return montgomery_mul_big(other);
715#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
717 auto [t0, c] = mul_wide(
data[0], other.data[0]);
718 uint64_t k = t0 * T::r_inv;
719 uint64_t
a = mac_discard_lo(t0, k, modulus.data[0]);
721 uint64_t t1 = mac_mini(
a,
data[0], other.
data[1],
a);
722 mac(t1, k, modulus.data[1], c, t0, c);
723 uint64_t t2 = mac_mini(
a,
data[0], other.
data[2],
a);
724 mac(t2, k, modulus.data[2], c, t1, c);
725 uint64_t t3 = mac_mini(
a,
data[0], other.
data[3],
a);
726 mac(t3, k, modulus.data[3], c, t2, c);
729 mac_mini(t0,
data[1], other.data[0], t0,
a);
731 c = mac_discard_lo(t0, k, modulus.data[0]);
732 mac(t1,
data[1], other.data[1],
a, t1,
a);
733 mac(t1, k, modulus.data[1], c, t0, c);
734 mac(t2,
data[1], other.data[2],
a, t2,
a);
735 mac(t2, k, modulus.data[2], c, t1, c);
736 mac(t3,
data[1], other.data[3],
a, t3,
a);
737 mac(t3, k, modulus.data[3], c, t2, c);
740 mac_mini(t0,
data[2], other.data[0], t0,
a);
742 c = mac_discard_lo(t0, k, modulus.data[0]);
743 mac(t1,
data[2], other.data[1],
a, t1,
a);
744 mac(t1, k, modulus.data[1], c, t0, c);
745 mac(t2,
data[2], other.data[2],
a, t2,
a);
746 mac(t2, k, modulus.data[2], c, t1, c);
747 mac(t3,
data[2], other.data[3],
a, t3,
a);
748 mac(t3, k, modulus.data[3], c, t2, c);
751 mac_mini(t0,
data[3], other.data[0], t0,
a);
753 c = mac_discard_lo(t0, k, modulus.data[0]);
754 mac(t1,
data[3], other.data[1],
a, t1,
a);
755 mac(t1, k, modulus.data[1], c, t0, c);
756 mac(t2,
data[3], other.data[2],
a, t2,
a);
757 mac(t2, k, modulus.data[2], c, t1, c);
758 mac(t3,
data[3], other.data[3],
a, t3,
a);
759 mac(t3, k, modulus.data[3], c, t2, c);
762 field result{ t0, t1, t2, t3 };
771 auto left = wasm_convert(
data);
772 auto right = wasm_convert(other.data);
773 constexpr uint64_t mask = 0x1fffffff;
783 uint64_t pl0 = left[0] * right[0];
784 uint64_t pl1 = left[0] * right[1] + left[1] * right[0];
785 uint64_t pl2 = left[0] * right[2] + left[1] * right[1] + left[2] * right[0];
786 uint64_t pl3 = left[0] * right[3] + left[1] * right[2] + left[2] * right[1] + left[3] * right[0];
788 left[0] * right[4] + left[1] * right[3] + left[2] * right[2] + left[3] * right[1] + left[4] * right[0];
789 uint64_t pl5 = left[1] * right[4] + left[2] * right[3] + left[3] * right[2] + left[4] * right[1];
790 uint64_t pl6 = left[2] * right[4] + left[3] * right[3] + left[4] * right[2];
791 uint64_t pl7 = left[3] * right[4] + left[4] * right[3];
792 uint64_t pl8 = left[4] * right[4];
795 uint64_t ph0 = left[5] * right[5];
796 uint64_t ph1 = left[5] * right[6] + left[6] * right[5];
797 uint64_t ph2 = left[5] * right[7] + left[6] * right[6] + left[7] * right[5];
798 uint64_t ph3 = left[5] * right[8] + left[6] * right[7] + left[7] * right[6] + left[8] * right[5];
799 uint64_t ph4 = left[6] * right[8] + left[7] * right[7] + left[8] * right[6];
800 uint64_t ph5 = left[7] * right[8] + left[8] * right[7];
801 uint64_t ph6 = left[8] * right[8];
804 uint64_t sl0 = left[0] + left[5];
805 uint64_t sl1 = left[1] + left[6];
806 uint64_t sl2 = left[2] + left[7];
807 uint64_t sl3 = left[3] + left[8];
808 uint64_t sl4 = left[4];
809 uint64_t sr0 = right[0] + right[5];
810 uint64_t sr1 = right[1] + right[6];
811 uint64_t sr2 = right[2] + right[7];
812 uint64_t sr3 = right[3] + right[8];
813 uint64_t sr4 = right[4];
816 uint64_t pc0 = sl0 * sr0;
817 uint64_t pc1 = sl0 * sr1 + sl1 * sr0;
818 uint64_t pc2 = sl0 * sr2 + sl1 * sr1 + sl2 * sr0;
819 uint64_t pc3 = sl0 * sr3 + sl1 * sr2 + sl2 * sr1 + sl3 * sr0;
820 uint64_t pc4 = sl0 * sr4 + sl1 * sr3 + sl2 * sr2 + sl3 * sr1 + sl4 * sr0;
821 uint64_t pc5 = sl1 * sr4 + sl2 * sr3 + sl3 * sr2 + sl4 * sr1;
822 uint64_t pc6 = sl2 * sr4 + sl3 * sr3 + sl4 * sr2;
823 uint64_t pc7 = sl3 * sr4 + sl4 * sr3;
824 uint64_t pc8 = sl4 * sr4;
828 uint64_t temp_0 = pl0;
829 uint64_t temp_1 = pl1;
830 uint64_t temp_2 = pl2;
831 uint64_t temp_3 = pl3;
832 uint64_t temp_4 = pl4;
833 uint64_t temp_5 = pl5 + (pc0 - pl0 - ph0);
834 uint64_t temp_6 = pl6 + (pc1 - pl1 - ph1);
835 uint64_t temp_7 = pl7 + (pc2 - pl2 - ph2);
836 uint64_t temp_8 = pl8 + (pc3 - pl3 - ph3);
837 uint64_t temp_9 = pc4 - pl4 - ph4;
838 uint64_t temp_10 = (pc5 - pl5 - ph5) + ph0;
839 uint64_t temp_11 = (pc6 - pl6 - ph6) + ph1;
840 uint64_t temp_12 = (pc7 - pl7) + ph2;
841 uint64_t temp_13 = (pc8 - pl8) + ph3;
842 uint64_t temp_14 = ph4;
843 uint64_t temp_15 = ph5;
844 uint64_t temp_16 = ph6;
848 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
849 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
850 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
851 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
852 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
853 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
854 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
855 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
861 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
893 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
894 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
895 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
896 (temp_15 >> 18) | (temp_16 << 11) };
907 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
908 return montgomery_mul_big(*
this);
910#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
911 uint64_t carry_hi = 0;
913 auto [t0, carry_lo] = mul_wide(
data[0],
data[0]);
914 uint64_t t1 = square_accumulate(0,
data[1],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
915 uint64_t t2 = square_accumulate(0,
data[2],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
916 uint64_t t3 = square_accumulate(0,
data[3],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
918 uint64_t round_carry = carry_lo;
919 uint64_t k = t0 * T::r_inv;
920 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
921 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
922 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
923 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
924 t3 = carry_lo + round_carry;
926 t1 = mac_mini(t1,
data[1],
data[1], carry_lo);
928 t2 = square_accumulate(t2,
data[2],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
929 t3 = square_accumulate(t3,
data[3],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
930 round_carry = carry_lo;
932 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
933 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
934 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
935 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
936 t3 = carry_lo + round_carry;
938 t2 = mac_mini(t2,
data[2],
data[2], carry_lo);
940 t3 = square_accumulate(t3,
data[3],
data[2], carry_lo, carry_hi, carry_lo, carry_hi);
941 round_carry = carry_lo;
943 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
944 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
945 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
946 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
947 t3 = carry_lo + round_carry;
949 t3 = mac_mini(t3,
data[3],
data[3], carry_lo);
951 round_carry = carry_lo;
952 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
953 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
954 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
955 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
956 t3 = carry_lo + round_carry;
958 field result{ t0, t1, t2, t3 };
966 auto left = wasm_convert(
data);
967 constexpr uint64_t mask = 0x1fffffff;
978 uint64_t temp_10 = 0;
979 uint64_t temp_11 = 0;
980 uint64_t temp_12 = 0;
981 uint64_t temp_13 = 0;
982 uint64_t temp_14 = 0;
983 uint64_t temp_15 = 0;
984 uint64_t temp_16 = 0;
987 temp_0 += left[0] * left[0];
989 acc += left[0] * left[1];
990 temp_1 += (acc << 1);
992 acc += left[0] * left[2];
993 temp_2 += left[1] * left[1];
994 temp_2 += (acc << 1);
996 acc += left[0] * left[3];
997 acc += left[1] * left[2];
998 temp_3 += (acc << 1);
1000 acc += left[0] * left[4];
1001 acc += left[1] * left[3];
1002 temp_4 += left[2] * left[2];
1003 temp_4 += (acc << 1);
1005 acc += left[0] * left[5];
1006 acc += left[1] * left[4];
1007 acc += left[2] * left[3];
1008 temp_5 += (acc << 1);
1010 acc += left[0] * left[6];
1011 acc += left[1] * left[5];
1012 acc += left[2] * left[4];
1013 temp_6 += left[3] * left[3];
1014 temp_6 += (acc << 1);
1016 acc += left[0] * left[7];
1017 acc += left[1] * left[6];
1018 acc += left[2] * left[5];
1019 acc += left[3] * left[4];
1020 temp_7 += (acc << 1);
1022 acc += left[0] * left[8];
1023 acc += left[1] * left[7];
1024 acc += left[2] * left[6];
1025 acc += left[3] * left[5];
1026 temp_8 += left[4] * left[4];
1027 temp_8 += (acc << 1);
1029 acc += left[1] * left[8];
1030 acc += left[2] * left[7];
1031 acc += left[3] * left[6];
1032 acc += left[4] * left[5];
1033 temp_9 += (acc << 1);
1035 acc += left[2] * left[8];
1036 acc += left[3] * left[7];
1037 acc += left[4] * left[6];
1038 temp_10 += left[5] * left[5];
1039 temp_10 += (acc << 1);
1041 acc += left[3] * left[8];
1042 acc += left[4] * left[7];
1043 acc += left[5] * left[6];
1044 temp_11 += (acc << 1);
1046 acc += left[4] * left[8];
1047 acc += left[5] * left[7];
1048 temp_12 += left[6] * left[6];
1049 temp_12 += (acc << 1);
1051 acc += left[5] * left[8];
1052 acc += left[6] * left[7];
1053 temp_13 += (acc << 1);
1055 acc += left[6] * left[8];
1056 temp_14 += left[7] * left[7];
1057 temp_14 += (acc << 1);
1059 acc += left[7] * left[8];
1060 temp_15 += (acc << 1);
1061 temp_16 += left[8] * left[8];
1065 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
1066 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
1067 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
1068 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
1069 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
1070 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
1071 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
1072 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1090 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1108 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
1109 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
1110 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
1111 (temp_15 >> 18) | (temp_16 << 11) };