Replace slow 64-bit div/idiv instructions (10-20 cycles on latest x86,
even slower with older models) with fast multiply-and-shift sequences
(~4-5 cycles) for BPF_K div/mod operations, using the
Granlund-Montgomery algorithm.

For each 64-bit BPF_ALU64_{DIV,MOD}_K instruction with a non-zero
immediate divisor, the JIT computes a magic multiplier and shift at
JIT compile time via choose_multiplier_64(), then emits:

  Unsigned: floor(n/d) = mulhi(n, m) >> s
  Signed:   trunc(n/d) = mulhi_signed(n, m) >> s + sign_correction

Both unsigned (mul + wide correction) and signed (imul + add-n
correction) variants are implemented. Remainder is computed as
r = n - q * d. The optimization targets 64-bit operations only, as
LLVM already optimizes 32-bit cases at the compiler level.

The change doesn't put much effort in optimizing cases expected to
be handled by LLVM, such as 32-bit division by an immediate, or
power-of-2 divisors, which produce correct but suboptimal code sequence
(still faster than div/idiv). BPF LLVM cannot apply the optimization for
64-bit division because it would need an instruction that can do 128-bit
by 64-bit division (like x86's DIV and IDIV)

- div128_u64(): inline asm helper for 128-bit division using two divq
  steps, avoiding libgcc dependency
- choose_multiplier_64(): compute magic multiplier m, shift s, and
  is_wide flag for a given divisor and precision
- emit_udivmod_const(): unsigned 64-bit div/mod code generation with
  mul/shr and optional wide correction ((n-t)/2+t) path
- emit_sdivmod_const(): signed 64-bit div/mod code generation with
  imul/sar, add-n correction for overflowing m, sign correction
  (add sign bit to round toward zero), and negation for negative
  divisors
- emit_divmod_const(): top-level dispatch; falls back to div/idiv for
  32-bit, divisor=0, or negative unsigned immediates

Test Plan:

 test_bpf: Summary: 1061 PASSED, 0 FAILED, [1049/1049 JIT'ed]
 test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed]
 test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 286 ++++++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ea9e707e8abf..39182355d61d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1649,6 +1649,286 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip,
 	return 0;
 }
 
+/*
+ * ---------- Division by compile-time constant optimization ----------
+ *
+ * For 64-bit BPF_K (immediate divisor) div/mod operations, replace slow
+ * div/idiv instructions (15+ cycles) with fast multiply+shift sequences
+ * (~4 cycles).
+ *
+ * LLVM already optimizes power-of-2 divisors and 32-bit operations at the
+ * compiler level, so this JIT optimization only focuses on 64-bit division by
+ * general constants, both signed and unsigned. For power-of-2 divisors, the
+ * algorithm generates instructions yielding correct results, but isn't the
+ * most optimal for performance (still faster than div/idiv though).
+ */
+
+struct bpf_jit_div_const {
+	u64 m;        /* magic multiplier */
+	u8 shift;     /* post-shift amount */
+	bool is_wide; /* unsigned: m > 2^64, needs (n-t)/2+t correction
+		       * signed: m overflows, needs add-n correction
+		       */
+};
+
+/*
+ * Divide a 128-bit value (n_hi:n_lo) by a u64 divisor using x86-64 divq.
+ * Do not rely on libgcc for 128-bit division. Two divq steps give a full
+ * 128-bit quotient.
+ */
+static inline unsigned __int128 div128_u64(u64 n_hi, u64 n_lo, u64 d)
+{
+	u64 q_hi, q_lo, rem;
+
+	asm("divq %[d]" : "=a"(q_hi), "=d"(rem)
+	    : "a"(n_hi), "d"(0ULL), [d] "rm"(d));
+	asm("divq %[d]" : "=a"(q_lo), "=d"(rem)
+	    : "a"(n_lo), "d"(rem), [d] "rm"(d));
+
+	return ((unsigned __int128)q_hi << 64) | q_lo;
+}
+
+/*
+ * Granlund-Montgomery choose_multiplier for 64-bit division by constant.
+ * Computes magic multiplier m and shift s such that:
+ *   unsigned: floor(n/d) = mulhi(n, m) >> s  (with possible wide correction)
+ *   signed:   floor(n/d) = mulhi_signed(n, m) >> s + sign_correction
+ *
+ * prec = 64 for unsigned, 63 for signed (sign bit reduces usable range).
+ */
+static bool choose_multiplier_64(u32 d, u8 prec, struct bpf_jit_div_const *c)
+{
+	unsigned __int128 mlow, mhigh;
+	u32 l, post_shift;
+
+	if (d <= 1)
+		return false;
+
+	l = fls(d - 1);
+
+	mlow = div128_u64(1ULL << l, 0, d);
+	mhigh = div128_u64(1ULL << l, 1ULL << (l + 64 - prec), d);
+
+	post_shift = l;
+	while (post_shift > 0) {
+		unsigned __int128 lo = mlow >> 1, hi = mhigh >> 1;
+
+		if (lo >= hi)
+			break;
+		mlow = lo;
+		mhigh = hi;
+		post_shift--;
+	}
+
+	c->m = (u64)mhigh;
+	c->shift = post_shift;
+	c->is_wide = (mhigh >> 64) != 0;
+	return true;
+}
+
+/* Unsigned 64-bit div/mod by constant via multiply+shift */
+static void emit_udivmod_const(u8 **pprog, u32 dst_reg, u32 d,
+			       const struct bpf_jit_div_const *c, bool is_mod)
+{
+	u8 *prog = *pprog;
+
+	/* For MOD, save original n for remainder = n - q * d */
+	if (is_mod) {
+		maybe_emit_1mod(&prog, dst_reg, false); /* push dst */
+		EMIT1(add_1reg(0x50, dst_reg));
+	}
+
+	if (dst_reg != BPF_REG_0)
+		EMIT1(0x50); /* push rax */
+	if (dst_reg != BPF_REG_3)
+		EMIT1(0x52); /* push rdx */
+	if (dst_reg != BPF_REG_0)
+		emit_mov_reg(&prog, true, BPF_REG_0, dst_reg); /* mov rax, dst */
+	if (c->is_wide)
+		EMIT1(0x50); /* push rax (= n, for wide correction) */
+
+	/* movabs r11, m */
+	emit_mov_imm64(&prog, AUX_REG, (u32)(c->m >> 32), (u32)c->m);
+
+	/* mul r11 -> rdx:rax = rax * r11 (unsigned) */
+	EMIT1(add_1mod(0x48, AUX_REG));
+	EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+
+	if (c->is_wide) {
+		/*
+		 * Wide correction: quotient = ((n - t) >> 1 + t) >> (shift-1)
+		 * where t = mulhi(n, m) in rdx
+		 */
+		EMIT2(0x41, 0x5B); /* pop r11 (= n) */
+		/* sub r11, rdx */
+		EMIT1(add_2mod(0x48, AUX_REG, BPF_REG_3));
+		EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_3));
+		/* shr r11, 1 */
+		EMIT1(add_1mod(0x48, AUX_REG));
+		EMIT2(0xD1, add_1reg(0xE8, AUX_REG));
+		/* add r11, rdx */
+		EMIT1(add_2mod(0x48, AUX_REG, BPF_REG_3));
+		EMIT2(0x01, add_2reg(0xC0, AUX_REG, BPF_REG_3));
+		/* shr r11, shift-1 */
+		if (c->shift > 1) {
+			EMIT1(add_1mod(0x48, AUX_REG));
+			EMIT3(0xC1, add_1reg(0xE8, AUX_REG),
+			      c->shift - 1);
+		}
+		/* mov dst, r11 */
+		emit_mov_reg(&prog, true, dst_reg, AUX_REG);
+	} else {
+		/* shr rdx, shift */
+		if (c->shift) {
+			EMIT1(0x48);
+			EMIT3(0xC1, add_1reg(0xE8, BPF_REG_3), c->shift);
+		}
+		/* mov dst, rdx */
+		if (dst_reg != BPF_REG_3)
+			emit_mov_reg(&prog, true, dst_reg, BPF_REG_3);
+	}
+
+	if (dst_reg != BPF_REG_3)
+		EMIT1(0x5A); /* pop rdx */
+	if (dst_reg != BPF_REG_0)
+		EMIT1(0x58); /* pop rax */
+
+	/* dst = quotient; compute remainder if needed */
+	if (is_mod) {
+		/* imul r11, dst, d -> r11 = quotient * d */
+		maybe_emit_mod(&prog, dst_reg, AUX_REG, true);
+		if (is_imm8(d))
+			EMIT3(0x6B, add_2reg(0xC0, dst_reg, AUX_REG), d);
+		else
+			EMIT2_off32(0x69,
+				    add_2reg(0xC0, dst_reg, AUX_REG), d);
+		/* pop dst (= original n) */
+		maybe_emit_1mod(&prog, dst_reg, false);
+		EMIT1(add_1reg(0x58, dst_reg));
+		/* sub dst, r11 -> remainder = n - q*d */
+		maybe_emit_mod(&prog, dst_reg, AUX_REG, true);
+		EMIT2(0x29, add_2reg(0xC0, dst_reg, AUX_REG));
+	}
+
+	*pprog = prog;
+}
+
+/* Signed 64-bit div/mod by constant via multiply+shift */
+static void emit_sdivmod_const(u8 **pprog, u32 dst_reg, s32 imm,
+			       const struct bpf_jit_div_const *c, bool is_mod)
+{
+	u8 *prog = *pprog;
+	bool neg_divisor = (imm < 0);
+
+	/* For MOD, save original n for remainder = n - q * d */
+	if (is_mod) {
+		maybe_emit_1mod(&prog, dst_reg, false); /* push dst */
+		EMIT1(add_1reg(0x50, dst_reg));
+	}
+
+	if (dst_reg != BPF_REG_0)
+		EMIT1(0x50); /* push rax */
+	if (dst_reg != BPF_REG_3)
+		EMIT1(0x52); /* push rdx */
+	if (dst_reg != BPF_REG_0)
+		emit_mov_reg(&prog, true, BPF_REG_0, dst_reg); /* mov rax, dst */
+	if (c->is_wide)
+		EMIT1(0x50); /* push rax (= n, for wide correction) */
+
+	/* movabs r11, m */
+	emit_mov_imm64(&prog, AUX_REG, (u32)(c->m >> 32), (u32)c->m);
+
+	/* imul r11 -> rdx:rax = rax * r11 (signed) */
+	EMIT1(add_1mod(0x48, AUX_REG));
+	EMIT2(0xF7, add_1reg(0xE8, AUX_REG));
+
+	if (c->is_wide) {
+		/* Wide correction: rdx += n (bit 63 set in m) */
+		EMIT2(0x41, 0x5B); /* pop r11 (= n) */
+		/* add rdx, r11 */
+		EMIT1(add_2mod(0x48, BPF_REG_3, AUX_REG));
+		EMIT2(0x01, add_2reg(0xC0, BPF_REG_3, AUX_REG));
+	}
+
+	/* sar rdx, shift */
+	if (c->shift) {
+		EMIT1(0x48);
+		EMIT3(0xC1, add_1reg(0xF8, BPF_REG_3), c->shift);
+	}
+
+	/* Sign correction: round toward zero by adding sign bit */
+	emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3); /* mov r11, rdx */
+	EMIT1(add_1mod(0x48, AUX_REG));
+	EMIT3(0xC1, add_1reg(0xE8, AUX_REG), 63); /* shr r11, 63 */
+	EMIT1(add_2mod(0x48, BPF_REG_3, AUX_REG));
+	EMIT2(0x01, add_2reg(0xC0, BPF_REG_3, AUX_REG)); /* add rdx, r11 */
+
+	/* mov dst, rdx */
+	if (dst_reg != BPF_REG_3)
+		emit_mov_reg(&prog, true, dst_reg, BPF_REG_3);
+	if (dst_reg != BPF_REG_3)
+		EMIT1(0x5A); /* pop rdx */
+	if (dst_reg != BPF_REG_0)
+		EMIT1(0x58); /* pop rax */
+
+	/* neg dst (if divisor was negative) */
+	if (neg_divisor) {
+		EMIT1(add_1mod(0x48, dst_reg));
+		EMIT2(0xF7, add_1reg(0xD8, dst_reg));
+	}
+
+	/* dst = quotient; compute remainder if needed */
+	if (is_mod) {
+		/* imul r11, dst, d -> r11 = quotient * d */
+		maybe_emit_mod(&prog, dst_reg, AUX_REG, true);
+		if (is_imm8(imm))
+			EMIT3(0x6B, add_2reg(0xC0, dst_reg, AUX_REG), imm);
+		else
+			EMIT2_off32(0x69,
+				    add_2reg(0xC0, dst_reg, AUX_REG), imm);
+		/* pop dst (= original n) */
+		maybe_emit_1mod(&prog, dst_reg, false);
+		EMIT1(add_1reg(0x58, dst_reg));
+		/* sub dst, r11 -> remainder = n - q*d */
+		maybe_emit_mod(&prog, dst_reg, AUX_REG, true);
+		EMIT2(0x29, add_2reg(0xC0, dst_reg, AUX_REG));
+	}
+
+	*pprog = prog;
+}
+
+static bool emit_divmod_const(u8 **pprog, u32 dst_reg, s32 imm, bool is64,
+			      bool is_mod, bool is_signed)
+{
+	struct bpf_jit_div_const c;
+
+    /* Assuming that LLVM already optimizes 32-bit */
+	if (!is64 || imm == 0)
+		return false;
+
+	if (!is_signed) {
+		if (imm < 0)
+			return false;
+		if (!choose_multiplier_64((u32)imm, 64, &c))
+			return false;
+		emit_udivmod_const(pprog, dst_reg, (u32)imm, &c, is_mod);
+	} else {
+		u32 ad = (imm > 0) ? (u32)imm : (u32)(-imm);
+
+		if (ad <= 1)
+			return false;
+		if (!choose_multiplier_64(ad, 63, &c))
+			return false;
+		/* For signed, is_wide means bit 63 is set (m appears
+		 * negative to imul), not that it exceeds 2^64.
+		 */
+		c.is_wide = (c.m >> 63) != 0;
+		emit_sdivmod_const(pprog, dst_reg, imm, &c, is_mod);
+	}
+
+	return true;
+}
+
 static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1891,6 +2171,12 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *
 		case BPF_ALU64 | BPF_DIV | BPF_K: {
 			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
 
+			if (BPF_SRC(insn->code) == BPF_K &&
+			    emit_divmod_const(&prog, dst_reg, imm32, is64,
+					     BPF_OP(insn->code) == BPF_MOD,
+					     insn->off != 0))
+				break;
+
 			if (dst_reg != BPF_REG_0)
 				EMIT1(0x50); /* push rax */
 			if (dst_reg != BPF_REG_3)
-- 
2.52.0