BPF loads with BPF_PROBE_MEM(SX) can load from unsafe pointers and the JIT adds an exception table entry for the JITed instruction which allows the exeption handler to set the destination register of the load to zero and continue execution from the next instruction. As all arm64 instructions are AARCH64_INSN_SIZE size, the exception handler can just increment the pc by AARCH64_INSN_SIZE without needing the exact address of the instruction following the the faulting instruction. Simplify the exception table usage in arm64 JIT by only saving the destination register in ex->fixup and drop everything related to the fixup_offset. The fault handler is modified to add AARCH64_INSN_SIZE to the pc. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song --- arch/arm64/net/bpf_jit_comp.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c47..42643fd9168fc 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,19 +1066,18 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } -#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { - off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; - regs->pc = (unsigned long)&ex->fixup - offset; + /* Skip the faulting instruction */ + regs->pc += AARCH64_INSN_SIZE; return true; } @@ -1088,7 +1087,6 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; - off_t fixup_offset; unsigned long pc; struct exception_table_entry *ex; @@ -1119,22 +1117,6 @@ static int add_exception_handler(const struct bpf_insn *insn, if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN)) return -ERANGE; - /* - * Since the extable follows the program, the fixup offset is always - * negative and limited to BPF_JIT_REGION_SIZE. Store a positive value - * to keep things simple, and put the destination register in the upper - * bits. We don't need to worry about buildtime or runtime sort - * modifying the upper bits because the table is already sorted, and - * isn't part of the main exception table. - * - * The fixup_offset is set to the next instruction from the instruction - * that may fault. The execution will jump to this after handling the - * fault. - */ - fixup_offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) - return -ERANGE; - /* * The offsets above have been calculated using the RO buffer but we * need to use the R/W buffer for writes. @@ -1147,8 +1129,7 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_CLASS(insn->code) != BPF_LDX) dst_reg = DONT_CLEAR; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | - FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; -- 2.47.3 Begin reporting arena page faults and the faulting address to BPF program's stderr, this patch adds support in the arm64 and x86-64 JITs, support for other archs can be added later. The fault handlers receive the 32 bit address in the arena region so the upper 32 bits of user_vm_start is added to it before printing the address. This is what the user would expect to see as this is what is printed by bpf_printk() is you pass it an address returned by bpf_arena_alloc_pages(); Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song --- arch/arm64/net/bpf_jit_comp.c | 52 +++++++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 79 +++++++++++++++++++++++++++++++++-- include/linux/bpf.h | 1 + kernel/bpf/arena.c | 20 +++++++++ 4 files changed, 148 insertions(+), 4 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 42643fd9168fc..5083886d6e66b 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,6 +1066,30 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` field in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+-----------+----------+ + * | 31-27 | 26-22 | 21 | 20-16 | 15-0 | + * | | | | | | + * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG | OFFSET | + * +-----------+--------+-----------+-----------+----------+ + * + * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction. + * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when + * accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to + * DONT_CLEAR if it is a store instruction. + */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ @@ -1073,11 +1097,22 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; /* Skip the faulting instruction */ regs->pc += AARCH64_INSN_SIZE; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr); + } + return true; } @@ -1087,6 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1100,6 +1138,9 @@ static int add_exception_handler(const struct bpf_insn *insn, BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1131,6 +1172,17 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } + ex->type = EX_TYPE_BPF; ctx->exentry_idx++; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca1646203..b75dea55df5a2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1388,16 +1389,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. + */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr); + } + return true; } @@ -2070,6 +2122,8 @@ st: if (is_imm8(insn->off)) { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; + bool is_arena; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2143,25 @@ st: if (is_imm8(insn->off)) ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + + fixup_reg = (BPF_CLASS(insn->code) == BPF_LDX) ? + reg2pt_regs[dst_reg] : DONT_CLEAR; + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + + if (is_arena) { + ex->fixup |= FIXUP_ARENA_ACCESS; + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = reg2pt_regs[src_reg]; + else + arena_reg = reg2pt_regs[dst_reg]; + + ex->fixup |= FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg); + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); + } } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a89..bf42a2cfc635e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3659,6 +3659,7 @@ int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); +void bpf_prog_report_arena_violation(bool write, unsigned long addr); #define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__) #define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&ss) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 5b37753799d20..a1653d1c04ca5 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -633,3 +633,23 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + u64 user_vm_start; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + + user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + addr += (user_vm_start >> 32) << 32; + + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? "WRITE" : "READ", addr); + bpf_stream_dump_stack(ss); + })); +} -- 2.47.3 Add selftests for testing the reporting of arena page faults through BPF streams. Two new bpf programs are added that read and write to an unmapped arena address and the fault reporting is verified in the userspace through streams. Signed-off-by: Puranjay Mohan --- .../testing/selftests/bpf/prog_tests/stream.c | 33 +++++++++++++++- tools/testing/selftests/bpf/progs/stream.c | 39 +++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index 36a1a1ebde692..8fdc83260ea14 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -41,6 +41,22 @@ struct { "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" "|[ \t]+[^\n]+\n)*", }, + { + offsetof(struct stream, progs.stream_arena_read_fault), + "ERROR: Arena READ access at unmapped address 0x.*\n" + "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" + "Call trace:\n" + "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" + "|[ \t]+[^\n]+\n)*", + }, + { + offsetof(struct stream, progs.stream_arena_write_fault), + "ERROR: Arena WRITE access at unmapped address 0x.*\n" + "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" + "Call trace:\n" + "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" + "|[ \t]+[^\n]+\n)*", + }, }; static int match_regex(const char *pattern, const char *string) @@ -63,6 +79,7 @@ void test_stream_errors(void) struct stream *skel; int ret, prog_fd; char buf[1024]; + char fault_addr[64] = {0}; skel = stream__open_and_load(); if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) @@ -85,6 +102,14 @@ void test_stream_errors(void) continue; } #endif +#if !defined(__x86_64__) && !defined(__aarch64__) + ASSERT_TRUE(1, "Arena fault reporting unsupported, skip."); + if (i == 2 || i == 3) { + ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); + ASSERT_EQ(ret, 0, "stream read"); + continue; + } +#endif ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); ASSERT_GT(ret, 0, "stream read"); @@ -92,8 +117,14 @@ void test_stream_errors(void) buf[ret] = '\0'; ret = match_regex(stream_error_arr[i].errstr, buf); - if (!ASSERT_TRUE(ret == 1, "regex match")) + if (ret && (i == 2 || i == 3)) { + sprintf(fault_addr, "0x%lx", skel->bss->fault_addr); + ret = match_regex(fault_addr, buf); + } + if (!ASSERT_TRUE(ret == 1, "regex match")) { fprintf(stderr, "Output from stream:\n%s\n", buf); + fprintf(stderr, "Fault Addr: 0x%lx\n", skel->bss->fault_addr); + } } stream__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 35790897dc879..9de015ac3ced5 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,6 +5,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" +#include "bpf_arena_common.h" struct arr_elem { struct bpf_res_spin_lock lock; @@ -17,10 +18,17 @@ struct { __type(value, struct arr_elem); } arrmap SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); /* number of pages */ +} arena SEC(".maps"); + #define ENOSPC 28 #define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" int size; +u64 fault_addr; SEC("syscall") __success __retval(0) @@ -76,4 +84,35 @@ int stream_syscall(void *ctx) return 0; } +SEC("syscall") +__success __retval(0) +int stream_arena_write_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + + fault_addr = user_vm_start + 0xbeef; + *(u32 __arena *)(user_vm_start + 0xbeef) = 1; + + return 0; +} + +SEC("syscall") +__success __retval(0) +int stream_arena_read_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + + fault_addr = user_vm_start + 0xbeef; + + return *(u32 __arena *)(user_vm_start + 0xbeef); +} + char _license[] SEC("license") = "GPL"; -- 2.47.3