From: Abhishek Dubey Ensure the dummy trampoline address field present between the OOL stub and the long branch stub is 8-byte aligned, for compatibility when loaded into a register. Fixes: d243b62b7bd3 ("powerpc64/bpf: Add support for bpf trampolines") Cc: stable@vger.kernel.org Signed-off-by: Abhishek Dubey --- arch/powerpc/net/bpf_jit.h | 4 ++-- arch/powerpc/net/bpf_jit_comp.c | 34 ++++++++++++++++++++++++++----- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index f32de8704d4d..71e6e7d01057 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -214,8 +214,8 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context * int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx, u32 *addrs, int pass, bool extra_pass); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); -void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); -void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx); +void bpf_jit_build_epilogue(u32 *image, u32 *fimage, struct codegen_context *ctx); +void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr); void prepare_for_fsession_fentry(u32 *image, struct codegen_context *ctx, int cookie_cnt, diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 9b2b456b0765..3cbb3647f7a0 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -49,11 +49,34 @@ asm ( " .popsection ;" ); -void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx) +void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context *ctx) { int ool_stub_idx, 
long_branch_stub_idx; /* + * In the final pass, align the mis-aligned dummy tramp address + * in the fimage. The alignment NOP must appear before OOL stub, + * to make ool_stub_idx & long_branch_stub_idx constant from end. + * + * Need alignment NOP in following conditions: + * + * OOL stub aligned CONFIG_PPC_FTRACE_OUT_OF_LINE Alignment NOP + * Y Y N + * Y N Y + * N Y Y + * N N N + */ +#ifdef CONFIG_PPC64 + if (fimage && image) { + unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx); + + if (IS_ALIGNED(pc, 8) ^ + IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + EMIT(PPC_RAW_NOP()); + } +#endif + + /* nop // optional, for alignment of dummy_tramp_addr * Out-of-line stub: * mflr r0 * [b|bl] tramp @@ -70,7 +93,7 @@ void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx) /* * Long branch stub: - * .long + * .long // 8-byte aligned * mflr r11 * bcl 20,31,$+4 * mflr r12 @@ -81,6 +104,7 @@ void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx) */ if (image) *((unsigned long *)&image[ctx->idx]) = (unsigned long)dummy_tramp; + ctx->idx += SZL / 4; long_branch_stub_idx = ctx->idx; EMIT(PPC_RAW_MFLR(_R11)); @@ -107,7 +131,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, PPC_JMP(ctx->alt_exit_addr); } else { ctx->alt_exit_addr = ctx->idx * 4; - bpf_jit_build_epilogue(image, ctx); + bpf_jit_build_epilogue(image, NULL, ctx); } return 0; @@ -308,7 +332,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ bpf_jit_build_prologue(NULL, &cgctx); addrs[fp->len] = cgctx.idx * 4; - bpf_jit_build_epilogue(NULL, &cgctx); + bpf_jit_build_epilogue(NULL, NULL, &cgctx); fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN * 4; extable_len = fp->aux->num_exentries * sizeof(struct exception_table_entry); @@ -343,7 +367,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp = org_fp; goto out_addrs; } - bpf_jit_build_epilogue(code_base, &cgctx); + bpf_jit_build_epilogue(code_base, fcode_base, &cgctx); 
if (bpf_jit_enable > 1) pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass, diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index d39241444cd9..57992c1e8386 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -398,7 +398,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx } } -void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) +void bpf_jit_build_epilogue(u32 *image, u32 *fimage, struct codegen_context *ctx) { bpf_jit_emit_common_epilogue(image, ctx); @@ -407,7 +407,7 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) EMIT(PPC_RAW_BLR()); - bpf_jit_build_fentry_stubs(image, ctx); + bpf_jit_build_fentry_stubs(image, fimage, ctx); } /* -- 2.52.0 From: Abhishek Dubey Move the long branch address space to the bottom of the long branch stub. This allows uninterrupted disassembly until the last 8 bytes. Exclude these last bytes from the overall program length to prevent failure in assembly generation. Also, align dummy_tramp_addr field with 8-byte boundary. Following is disassembler output for test program with moved down dummy_tramp_addr field: ..... ..... pc:68 left:44 a6 03 08 7c : mtlr 0 pc:72 left:40 bc ff ff 4b : b .-68 pc:76 left:36 a6 02 68 7d : mflr 11 pc:80 left:32 05 00 9f 42 : bcl 20, 31, .+4 pc:84 left:28 a6 02 88 7d : mflr 12 pc:88 left:24 14 00 8c e9 : ld 12, 20(12) pc:92 left:20 a6 03 89 7d : mtctr 12 pc:96 left:16 a6 03 68 7d : mtlr 11 pc:100 left:12 20 04 80 4e : bctr pc:104 left:8 c0 34 1d 00 : Failure log: Can't disasm instruction at offset 104: c0 34 1d 00 00 00 00 c0 Disassembly logic can truncate at 104, ignoring last 8 bytes. Update the dummy_tramp_addr field offset calculation from the end of the program to reflect its new location, for bpf_arch_text_poke() to update the actual trampoline's address in this field. All BPF trampoline selftests continue to pass with this patch applied. 
Signed-off-by: Abhishek Dubey --- arch/powerpc/net/bpf_jit_comp.c | 35 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3cbb3647f7a0..00abad48fa6c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -93,27 +93,36 @@ void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context /* * Long branch stub: - * .long // 8-byte aligned * mflr r11 * bcl 20,31,$+4 - * mflr r12 - * ld r12, -8-SZL(r12) + * mflr r12 // lr/r12 stores current pc + * ld r12, 20(r12) // offset(dummy_tramp_addr) from prev inst. is 20 * mtctr r12 - * mtlr r11 // needed to retain ftrace ABI + * mtlr r11 // needed to retain ftrace ABI * bctr + * nop // for alignment of following address field + * .long // 8-byte aligned */ - if (image) - *((unsigned long *)&image[ctx->idx]) = (unsigned long)dummy_tramp; - - ctx->idx += SZL / 4; long_branch_stub_idx = ctx->idx; EMIT(PPC_RAW_MFLR(_R11)); EMIT(PPC_RAW_BCL4()); EMIT(PPC_RAW_MFLR(_R12)); - EMIT(PPC_RAW_LL(_R12, _R12, -8-SZL)); + EMIT(PPC_RAW_LL(_R12, _R12, 20)); EMIT(PPC_RAW_MTCTR(_R12)); EMIT(PPC_RAW_MTLR(_R11)); EMIT(PPC_RAW_BCTR()); + /* + * The start of Long branch stub is guaranteed to be aligned as + * result of optional NOP injection before OOL stub above. + * Append tail NOP to re-gain 8-byte alignment disturbed by odd + * instruction count in Long branch stub. + */ + EMIT(PPC_RAW_NOP()); + + if (image) + *((unsigned long *)&image[ctx->idx]) = (unsigned long)dummy_tramp; + + ctx->idx += SZL / 4; if (!bpf_jit_ool_stub) { bpf_jit_ool_stub = (ctx->idx - ool_stub_idx) * 4; @@ -1309,6 +1318,7 @@ static void do_isync(void *info __maybe_unused) * bpf_func: * [nop|b] ool_stub * 2. Out-of-line stub: + * nop // optional nop for alignment * ool_stub: * mflr r0 * [b|bl] / @@ -1316,7 +1326,6 @@ static void do_isync(void *info __maybe_unused) * b bpf_func + 4 * 3. 
Long branch stub: * long_branch_stub: - * .long / * mflr r11 * bcl 20,31,$+4 * mflr r12 @@ -1324,6 +1333,8 @@ static void do_isync(void *info __maybe_unused) * mtctr r12 * mtlr r11 // needed to retain ftrace ABI * bctr + * nop // nop for mem alignment of dummy_tramp_addr + * .long / * * dummy_tramp is used to reduce synchronization requirements. * @@ -1425,10 +1436,12 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, * 1. Update the address in the long branch stub: * If new_addr is out of range, we will have to use the long branch stub, so patch new_addr * here. Otherwise, revert to dummy_tramp, but only if we had patched old_addr here. + * + * dummy_tramp_addr moved to bottom of long branch stub. */ if ((new_addr && !is_offset_in_branch_range(new_addr - ip)) || (old_addr && !is_offset_in_branch_range(old_addr - ip))) - ret = patch_ulong((void *)(bpf_func_end - bpf_jit_long_branch_stub - SZL), + ret = patch_ulong((void *)(bpf_func_end - SZL), /* SZL: dummy_tramp_addr offset */ (new_addr && !is_offset_in_branch_range(new_addr - ip)) ? (unsigned long)new_addr : (unsigned long)dummy_tramp); if (ret) -- 2.52.0 From: Abhishek Dubey Ensure that the trampoline stubs JITed at the tail of the epilogue do not expose the dummy trampoline address stored in the last 8 bytes (for both 64-bit and 32-bit PowerPC) to the disassembly flow. Prevent the disassembler from ingesting this memory address, as it may occasionally decode into a seemingly valid but incorrect instruction. Fix this issue by truncating the last 8 bytes from JITed buffers before supplying them for disassembly. 
Signed-off-by: Abhishek Dubey --- tools/testing/selftests/bpf/jit_disasm_helpers.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 364c557c5115..4c6bcbe08491 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -170,9 +170,11 @@ int get_jited_program_text(int fd, char *text, size_t text_sz) struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); __u32 jited_funcs, len, pc; + __u32 trunc_len = 0; __u32 *func_lens = NULL; FILE *text_out = NULL; uint8_t *image = NULL; + char *triple = NULL; int i, err = 0; if (!llvm_initialized) { @@ -216,9 +218,18 @@ int get_jited_program_text(int fd, char *text, size_t text_sz) if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #2")) goto out; + /* + * last 8 bytes contain the dummy_trampoline address in JIT + * output for 64-bit and 32-bit powerpc, which can't + * be disassembled into a valid instruction. + */ + triple = LLVMGetDefaultTargetTriple(); + if (strstr(triple, "powerpc")) + trunc_len = 8; + for (pc = 0, i = 0; i < jited_funcs; ++i) { fprintf(text_out, "func #%d:\n", i); - disasm_one_func(text_out, image + pc, func_lens[i] - trunc_len); fprintf(text_out, "\n"); pc += func_lens[i]; } -- 2.52.0 From: Abhishek Dubey This patch enables arch specifier "__powerpc64" in verifier selftest for ppc64. Power 32-bit would require separate handling. Changes tested for ppc64 only.
Signed-off-by: Abhishek Dubey --- tools/testing/selftests/bpf/progs/bpf_misc.h | 1 + tools/testing/selftests/bpf/test_loader.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index c9bfbe1bafc1..dee284c3ddba 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -155,6 +155,7 @@ #define __arch_arm64 __arch("ARM64") #define __arch_riscv64 __arch("RISCV64") #define __arch_s390x __arch("s390x") +#define __arch_powerpc64 __arch("POWERPC64") #define __caps_unpriv(caps) __attribute__((btf_decl_tag("comment:test_caps_unpriv=" EXPAND_QUOTE(caps)))) #define __load_if_JITed() __attribute__((btf_decl_tag("comment:load_mode=jited"))) #define __load_if_no_JITed() __attribute__((btf_decl_tag("comment:load_mode=no_jited"))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 338c035c3688..fc8b95316379 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -378,6 +378,7 @@ enum arch { ARCH_ARM64 = 0x4, ARCH_RISCV64 = 0x8, ARCH_S390X = 0x10, + ARCH_POWERPC64 = 0x20, }; static int get_current_arch(void) @@ -390,6 +391,8 @@ static int get_current_arch(void) return ARCH_RISCV64; #elif defined(__s390x__) return ARCH_S390X; +#elif defined(__powerpc64__) + return ARCH_POWERPC64; #endif return ARCH_UNKNOWN; } @@ -587,6 +590,8 @@ static int parse_test_spec(struct test_loader *tester, arch = ARCH_RISCV64; } else if (strcmp(val, "s390x") == 0) { arch = ARCH_S390X; + } else if (strcmp(val, "POWERPC64") == 0) { + arch = ARCH_POWERPC64; } else { PRINT_FAIL("bad arch spec: '%s'\n", val); err = -EINVAL; -- 2.52.0 From: Abhishek Dubey Verifier testcase result for tailcalls: # ./test_progs -t verifier_tailcall #618/1 verifier_tailcall/invalid map type for tail call:OK #618/2 verifier_tailcall/invalid map type for tail call @unpriv:OK #618 
verifier_tailcall:OK #619/1 verifier_tailcall_jit/main:OK #619 verifier_tailcall_jit:OK Summary: 2/3 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Abhishek Dubey --- .../bpf/progs/verifier_tailcall_jit.c | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 8d60c634a114..17475ecb3207 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -90,6 +90,75 @@ __jited(" popq %rax") __jited(" jmp {{.*}}") /* jump to tail call tgt */ __jited("L0: leave") __jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ +__arch_powerpc64 +/* program entry for main(), regular function prologue */ +__jited(" nop") +__jited(" ld 2, 16(13)") +__jited(" li 9, 0") +__jited(" std 9, -8(1)") +__jited(" mflr 0") +__jited(" std 0, 16(1)") +__jited(" stdu 1, {{.*}}(1)") +/* load address and call sub() via count register */ +__jited(" lis 12, {{.*}}") +__jited(" sldi 12, 12, 32") +__jited(" oris 12, 12, {{.*}}") +__jited(" ori 12, 12, {{.*}}") +__jited(" mtctr 12") +__jited(" bctrl") +__jited(" mr 8, 3") +__jited(" li 8, 0") +__jited(" addi 1, 1, {{.*}}") +__jited(" ld 0, 16(1)") +__jited(" mtlr 0") +__jited(" mr 3, 8") +__jited(" blr") +__jited("...") +__jited("func #1") +/* subprogram entry for sub() */ +__jited(" nop") +__jited(" ld 2, 16(13)") +/* tail call prologue for subprogram */ +__jited(" ld 10, 0(1)") +__jited(" ld 9, -8(10)") +__jited(" cmplwi 9, 33") +__jited(" bt {{.*}}, {{.*}}") +__jited(" addi 9, 10, -8") +__jited(" std 9, -8(1)") +__jited(" lis {{.*}}, {{.*}}") +__jited(" sldi {{.*}}, {{.*}}, 32") +__jited(" oris {{.*}}, {{.*}}, {{.*}}") +__jited(" ori {{.*}}, {{.*}}, {{.*}}") +__jited(" li {{.*}}, 0") +__jited(" lwz 9, {{.*}}({{.*}})") +__jited(" slwi {{.*}}, {{.*}}, 0") +__jited(" cmplw {{.*}}, 9") +__jited(" bf 0, {{.*}}") +/* bpf_tail_call 
implementation */ +__jited(" ld 9, -8(1)") +__jited(" cmplwi 9, 33") +__jited(" bf {{.*}}, {{.*}}") +__jited(" ld 9, 0(9)") +__jited(" cmplwi 9, 33") +__jited(" bt {{.*}}, {{.*}}") +__jited(" addi 9, 9, 1") +__jited(" mulli 10, {{.*}}, 8") +__jited(" add 10, 10, {{.*}}") +__jited(" ld 10, {{.*}}(10)") +__jited(" cmpldi 10, 0") +__jited(" bt {{.*}}, {{.*}}") +__jited(" ld 10, {{.*}}(10)") +__jited(" addi 10, 10, 16") +__jited(" mtctr 10") +__jited(" ld 10, -8(1)") +__jited(" cmplwi 10, 33") +__jited(" bt {{.*}}, {{.*}}") +__jited(" addi 10, 1, -8") +__jited(" std 9, 0(10)") +__jited(" bctr") +__jited(" mr 3, 8") +__jited(" blr") + SEC("tc") __naked int main(void) { -- 2.52.0