From: Cong Wang

Stacked seccomp filters are composed by seccomp_run_filters() taking the
min over a single evaluation of one seccomp_data: every filter votes on
the same registers, and the most restrictive verdict wins. That model
assumes the syscall the filters voted on is the syscall that runs.

SECCOMP_IOCTL_NOTIF_SEND_REDIRECT breaks the assumption. A USER_NOTIF
verdict only wins the min when every other (here: outer, earlier-applied)
filter returned something at least as permissive on the *original*
registers. The supervisor then rewrites the argument registers and the
syscall resumes via FLAG_CONTINUE without the stack being re-consulted.
So an inner, container-installed filter's notifier can redirect a syscall
whose substituted form an outer filter would have blocked.

Close the hole by resuming evaluation after a redirect instead of
re-running the min. A redirect turns one syscall into a different one, so
the outer filters cannot simply re-vote on a single seccomp_data; each
must judge the substituted syscall in turn, exactly as if the target had
issued it directly. Starting at the filter outer to the one that notified
(match->prev), seccomp_run_filters_seq() walks strictly toward the root,
evaluating one filter at a time and stopping at the first that does not
allow the call; ALLOW and LOG (which only logs) are non-blocking and
continue to the next filter. Note that the composition therefore differs
from the first pass: the continuation returns the first blocking verdict
it meets, not the min over the remaining outer filters. A BPF filter can
only read registers, so re-evaluating on the substituted registers is
complete. An outer filter may legitimately ERRNO/KILL/TRAP/TRACE or run
its own USER_NOTIF on the redirected syscall; if an outer notifier
redirects again, evaluation continues from its own ->prev.

Walking strictly toward the root is monotonic: the notifying filter and
everything inner to it are never reconsulted, so a notifier cannot loop by
re-notifying on its own redirect, and the walk is iterative (goto, not
recursion) so a deep stack of redirecting filters cannot exhaust the
kernel stack.

Only a redirect triggers the continuation. The first pass is the unchanged min over the full stack, and the allow-cache is consulted only there, so nothing changes for existing, non-redirect users. Assisted-by: Claude:claude-opus-4.8 Signed-off-by: Cong Wang --- kernel/seccomp.c | 79 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7d499e422ba1..84812ce9bdb3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -94,6 +94,13 @@ struct seccomp_knotif { long val; u32 flags; + /* + * Set by SEND_REDIRECT: the reply rewrote the syscall's registers, + * so on resume the syscall must be re-evaluated against the filters + * outer to the one that notified (see __seccomp_filter()). + */ + bool redirect; + /* * Signals when this has changed states, such as the listener * dying, a new seccomp addfd message, or changing to REPLIED @@ -1181,10 +1188,12 @@ static bool should_sleep_killable(struct seccomp_filter *match, static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, - const struct seccomp_data *sd) + const struct seccomp_data *sd, + bool *redirected) { int err; u32 flags = 0; + bool redirect = false; long ret = 0; struct seccomp_knotif n = {}; struct seccomp_kaddfd *addfd, *tmp; @@ -1241,6 +1250,7 @@ static int seccomp_do_user_notification(int this_syscall, ret = n.val; err = n.error; flags = n.flags; + redirect = n.redirect; interrupted: /* If there were any pending addfd calls, clear them out */ @@ -1267,14 +1277,38 @@ static int seccomp_do_user_notification(int this_syscall, mutex_unlock(&match->notify_lock); /* Userspace requests to continue the syscall. 
*/ - if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) + if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) { + *redirected = redirect; return 0; + } syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } +static u32 seccomp_run_filters_seq(const struct seccomp_data *sd, + struct seccomp_filter **match, + struct seccomp_filter *f, + int this_syscall) +{ + for (; f; f = f->prev) { + u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); + u32 action = cur_ret & SECCOMP_RET_ACTION_FULL; + + if (action == SECCOMP_RET_ALLOW) + continue; + /* LOG does not block the syscall; record it and continue. */ + if (action == SECCOMP_RET_LOG) { + seccomp_log(this_syscall, 0, action, true); + continue; + } + *match = f; + return cur_ret; + } + return SECCOMP_RET_ALLOW; +} + static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; @@ -1291,6 +1325,8 @@ static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) populate_seccomp_data(&sd); filter_ret = seccomp_run_filters(&sd, &match); + +eval: data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1353,11 +1389,40 @@ static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) return 0; - case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, &sd)) + case SECCOMP_RET_USER_NOTIF: { + struct seccomp_filter *outer; + bool redirected = false; + + if (seccomp_do_user_notification(this_syscall, match, &sd, + &redirected)) goto skip; + if (redirected && match->prev) { + /* + * The notifier rewrote the registers. Resume + * evaluation at the next outer filter on the + * substituted syscall, sequentially toward the root: + * each outer filter judges the new syscall exactly as + * if the target had issued it. Walking outward is + * monotonic, so a notifier cannot re-notify on its own + * redirect. 
+ */ + this_syscall = syscall_get_nr(current, + current_pt_regs()); + if (this_syscall < 0) + return 0; + outer = match->prev; + match = NULL; + populate_seccomp_data(&sd); + filter_ret = seccomp_run_filters_seq(&sd, &match, outer, + this_syscall); + if (!match) + return 0; + goto eval; + } + return 0; + } case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); @@ -2154,13 +2219,15 @@ static long seccomp_notify_send_redirect(struct seccomp_filter *filter, } /* - * Mark REPLIED with FLAG_CONTINUE so the wait-loop exit path - * runs the syscall normally. + * Mark REPLIED with FLAG_CONTINUE so the wait-loop exit path runs the + * syscall normally. Flag the redirect so the resume path re-validates + * the rewritten syscall against the filters outer to this one. */ knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = 0; knotif->val = 0; knotif->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + knotif->redirect = true; if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) complete_on_current_cpu(&knotif->ready); else -- 2.43.0