Don't skip active reference count initialization for initial namespaces. Doing this will break network namespace active reference counting. Fixes: 3a18f809184b ("ns: add active reference count") Signed-off-by: Christian Brauner --- kernel/nscommon.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 6fe1c747fa46..d67ae7ad7759 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -54,7 +54,7 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { - int ret; + int ret = 0; refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; @@ -74,11 +74,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - ret = proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); if (ret) return ret; /* -- 2.47.3 There's no need to bump the active reference counts of initial namespaces as they're always active and can simply remain at 1. Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 23 ++++++++++++++++++++--- kernel/nscommon.c | 6 ++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bd4492ef6ffc..791b18dc77d0 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -285,14 +291,19 @@ void __ns_ref_active_get_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_get(struct ns_common *ns) { - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); + /* Initial namespaces are always active. */ + if (!is_ns_init_id(ns)) + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); } #define ns_ref_active_get(__ns) \ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return true; + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); return true; @@ -307,6 +318,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + if (atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(is_initial_namespace(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -319,8 +334,10 @@ static __always_inline void __ns_ref_active_put(struct ns_common *ns) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); - if (!__ns_ref_active_read(ns)) + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; + } if (!__ns_ref_get(ns)) return NULL; return ns; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index d67ae7ad7759..70cb66232e4c 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -177,6 +177,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) ns = ns_owner(ns); if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (!atomic_dec_and_test(&ns->__ns_ref_active)) return; } @@ -276,6 +277,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns) */ void __ns_ref_active_resurrect(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + /* If we didn't resurrect the namespace we're done. */ if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; @@ -289,6 +294,7 @@ void __ns_ref_active_resurrect(struct ns_common *ns) if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; } -- 2.47.3 The mount namespace may in fact sleep when putting the last passive reference so we need to drop the namespace reference outside of the rcu read lock. Do this by delaying the put until the next iteration where we've already moved on to the next namespace and legitimized it. Once we drop the rcu read lock to call put_user() we will also drop the reference to the previous namespace in the tree. Fixes: 76b6f5dfb3fd ("nstree: add listns()") Signed-off-by: Christian Brauner --- kernel/nstree.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/kernel/nstree.c b/kernel/nstree.c index 4a8838683b6b..55b72d4f8de4 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -505,13 +505,13 @@ static inline bool __must_check may_list_ns(const struct klistns *kls, return false; } -static void __ns_put(struct ns_common *ns) +static inline void ns_put(struct ns_common *ns) { - if (ns->ops) + if (ns && ns->ops) ns->ops->put(ns); } -DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, struct ns_common *candidate) @@ -535,7 +535,7 @@ static ssize_t do_listns_userns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns = NULL, *first_ns = NULL; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; const struct list_head *head; ssize_t ret; @@ -568,9 +568,10 @@ static ssize_t do_listns_userns(struct klistns *kls) if (!first_ns) first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); + for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -578,8 +579,14 @@ static ssize_t do_listns_userns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); return -EINVAL; + } + nr_ns_ids--; ret++; @@ -587,6 +594,7 @@ static ssize_t do_listns_userns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } @@ -668,7 +676,7 @@ static ssize_t do_listns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns, *first_ns = NULL; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; struct ns_tree *ns_tree = NULL; const struct list_head *head; u32 ns_type; @@ -705,7 +713,7 @@ static ssize_t do_listns(struct klistns *kls) for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; ns = next_ns_common(ns, ns_tree)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -713,8 +721,13 @@ static ssize_t do_listns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); return -EINVAL; + } nr_ns_ids--; ret++; @@ -723,6 +736,7 @@ static ssize_t do_listns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } -- 2.47.3 Don't return EINVAL, return EFAULT just like we do in other system calls. Signed-off-by: Christian Brauner --- kernel/nstree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/nstree.c b/kernel/nstree.c index 55b72d4f8de4..f27f772a6762 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -584,7 +584,7 @@ static ssize_t do_listns_userns(struct klistns *kls) if (put_user(valid->ns_id, ns_ids + ret)) { ns_put(prev); - return -EINVAL; + return -EFAULT; } nr_ns_ids--; @@ -726,7 +726,7 @@ static ssize_t do_listns(struct klistns *kls) if (put_user(valid->ns_id, ns_ids + ret)) { ns_put(prev); - return -EINVAL; + return -EFAULT; } nr_ns_ids--; -- 2.47.3 The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // it's active reference count now goes // to zero... -------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. // Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded to grab passive references to the set of namespaces the setns() should simply succeed even if the target task exists or gets reaped in the meantime and thus has dropped all active references to its namespaces. The race is rare and can only be triggered when using pidfs to setns() to namespaces. Also note that active reference on initial namespaces are nops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active. Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com Fixes: 3c9820d5c64a ("ns: add active reference count") Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- include/linux/ns_common.h | 47 ++++------------------------------------------- kernel/nscommon.c | 21 ++++++++++++--------- kernel/nstree.c | 8 -------- 4 files changed, 17 insertions(+), 61 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index ba6c8975c82e..a80f8d2a4122 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data) * ioctl on such a socket will resurrect the relevant namespace * subtree. */ - __ns_ref_active_resurrect(ns); + __ns_ref_active_get(ns); return 0; } diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 791b18dc77d0..3aaba2ca31d7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -287,47 +287,8 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -void __ns_ref_active_get_owner(struct ns_common *ns); +void __ns_ref_active_put(struct ns_common *ns); -static __always_inline void __ns_ref_active_get(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (!is_ns_init_id(ns)) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} -#define ns_ref_active_get(__ns) \ - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) - -static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (is_ns_init_id(ns)) - return true; - - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - return true; - } - return false; -} - -#define ns_ref_active_get_owner(__ns) \ - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) - -void __ns_ref_active_put_owner(struct ns_common *ns); - -static __always_inline void __ns_ref_active_put(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (is_ns_init_id(ns)) - return; - - if (atomic_dec_and_test(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - __ns_ref_active_put_owner(ns); - } -} #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) @@ -343,9 +304,9 @@ static __always_inline struct ns_common *__must_check ns_get_unless_inactive(str return ns; } -void __ns_ref_active_resurrect(struct ns_common *ns); +void __ns_ref_active_get(struct ns_common *ns); -#define ns_ref_active_resurrect(__ns) \ - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 70cb66232e4c..bfd2d6805776 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -114,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns) return to_ns_common(owner); } -void __ns_ref_active_get_owner(struct ns_common *ns) -{ - ns = ns_owner(ns); - if (ns) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} - /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. @@ -171,8 +164,18 @@ void __ns_ref_active_get_owner(struct ns_common *ns) * The iteration stops once we reach a namespace that still has active * references. */ -void __ns_ref_active_put_owner(struct ns_common *ns) +void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + for (;;) { ns = ns_owner(ns); if (!ns) @@ -275,7 +278,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) * it also needs to take another reference on its owning user namespace * and so on. */ -void __ns_ref_active_resurrect(struct ns_common *ns) +void __ns_ref_active_get(struct ns_common *ns) { /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) diff --git a/kernel/nstree.c b/kernel/nstree.c index f27f772a6762..97404fb90749 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); - - /* - * Take an active reference on the owner namespace. This ensures - * that the owner remains visible while any of its child namespaces - * are active. For init namespaces this is a no-op as ns_owner() - * returns NULL for namespaces owned by init_user_ns. - */ - __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) -- 2.47.3 Add a few more assert to detect active reference count underflows. Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 1 - kernel/nscommon.c | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3aaba2ca31d7..66ea09b48377 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -294,7 +294,6 @@ void __ns_ref_active_put(struct ns_common *ns); static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); if (!__ns_ref_active_read(ns)) { VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bfd2d6805776..c910b979e433 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -170,8 +170,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (is_ns_init_id(ns)) return; - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } VFS_WARN_ON_ONCE(is_ns_init_id(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -181,8 +183,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } } } @@ -280,12 +284,16 @@ void __ns_ref_active_put(struct ns_common *ns) */ void __ns_ref_active_get(struct ns_common *ns) { + int prev; + /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; /* If we didn't resurrect the namespace we're done. */ - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; /* @@ -298,7 +306,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; } } -- 2.47.3 Add a regression test for setns() with pidfd. Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 4 +- .../namespaces/regression_pidfd_setns_test.c | 113 +++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index f4d2209ca4e4..4cb428d77659 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -8,3 +8,4 @@ siocgskns_test cred_change_test stress_test listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 01569e0abbdb..1f36c7bf7728 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -11,7 +11,8 @@ TEST_GEN_PROGS := nsid_test \ siocgskns_test \ cred_change_test \ stress_test \ - listns_pagination_bug + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk @@ -22,4 +23,5 @@ $(OUTPUT)/siocgskns_test: ../filesystems/utils.c $(OUTPUT)/cred_change_test: ../filesystems/utils.c $(OUTPUT)/stress_test: ../filesystems/utils.c $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. + */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. + */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN -- 2.47.3 Ensure that put_user() can fail and that namespace cleanup works correctly. Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 2 + .../selftests/namespaces/listns_efault_test.c | 521 +++++++++++++++++++++ 3 files changed, 524 insertions(+) diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index 4cb428d77659..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -4,6 +4,7 @@ init_ino_test ns_active_ref_test listns_test listns_permissions_test +listns_efault_test siocgskns_test cred_change_test stress_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 1f36c7bf7728..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -8,6 +8,7 @@ TEST_GEN_PROGS := nsid_test \ ns_active_ref_test \ listns_test \ listns_permissions_test \ + listns_efault_test \ siocgskns_test \ cred_change_test \ stress_test \ @@ -19,6 +20,7 @@ include ../lib.mk $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c $(OUTPUT)/listns_test: ../filesystems/utils.c $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c $(OUTPUT)/siocgskns_test: ../filesystems/utils.c $(OUTPUT)/cred_change_test: ../filesystems/utils.c $(OUTPUT)/stress_test: ../filesystems/utils.c diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..906d8df90ab2 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() returns EINVAL. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. + */ +TEST(listns_partial_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[5]; + int sv[5][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* + * Map two pages: + * - First page: readable and writable + * - Second page: will be unmapped to trigger EFAULT + */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position the buffer pointer so there's room for exactly one u64 + * before the page boundary. The second u64 would fall into the + * unmapped page. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; + + /* + * Create a separate process to run listns() in a loop concurrently + * with namespace creation and destruction. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * The kernel should: + * 1. Successfully write the first namespace ID (within valid page) + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) + * 3. Handle concurrent namespace destruction without deadlock + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 2, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create several child processes, each in its own mount namespace. + * These will be destroyed while the iterator is running listns(). + */ + for (i = 0; i < 5; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + /* Child: create a couple of tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 5; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* + * Signal children to exit. This will destroy their mount namespaces + * while listns() is iterating the namespace tree. + * This tests that cleanup happens outside the RCU read lock. + */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all mount namespace children to exit and cleanup */ + for (i = 0; i < 5; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test listns() error handling when the entire buffer is invalid. + * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN -- 2.47.3