Freezing a task's cgroup from BPF is better than doing it from user space,
which can be too late and is subject to races. To achieve this, allow
writing to cgroup core interfaces from BPF by adding a new kfunc helper
that takes a kernfs node directly. Currently only writing to
"cgroup.freeze" on the default hierarchy is allowed. The write goes
directly through a kernfs_node, which allows sharing the same path as if
the kernfs_node had been opened from user space.

Signed-off-by: Djalal Harouni
---
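For reference, the user-space path that this kernel-side write shares is an
ordinary write(2) to a cgroup's cgroup.freeze file on the unified hierarchy.
A minimal sketch, assuming cgroup v2 is mounted at /sys/fs/cgroup and using
an illustrative cgroup name:

  /* Freeze or unthaw a cgroup v2 group by writing "1" or "0" to its
   * cgroup.freeze file. Mount point and group name are illustrative.
   */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  static int freeze_cgroup(const char *cgrp, int freeze)
  {
  	char path[256];
  	int fd, ret;

  	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/cgroup.freeze", cgrp);

  	fd = open(path, O_WRONLY);
  	if (fd < 0)
  		return -1;

  	/* cgroup.freeze accepts "0" or "1", optionally followed by a newline */
  	ret = write(fd, freeze ? "1" : "0", 1);
  	close(fd);

  	return ret == 1 ? 0 : -1;
  }

  int main(void)
  {
  	return freeze_cgroup("my-test-group", 1);
  }

The BPF path added below reaches the same cgroup freezer logic without a
round trip through user space, which is the point of this patch.
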
 include/linux/cgroup.h |   3 ++
 kernel/cgroup/cgroup.c | 102 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b18fb5fcb38e..03a0782c94bf 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -125,6 +125,9 @@ int cgroup_rm_cftypes(struct cftype *cfts);
 void cgroup_file_notify(struct cgroup_file *cfile);
 void cgroup_file_show(struct cgroup_file *cfile, bool show);
 
+ssize_t cgroup_kn_interface_write(struct kernfs_node *kn, const char *name__str,
+				  const char *buf, size_t nbytes, loff_t off);
+
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..cddd7c1d354d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -229,6 +229,24 @@ static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 static struct cftype cgroup_psi_files[];
 
+struct cgroup_kn_cftype {
+	char name[MAX_CFTYPE_NAME];
+	unsigned int namelen;
+
+	/*
+	 * write() is the write operation on a kernfs node.
+	 */
+	ssize_t (*write)(struct kernfs_node *kn, const char *buf, size_t nbytes,
+			 loff_t off, bool revalidate);
+};
+
+#define CGROUP_PREFIX "cgroup."
+#define CGROUP_CORE_INTERFACE_FREEZE_SUFFIX "freeze"
+#define CGROUP_CORE_INTERFACE_FREEZE (CGROUP_PREFIX CGROUP_CORE_INTERFACE_FREEZE_SUFFIX)
+#define CGROUP_CORE_INTERFACE_FREEZE_LEN (sizeof(CGROUP_CORE_INTERFACE_FREEZE) - 1)
+
+static struct cgroup_kn_cftype kn_cfts[];
+
 /* cgroup optional features */
 enum cgroup_opt_features {
 #ifdef CONFIG_PSI
@@ -4030,29 +4048,58 @@ static int cgroup_freeze_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
-				   char *buf, size_t nbytes, loff_t off)
+static bool cgroup_kn_revalidate(struct cgroup *cgrp)
+{
+	if (!cgroup_on_dfl(cgrp) || !cgroup_parent(cgrp))
+		return false;
+
+	return true;
+}
+
+static ssize_t cgroup_kn_freeze(struct kernfs_node *kn,
+				const char *buf, size_t nbytes, loff_t off,
+				bool revalidate)
 {
 	struct cgroup *cgrp;
 	ssize_t ret;
 	int freeze;
+	char b[4] = {0};
+
+	/* Handle userspace writes +(0|1)\n and fail otherwise */
+	ret = strscpy(b, buf, sizeof(b));
+	if (ret < 0)
+		return ret;
 
-	ret = kstrtoint(strstrip(buf), 0, &freeze);
+	nbytes = ret;
+	ret = kstrtoint(strstrip(b), 0, &freeze);
 	if (ret)
 		return ret;
 
 	if (freeze < 0 || freeze > 1)
 		return -ERANGE;
 
-	cgrp = cgroup_kn_lock_live(of->kn, false);
+	cgrp = cgroup_kn_lock_live(kn, false);
 	if (!cgrp)
 		return -ENOENT;
 
+	if (revalidate && !cgroup_kn_revalidate(cgrp)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	cgroup_freeze(cgrp, freeze);
 
-	cgroup_kn_unlock(of->kn);
+	ret = nbytes;
 
-	return nbytes;
+out:
+	cgroup_kn_unlock(kn);
+	return ret;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	return cgroup_kn_freeze(of->kn, buf, nbytes, off, false);
 }
 
 static void __cgroup_kill(struct cgroup *cgrp)
@@ -4601,6 +4648,49 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
 	kernfs_put(kn);
 }
 
+static struct cgroup_kn_cftype kn_cfts[] = {
+	{
+		.name = CGROUP_CORE_INTERFACE_FREEZE,
+		.namelen = CGROUP_CORE_INTERFACE_FREEZE_LEN,
+		.write = cgroup_kn_freeze,
+	},
+	{ },
+};
+
+static const struct cgroup_kn_cftype *cgroup_kn_cft(const char *name__str)
+{
+	struct cgroup_kn_cftype *kn_cft;
+
+	for (kn_cft = kn_cfts; kn_cft && kn_cft->name[0] != '\0'; kn_cft++) {
+		if (!strncmp(name__str, kn_cft->name, kn_cft->namelen))
+			return kn_cft;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+ssize_t cgroup_kn_interface_write(struct kernfs_node *kn, const char *name__str,
+				  const char *buf, size_t nbytes, loff_t off)
+{
+	const struct cgroup_kn_cftype *kn_cft;
+
+	/* empty, do not remove */
+	if (!nbytes)
+		return 0;
+
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return -ENOTDIR;
+
+	kn_cft = cgroup_kn_cft(name__str);
+	if (IS_ERR(kn_cft))
+		return PTR_ERR(kn_cft);
+
+	if (unlikely(!kn_cft->write))
+		return -EOPNOTSUPP;
+
+	return kn_cft->write(kn, buf, nbytes, off, true);
+}
+
 /**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
-- 
2.43.0


Add a bpf_cgroup_write_interface() kfunc that writes to a cgroup interface
file. It takes a cgroup on the default hierarchy as an argument and writes
to the specified interface file of that cgroup.

Freezing a task's cgroup from BPF is better than doing it from user space,
which can be too late and is subject to races. Hence, add support for
writing to the "cgroup.freeze" interface through this kfunc.

Planned users of this feature are systemd and BPF tools. Taking the
freezing example, we could freeze a cgroup hierarchy on suspicious activity
for a more thorough analysis. The cgroup hierarchies could be system
services, user sessions, K8s pods, or containers.

Signed-off-by: Djalal Harouni
---
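To illustrate the intended call pattern, here is a minimal sketch of a
sleepable BPF program that freezes a cgroup through the new kfunc. It is
modeled on the selftest added later in this series; the lsm.s/bpf hook and
the target_cgid global set by the loader are assumptions of the sketch, not
part of the patch:

  // SPDX-License-Identifier: GPL-2.0
  /* Sketch: freeze a cgroup (identified by a cgroup id supplied by the
   * loader) from a sleepable LSM hook using bpf_cgroup_write_interface().
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
  void bpf_cgroup_release(struct cgroup *p) __ksym;
  extern int bpf_cgroup_write_interface(struct cgroup *cgrp,
  					const char *name__str,
  					const struct bpf_dynptr *value_p,
  					loff_t off) __ksym __weak;

  const volatile __u64 target_cgid;	/* set by the loader */
  char freeze_val[] = "1";

  SEC("lsm.s/bpf")
  int BPF_PROG(freeze_on_bpf, int cmd, union bpf_attr *attr, unsigned int size)
  {
  	struct bpf_dynptr val;
  	struct cgroup *cgrp;

  	cgrp = bpf_cgroup_from_id(target_cgid);
  	if (!cgrp)
  		return 0;

  	/* Pass the value as a dynptr so the kfunc knows the buffer length */
  	bpf_dynptr_from_mem(freeze_val, sizeof(freeze_val), 0, &val);
  	bpf_cgroup_write_interface(cgrp, "cgroup.freeze", &val, 0);

  	bpf_cgroup_release(cgrp);
  	return 0;
  }

  char _license[] SEC("license") = "GPL";

On success the kfunc returns the number of bytes consumed, mirroring a
write(2) on the interface file; since it is flagged KF_SLEEPABLE it must be
called from a sleepable program, as above.
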
 kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..5efc1bc57db9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2605,6 +2605,50 @@ bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
 		return NULL;
 	return cgrp;
 }
+
+#define BPF_CGROUP_MAX_WRITE ((1UL << 24) - 1)
+
+/**
+ * bpf_cgroup_write_interface - Writes to a cgroup interface file.
+ * @cgrp: The target cgroup
+ * @name__str: name of the cgroup core interface file
+ * @value_p: value to write
+ * @off: offset
+ *
+ * Return: number of bytes written on success, a negative value on error.
+ */
+__bpf_kfunc int
+bpf_cgroup_write_interface(struct cgroup *cgrp, const char *name__str,
+			   const struct bpf_dynptr *value_p, loff_t off)
+{
+	struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+	struct kernfs_node *kn;
+	const void *value;
+	u32 value_len;
+	int ret;
+
+	value_len = __bpf_dynptr_size(value_ptr);
+	if (!value_len)
+		return 0;
+
+	if (value_len > BPF_CGROUP_MAX_WRITE)
+		return -E2BIG;
+
+	value = __bpf_dynptr_data(value_ptr, value_len);
+	if (!value)
+		return -EINVAL;
+
+	rcu_read_lock();
+	kn = cgrp->kn;
+	rcu_read_unlock();
+
+	kernfs_get(kn);
+	ret = cgroup_kn_interface_write(kn, name__str, value, value_len, off);
+	kernfs_put(kn);
+
+	return ret;
+}
+
 #endif /* CONFIG_CGROUPS */
 
 /**
@@ -3736,6 +3780,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_write_interface, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
-- 
2.43.0


Add a selftest for the bpf_cgroup_write_interface() kfunc. The test forks a
child, then:

1. Child:
   - Migrates to a new cgroup.
   - Loads the BPF programs.
   - Triggers the 'lsm_freeze_cgroup' BPF program so it freezes itself.
     <- waits for the parent to unthaw it
   - Once unthawed it continues, forks another process and triggers the
     'tp_newchild' BPF program, which records the pid of the new process;
     this asserts that user space resumed correctly.

2. Parent:
   - Keeps reading the 'cgroup.freeze' file of the child cgroup until it
     reads 1, which means the child cgroup is frozen.
   - Attaches the sample 'lsm_task_free' program, which triggers the BPF
     program that unthaws the child task's cgroup.
   - Waits for a clean exit of the child process.

The scenario exercises both sides: freezing and unthawing a cgroup.

Signed-off-by: Djalal Harouni
---
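The parent below polls the child's cgroup.freeze file until it reads 1.
When reproducing this by hand, it can also be useful to check the effective
state that cgroup v2 reports in the cgroup.events file ("frozen 0|1"); a
small sketch, with an illustrative path:

  /* Report whether a cgroup v2 group is frozen according to cgroup.events.
   * Returns 1 if "frozen 1" is present, 0 if not, -1 on error.
   */
  #include <stdio.h>
  #include <string.h>

  static int cgroup_is_frozen(const char *events_path)
  {
  	char key[64];
  	int val, ret = 0;
  	FILE *f = fopen(events_path, "r");

  	if (!f)
  		return -1;

  	while (fscanf(f, "%63s %d", key, &val) == 2) {
  		if (!strcmp(key, "frozen")) {
  			ret = val;
  			break;
  		}
  	}

  	fclose(f);
  	return ret;
  }

  int main(void)
  {
  	/* Path is illustrative; adjust to the cgroup under test */
  	printf("frozen: %d\n",
  	       cgroup_is_frozen("/sys/fs/cgroup/test-task-freeze-cgroup/cgroup.events"));
  	return 0;
  }
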
 .../bpf/prog_tests/task_freeze_cgroup.c       | 172 ++++++++++++++++++
 .../bpf/progs/test_task_freeze_cgroup.c       | 155 ++++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c

diff --git a/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c b/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c
new file mode 100644
index 000000000000..d4e9c0f32196
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include
+#include
+#include
+#include
+#include "test_task_freeze_cgroup.skel.h"
+
+#define CGROUP_PATH "/test-task-freeze-cgroup"
+
+static int bpf_sleepable(struct test_task_freeze_cgroup *skel)
+{
+	int err, cgroup_fd;
+	pid_t new_pid2;
+
+	cgroup_fd = cgroup_setup_and_join(CGROUP_PATH);
+	if (!ASSERT_OK(cgroup_fd < 0, "cgroup_setup_and_join"))
+		return -errno;
+
+	skel = test_task_freeze_cgroup__open();
+	if (!ASSERT_OK_PTR(skel, "test_task_freeze_cgroup__open")) {
+		err = -errno;
+		goto cleanup_cgroup;
+	}
+
+	skel->rodata->parent_pid = getppid();
+	skel->rodata->monitor_pid = getpid();
+	skel->rodata->cgid = get_cgroup_id(CGROUP_PATH);
+	skel->bss->new_pid = getpid();
+	skel->bss->freeze = 1;
+
+	err = test_task_freeze_cgroup__load(skel);
+	if (!ASSERT_OK(err, "test_task_freeze_cgroup__load")) {
+		err = -errno;
+		goto cleanup_skel;
+	}
+
+	/* First attach the LSM program that is triggered on bpf() calls,
+	 * especially when TP_BTF programs are attached.
+	 */
+	skel->links.lsm_freeze_cgroup =
+		bpf_program__attach_lsm(skel->progs.lsm_freeze_cgroup);
+	if (!ASSERT_OK_PTR(skel->links.lsm_freeze_cgroup, "attach_lsm")) {
+		err = -errno;
+		goto cleanup_detach;
+	}
+
+	/* Attaching this must fail with -EPERM and freeze the current task */
+	skel->links.tp_newchild =
+		bpf_program__attach_trace(skel->progs.tp_newchild);
+	if (!ASSERT_EQ(errno, EPERM, "attach_trace() must fail here")) {
+		err = -EINVAL;
+		goto cleanup_detach;
+	}
+
+	/* Continue */
+
+	/* Attach again, this time with success */
+	skel->links.tp_newchild =
+		bpf_program__attach_trace(skel->progs.tp_newchild);
+	if (!ASSERT_OK_PTR(skel->links.tp_newchild, "attach_trace")) {
+		err = -EINVAL;
+		goto cleanup_detach;
+	}
+
+	/* Fork, update vars from BPF and assert the unfrozen state */
+	new_pid2 = fork();
+	if (new_pid2 == 0)
+		exit(0);
+
+	err = (new_pid2 == -1);
+	if (ASSERT_OK(err, "fork process"))
+		wait(NULL);
+
+	/* Now assert that new_pid2 reflects this new child */
+	ASSERT_NEQ(0, skel->bss->new_pid,
+		   "test task_freeze_cgroup failed at new_pid != 0");
+	ASSERT_NEQ(skel->rodata->monitor_pid, skel->bss->new_pid,
+		   "test task_freeze_cgroup failed at old monitor_pid != new_pid");
+	/* Assert that bpf sets new_pid to the new forked child new_pid2 */
+	ASSERT_EQ(skel->bss->new_pid, new_pid2,
+		  "test task_freeze_cgroup failed first child new_pid == new_pid2");
+
+cleanup_detach:
+	test_task_freeze_cgroup__detach(skel);
+cleanup_skel:
+	test_task_freeze_cgroup__destroy(skel);
+cleanup_cgroup:
+	close(cgroup_fd);
+	cleanup_cgroup_environment();
+	return err;
+}
+
+void test_task_freeze_cgroup(void)
+{
+	pid_t pid, result;
+	char buf[512] = {0};
+	char path[PATH_MAX] = {0};
+	int ret, status, attempts, frozen = 0, fd;
+	struct test_task_freeze_cgroup *skel = NULL;
+
+	pid = fork();
+	ret = (pid == -1);
+	if (!ASSERT_OK(ret, "fork process"))
+		return;
+
+	if (pid == 0) {
+		ret = bpf_sleepable(skel);
+		ASSERT_EQ(0, ret, "child bpf_sleepable failed");
+		exit(ret);
+	}
+
+	skel = test_task_freeze_cgroup__open();
+	if (!ASSERT_OK_PTR(skel, "test_task_freeze_cgroup__open"))
+		goto out;
+
+	snprintf(path, sizeof(path),
+		 "/sys/fs/cgroup/cgroup-test-work-dir%d%s/cgroup.freeze",
+		 pid, CGROUP_PATH);
+
+	for (attempts = 10; attempts >= 0; attempts--) {
+		ret = 0;
+
+		fd = open(path, O_RDONLY);
+		if (fd > 0)
+			ret = read(fd, buf, sizeof(buf) - 1);
+		if (ret > 0) {
+			errno = 0;
+			frozen = strtol(buf, NULL, 10);
+			if (errno)
+				frozen = 0;
+		}
+
+		close(fd);
+		if (frozen)
+			break;
+		sleep(1);
+	}
+
+	/* Assert that the child cgroup is frozen */
+	if (!ASSERT_EQ(1, frozen, "child cgroup not frozen"))
+		goto out;
+
+	ret = test_task_freeze_cgroup__load(skel);
+	if (!ASSERT_OK(ret, "test_task_freeze_cgroup__load"))
+		goto out;
+
+	/* Trigger the unthaw of the child cgroup from the parent */
+	skel->links.lsm_task_free =
+		bpf_program__attach_lsm(skel->progs.lsm_task_free);
+	if (!ASSERT_OK_PTR(skel->links.lsm_task_free, "attach_lsm"))
+		goto out;
+
+	result = waitpid(pid, &status, WUNTRACED);
+	if (!ASSERT_NEQ(result, -1, "waitpid"))
+		goto detach;
+
+	result = WIFEXITED(status);
+	if (!ASSERT_EQ(result, 1, "forked process did not terminate normally"))
+		goto detach;
+
+	result = WEXITSTATUS(status);
+	if (!ASSERT_EQ(result, 0, "forked process did not exit successfully"))
+		goto detach;
+
+detach:
+	test_task_freeze_cgroup__detach(skel);
+
+out:
+	if (skel)
+		test_task_freeze_cgroup__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c b/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c
new file mode 100644
index 000000000000..07b4b65abc36
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include
+#include
+#include
+#include
+#include "bpf_kfuncs.h"
+#include "bpf_misc.h"
+
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) __ksym;
+void bpf_cgroup_release(struct cgroup *p) __ksym;
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+extern int bpf_cgroup_write_interface(struct cgroup *cgrp,
+				      const char *name__str,
+				      const struct bpf_dynptr *value_p,
+				      loff_t off) __ksym __weak;
+
+char freeze_val[] = "1";
+char unthaw_val[] = "0";
+
+const volatile int parent_pid;
+const volatile int monitor_pid;
+const volatile __u64 cgid;
+int new_pid;
+int freeze;
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(tp_newchild, struct task_struct *task, u64 clone_flags)
+{
+	struct cgroup *cgrp = NULL;
+	struct task_struct *acquired;
+
+	if (monitor_pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+
+	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return 0;
+
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp)
+		goto out;
+
+	/* Update new_pid with the current pid */
+	if (bpf_task_under_cgroup(acquired, cgrp))
+		new_pid = acquired->tgid;
+
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	bpf_task_release(acquired);
+
+	return 0;
+}
+
+/* Attached from the parent to trigger the bpf lsm hook, so that from the
+ * parent context we unthaw the child cgroup.
+ */
+SEC("lsm/task_free")
+int BPF_PROG(lsm_task_free, struct task_struct *task)
+{
+	return 0;
+}
+
+static int process_freeze_cgroup(int pid, int freeze)
+{
+	int ret = 0;
+	struct task_struct *task;
+	struct bpf_dynptr dyn_ptr;
+	struct cgroup *cgrp = NULL;
+
+	task = bpf_task_from_pid(pid);
+	if (!task)
+		return -EINVAL;
+
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!bpf_task_under_cgroup(task, cgrp))
+		goto out;
+
+	if (freeze)
+		bpf_dynptr_from_mem(freeze_val, sizeof(freeze_val), 0, &dyn_ptr);
+	else
+		bpf_dynptr_from_mem(unthaw_val, sizeof(unthaw_val), 0, &dyn_ptr);
+
+	ret = bpf_cgroup_write_interface(cgrp, "cgroup.freeze", &dyn_ptr, 0);
+
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	bpf_task_release(task);
+	return ret;
+}
+
+SEC("lsm.s/bpf")
+int BPF_PROG(lsm_freeze_cgroup, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	int ret = 0;
+	struct task_struct *task;
+	struct cgroup *cgrp = NULL;
+
+	if (cmd != BPF_LINK_CREATE)
+		return 0;
+
+	task = bpf_get_current_task_btf();
+	if (parent_pid == task->pid) {
+		/* Parent context: unthaw the child */
+		process_freeze_cgroup(monitor_pid, 0);
+		return 0;
+	}
+
+	/* Nothing to do */
+	if (!freeze)
+		return 0;
+
+	/* Child context */
+	if (monitor_pid != task->pid)
+		return 0;
+
+	/* Ensure we are under the corresponding cgroup so we freeze the
+	 * current child from its own context.
+	 */
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp)
+		return 0;
+
+	if (!bpf_task_under_cgroup(task, cgrp))
+		goto out;
+
+	/* Schedule the freeze of the task's cgroup and return -EPERM */
+	ret = process_freeze_cgroup(monitor_pid, freeze);
+
+	/* On error or 0 we return zero, and user space catches it if the
+	 * cgroup was not frozen.
+	 */
+	ret = (ret > 0) ? -EPERM : 0;
+
+	/* Reset for the next calls */
+	freeze = 0;
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	return ret;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.43.0
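For readers reproducing the scenario outside the selftest harness: the
setup performed by cgroup_setup_and_join() boils down to creating a cgroup
v2 child directory and migrating the current process into it. A rough
sketch, with illustrative paths and minimal error handling:

  /* Create a cgroup v2 child group and move the calling process into it.
   * Mount point and group name are illustrative.
   */
  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <unistd.h>

  static int join_new_cgroup(const char *path)
  {
  	char procs[512], pid[32];
  	int fd, n;

  	if (mkdir(path, 0755) && errno != EEXIST)
  		return -1;

  	snprintf(procs, sizeof(procs), "%s/cgroup.procs", path);
  	n = snprintf(pid, sizeof(pid), "%d", getpid());

  	fd = open(procs, O_WRONLY);
  	if (fd < 0)
  		return -1;

  	/* Writing a pid to cgroup.procs migrates that process */
  	if (write(fd, pid, n) != n) {
  		close(fd);
  		return -1;
  	}

  	close(fd);
  	return 0;
  }

  int main(void)
  {
  	return join_new_cgroup("/sys/fs/cgroup/test-task-freeze-cgroup");
  }

From there, the frozen or unthawed state can be observed as shown earlier,
and the cgroup can be removed again with rmdir(2) once it is empty.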