Add an interface to the misc cgroup controller that allows masking out hardware capabilities (AT_HWCAP) reported to user-space processes. This provides a mechanism to restrict the features a containerized application can see. The new "misc.mask" cgroup file allows users to specify masks for AT_HWCAP, AT_HWCAP2, AT_HWCAP3, and AT_HWCAP4. The output of "misc.mask" is extended to display the effective mask, which is a combination of the masks from the current cgroup and all its ancestors. Signed-off-by: Andrei Vagin --- fs/binfmt_elf.c | 24 +++++-- include/linux/misc_cgroup.h | 25 +++++++ kernel/cgroup/misc.c | 126 ++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3eb734c192e9..59137784e81d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -182,6 +183,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; + struct misc_cg *misc_cg; + u64 hwcap_mask[4] = {0, 0, 0, 0}; + + misc_cg = get_current_misc_cg(); + misc_cg_get_mask(MISC_CG_MASK_HWCAP, misc_cg, &hwcap_mask[0]); +#ifdef ELF_HWCAP2 + misc_cg_get_mask(MISC_CG_MASK_HWCAP2, misc_cg, &hwcap_mask[1]); +#endif +#ifdef ELF_HWCAP3 + misc_cg_get_mask(MISC_CG_MASK_HWCAP3, misc_cg, &hwcap_mask[2]); +#endif +#ifdef ELF_HWCAP4 + misc_cg_get_mask(MISC_CG_MASK_HWCAP4, misc_cg, &hwcap_mask[3]); +#endif + put_misc_cg(misc_cg); /* * In some cases (e.g. 
Hyper-Threading), we want to avoid L1 @@ -246,7 +262,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, */ ARCH_DLINFO; #endif - NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP & ~hwcap_mask[0]); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, phdr_addr); @@ -264,13 +280,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 - NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2 & ~hwcap_mask[1]); #endif #ifdef ELF_HWCAP3 - NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3 & ~hwcap_mask[2]); #endif #ifdef ELF_HWCAP4 - NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4 & ~hwcap_mask[3]); #endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 0cb36a3ffc47..cff830c238fb 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -8,6 +8,8 @@ #ifndef _MISC_CGROUP_H_ #define _MISC_CGROUP_H_ +#include + /** * enum misc_res_type - Types of misc cgroup entries supported by the host. 
*/ @@ -26,6 +28,20 @@ enum misc_res_type { MISC_CG_RES_TYPES }; +enum misc_mask_type { + MISC_CG_MASK_HWCAP, +#ifdef ELF_HWCAP2 + MISC_CG_MASK_HWCAP2, +#endif +#ifdef ELF_HWCAP3 + MISC_CG_MASK_HWCAP3, +#endif +#ifdef ELF_HWCAP4 + MISC_CG_MASK_HWCAP4, +#endif + MISC_CG_MASK_TYPES +}; + struct misc_cg; #ifdef CONFIG_CGROUP_MISC @@ -62,12 +78,15 @@ struct misc_cg { struct cgroup_file events_local_file; struct misc_res res[MISC_CG_RES_TYPES]; + u64 mask[MISC_CG_MASK_TYPES]; }; int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); +int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask); + /** * css_misc() - Get misc cgroup from the css. * @css: cgroup subsys state object. @@ -134,5 +153,11 @@ static inline void put_misc_cg(struct misc_cg *cg) { } +static inline int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask) +{ + *pmask = 0; + return 0; +} + #endif /* CONFIG_CGROUP_MISC */ #endif /* _MISC_CGROUP_H_ */ diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 6a01d91ea4cb..d1386d86060f 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -30,6 +30,19 @@ static const char *const misc_res_name[] = { #endif }; +static const char *const misc_mask_name[] = { + "AT_HWCAP", +#ifdef ELF_HWCAP2 + "AT_HWCAP2", +#endif +#ifdef ELF_HWCAP3 + "AT_HWCAP3", +#endif +#ifdef ELF_HWCAP4 + "AT_HWCAP4", +#endif +}; + /* Root misc cgroup */ static struct misc_cg root_cg; @@ -71,6 +84,11 @@ static inline bool valid_type(enum misc_res_type type) return type >= 0 && type < MISC_CG_RES_TYPES; } +static inline bool valid_mask_type(enum misc_mask_type type) +{ + return type >= 0 && type < MISC_CG_MASK_TYPES; +} + /** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. 
@@ -391,6 +409,109 @@ static int misc_events_local_show(struct seq_file *sf, void *v) return __misc_events_show(sf, true); } +/** + * misc_cg_get_mask() - Get the mask of the specified type. + * @type: The misc mask type. + * @cg: The misc cgroup. + * @pmask: Pointer to the resulting mask. + * + * This function calculates the effective mask for a given cgroup by walking up + * the hierarchy and ORing the masks of the cgroup and all its ancestors. The final result + * is stored in the location pointed to by @pmask. + * + * Context: Any context. + * Return: 0 on success, -EINVAL if @type is invalid. + */ +int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask) +{ + struct misc_cg *i; + u64 mask = 0; + + if (!(valid_mask_type(type))) + return -EINVAL; + + for (i = cg; i; i = parent_misc(i)) + mask |= READ_ONCE(i->mask[type]); + + *pmask = mask; + return 0; +} + +/** + * misc_cg_mask_show() - Show the misc cgroup masks. + * @sf: Interface file + * @v: Arguments passed + * + * Context: Any context. + * Return: 0 to denote successful print. + */ +static int misc_cg_mask_show(struct seq_file *sf, void *v) +{ + struct misc_cg *cg = css_misc(seq_css(sf)); + int i; + + for (i = 0; i < MISC_CG_MASK_TYPES; i++) { + u64 rval, val = READ_ONCE(cg->mask[i]); + + misc_cg_get_mask(i, cg, &rval); + seq_printf(sf, "%s\t%#016llx\t%#016llx\n", misc_mask_name[i], val, rval); + } + + return 0; +} + +/** + * misc_cg_mask_write() - Update the mask of the specified type. + * @of: Handler for the file. + * @buf: The buffer containing the user's input. + * @nbytes: The number of bytes in @buf. + * @off: The offset in the file. + * + * This function parses a user-provided string to update a mask. + * The expected format is "<name> <mask>", for example: + * + * echo "AT_HWCAP 0xf00" > misc.mask + * + * Context: Process context. + * Return: The number of bytes processed on success, or a negative error code + * on failure. 
+ */ +static ssize_t misc_cg_mask_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct misc_cg *cg; + u64 max; + int ret = 0, i; + enum misc_mask_type type = MISC_CG_MASK_TYPES; + char *token; + + buf = strstrip(buf); + token = strsep(&buf, " "); + + if (!token || !buf) + return -EINVAL; + + for (i = 0; i < MISC_CG_MASK_TYPES; i++) { + if (!strcmp(misc_mask_name[i], token)) { + type = i; + break; + } + } + + if (type == MISC_CG_MASK_TYPES) + return -EINVAL; + + ret = kstrtou64(buf, 0, &max); + if (ret) + return ret; + + cg = css_misc(of_css(of)); + + WRITE_ONCE(cg->mask[type], max); + + return nbytes; +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -424,6 +545,11 @@ static struct cftype misc_cg_files[] = { .file_offset = offsetof(struct misc_cg, events_local_file), .seq_show = misc_events_local_show, }, + { + .name = "mask", + .write = misc_cg_mask_write, + .seq_show = misc_cg_mask_show, + }, {} }; -- 2.52.0.223.gf5cc29aaa4-goog Add a selftest for the misc.mask cgroup interface. The test verifies that the misc.mask file is present and has the correct default value, that it is possible to write a new mask to the file, and that the mask is inherited by sub-cgroups. 
Signed-off-by: Andrei Vagin --- tools/testing/selftests/cgroup/.gitignore | 1 + tools/testing/selftests/cgroup/Makefile | 2 + tools/testing/selftests/cgroup/config | 1 + tools/testing/selftests/cgroup/test_misc.c | 118 +++++++++++++++++++++ 4 files changed, 122 insertions(+) create mode 100644 tools/testing/selftests/cgroup/test_misc.c diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 952e4448bf07..3ced02a3634b 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -7,6 +7,7 @@ test_hugetlb_memcg test_kill test_kmem test_memcontrol +test_misc test_pids test_zswap wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index e01584c2189a..6e9e92f89d8a 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -15,6 +15,7 @@ TEST_GEN_PROGS += test_hugetlb_memcg TEST_GEN_PROGS += test_kill TEST_GEN_PROGS += test_kmem TEST_GEN_PROGS += test_memcontrol +TEST_GEN_PROGS += test_misc TEST_GEN_PROGS += test_pids TEST_GEN_PROGS += test_zswap @@ -31,5 +32,6 @@ $(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O) $(OUTPUT)/test_kill: $(LIBCGROUP_O) $(OUTPUT)/test_kmem: $(LIBCGROUP_O) $(OUTPUT)/test_memcontrol: $(LIBCGROUP_O) +$(OUTPUT)/test_misc: $(LIBCGROUP_O) $(OUTPUT)/test_pids: $(LIBCGROUP_O) $(OUTPUT)/test_zswap: $(LIBCGROUP_O) diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 39f979690dd3..9e3d03736f5a 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -1,6 +1,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_MISC=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_PAGE_COUNTER=y diff --git a/tools/testing/selftests/cgroup/test_misc.c b/tools/testing/selftests/cgroup/test_misc.c new file mode 100644 index 000000000000..50e8acb51852 --- /dev/null +++ 
b/tools/testing/selftests/cgroup/test_misc.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "cgroup_util.h" + +/* + * This test checks that misc.mask works correctly. + */ +static int test_misc_mask(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_misc, *cg_misc_sub = NULL; + + cg_misc = cg_name(root, "misc_test"); + if (!cg_misc) + goto cleanup; + + cg_misc_sub = cg_name(root, "misc_test/sub"); + if (!cg_misc_sub) + goto cleanup; + + if (cg_create(cg_misc)) + goto cleanup; + + if (cg_read_strcmp(cg_misc, "misc.mask", + "AT_HWCAP\t0x00000000000000\t0x00000000000000\n")) + goto cleanup; + + if (cg_write(cg_misc, "misc.mask", "AT_HWCAP 0xf0000000000000")) + goto cleanup; + + if (cg_read_strcmp(cg_misc, "misc.mask", + "AT_HWCAP\t0xf0000000000000\t0xf0000000000000\n")) + goto cleanup; + + if (cg_write(cg_misc, "cgroup.subtree_control", "+misc")) + goto cleanup; + + if (cg_create(cg_misc_sub)) + goto cleanup; + + if (cg_read_strcmp(cg_misc_sub, "misc.mask", + "AT_HWCAP\t0x00000000000000\t0xf0000000000000\n")) + goto cleanup; + + if (cg_write(cg_misc_sub, "misc.mask", "AT_HWCAP 0x01000000000000")) + goto cleanup; + + if (cg_read_strcmp(cg_misc_sub, "misc.mask", + "AT_HWCAP\t0x01000000000000\t0xf1000000000000\n")) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + cg_destroy(cg_misc_sub); + cg_destroy(cg_misc); + free(cg_misc); + free(cg_misc_sub); + + return ret; +} + +#define T(x) { x, #x } +struct misc_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_misc_mask), +}; +#undef T + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + /* + * Check that misc controller is available: + * misc is listed in 
cgroup.controllers + */ + if (cg_read_strstr(root, "cgroup.controllers", "misc")) + ksft_exit_skip("misc controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "misc")) + if (cg_write(root, "cgroup.subtree_control", "+misc")) + ksft_exit_skip("Failed to set misc controller\n"); + + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + ksft_finished(); +} -- 2.52.0.223.gf5cc29aaa4-goog Updates the cgroup-v2 documentation to include details about the newly introduced 'misc.mask' interface. This interface, part of the 'misc' cgroup controller, allows masking out hardware capabilities (AT_HWCAP, AT_HWCAP2, AT_HWCAP3, AT_HWCAP4) reported to user-space processes within a cgroup. Signed-off-by: Andrei Vagin --- Documentation/admin-guide/cgroup-v2.rst | 25 +++++++++++++++++++++++++ Documentation/arch/arm64/elf_hwcaps.rst | 21 +++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4c072e85acdf..9d9d923e0d4e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2924,6 +2924,31 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_ cgroup i.e. not hierarchical. The file modified event generated on this file reflects only the local events. +Miscellaneous controller provides one interface file to control masks. + + misc.mask + A read-write flat-keyed file shown in all cgroups. It allows + setting/reading the masks. The file format is a series of lines, each + describing a mask of a specific mask type. 
+ + The file has the following format for each line:: + + $NAME\t$LOCAL_MASK\t$EFFECTIVE_MASK + + Where $NAME is the mask type name, $LOCAL_MASK is the mask for the + current cgroup, and $EFFECTIVE_MASK is the effective mask for the + current cgroup, which is the bitwise OR of the masks of the current + cgroup and all its ancestors. + + To set a mask, write a string in the following format to the file:: + + $NAME $MASK + + For example, to set a mask for the mask_a type, you would write the + following to the file:: + + # echo "mask_a 0x3000" > misc.mask + Migration and Ownership ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index a15df4956849..5526daff5d30 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -450,3 +450,24 @@ HWCAP3_LSFE For interoperation with userspace, the kernel guarantees that bits 62 and 63 of AT_HWCAP will always be returned as 0. + +5. Masking hwcaps for a group of processes +------------------------------------------ + +The misc cgroup controller provides a mechanism to mask hwcaps for a specific +workload. This can be useful for limiting the features available to a +containerized application. + +To mask hwcaps, you can write a mask to the ``misc.mask`` file in the cgroup +directory. The mask is specified per AT_HWCAP entry (AT_HWCAP, AT_HWCAP2, +AT_HWCAP3) in the format ``<NAME> <MASK>``. + +For example, to mask ``HWCAP_FP`` and ``HWCAP_ASIMD`` (which are represented by +bits 0 and 1 of AT_HWCAP, so a mask of 0x3) for a workload, you would write the +mask for AT_HWCAP to the ``misc.mask`` file in the new cgroup directory:: + + # echo "AT_HWCAP 0x3" > /sys/fs/cgroup/misc/my-workload/misc.mask + +Any new processes started in this cgroup will have the specified hwcaps +masked. You can verify this by reading the ``misc.mask`` file, which will +show the effective mask for the cgroup. -- 2.52.0.223.gf5cc29aaa4-goog