From: Cong Wang SECCOMP_IOCTL_NOTIF_PIN_INSTALL maps a supervisor-owned @memfd at @target_addr in the trapped task's mm via vm_mmap_seal_remote(), PROT_READ, MAP_SHARED, MAP_FIXED_NOREPLACE and VM_SEALED. Because the mapping is sealed, neither the target nor a CLONE_VM peer can munmap, mremap, mprotect or MAP_FIXED-stomp it; its contents are immutable from the target's side while the supervisor retains write access through its own mapping of the same memfd. The install needs no target-side cooperation, which is what makes the feature usable for fork+execve sandbox wrappers (Sandlock, Firejail, Bubblewrap-style) that have no trusted post-exec window to install their own mappings. The pin is just a sealed VMA owned by the target's mm: it persists until the task execve()s or exits (a sealed VMA cannot be unmapped piecemeal), and the kernel keeps no per-pin bookkeeping. A supervisor reuses one region across many redirects. Assisted-by: Claude:claude-opus-4.8 Signed-off-by: Cong Wang --- include/linux/seccomp.h | 5 ++ include/uapi/linux/seccomp.h | 54 ++++++++++++++++ kernel/seccomp.c | 117 +++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 9b959972bf4a..a91d1fc8a2b8 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -16,6 +16,11 @@ #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 #define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 +/* sizeof() the first published struct seccomp_notif_pin_install */ +#define SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER0 32 /* up to @size */ +#define SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER1 40 /* adds @offset */ +#define SECCOMP_NOTIFY_PIN_INSTALL_SIZE_LATEST SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER1 + #ifdef CONFIG_SECCOMP #include diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index dbfc9b37fcae..cc34188f8aeb 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -137,6 +137,50 @@ 
struct seccomp_notif_addfd { __u32 newfd_flags; }; +/** + * struct seccomp_notif_pin_install - have the kernel install a sealed + * MAP_SHARED mapping of @memfd into the trapped task's mm at @target_addr. + * + * The supervisor owns @memfd. The kernel installs the mapping into + * the trapped task's address space without target-side cooperation + * (the target need not mmap or mseal anything itself). The mapping + * is marked VM_SEALED at install time, so the target and any + * CLONE_VM peer cannot munmap, mremap, mprotect, or MAP_FIXED-stomp + * it. The mapping is read-only. The supervisor retains access via its + * own mapping of the same memfd in its own mm. + * + * @memfd must be write-sealed (F_SEAL_WRITE or F_SEAL_FUTURE_WRITE), + * otherwise the ioctl fails with -EINVAL. This guarantees the pin's bytes + * cannot be rewritten through any other reference to the same memfd (for + * example one the target reopened via the supervisor's /proc/<pid>/fd), + * not just through the read-only pin itself. F_SEAL_FUTURE_WRITE still + * lets the supervisor update the bytes through its own pre-seal mapping. + * + * @offset lets one memfd back several disjoint read-only pins. + * + * @id: The ID of an active seccomp notification on this listener, + * identifying the trapped task whose mm receives the pin. + * @flags: Reserved, must be 0. + * @memfd: Supervisor-side fd for the backing memfd. Must be write-sealed. + * @target_addr: Address in the trapped task's mm to install at. Must be + * page-aligned. If non-zero, MAP_FIXED_NOREPLACE semantics apply: no + * other mapping may exist in [@target_addr, @target_addr + + * @size). If zero, the kernel chooses a free area in the + * target mm. On success the actual mapped address is written + * back here. + * @size: Size of the pin in bytes. Must be page-aligned. + * @offset: Page-aligned byte offset into @memfd to map from. Zero maps + * from the start of the memfd. 
+ */ +struct seccomp_notif_pin_install { + __u64 id; + __u32 flags; + __u32 memfd; + __u64 target_addr; + __u64 size; + __u64 offset; +}; + #define SECCOMP_IOC_MAGIC '!' #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) @@ -154,4 +198,14 @@ struct seccomp_notif_addfd { #define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) +/* + * Install a sealed memfd-backed pin in the trapped task's mm without + * target-side cooperation. The supervisor owns the backing memfd; + * the kernel installs the mapping and marks it VM_SEALED. The actual + * mapped address is written back to @target_addr (relevant when it was + * passed as 0 to let the kernel choose). + */ +#define SECCOMP_IOCTL_NOTIF_PIN_INSTALL SECCOMP_IOWR(5, \ + struct seccomp_notif_pin_install) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 066909393c38..fa0fb3c960a8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -37,12 +37,19 @@ #ifdef CONFIG_SECCOMP_FILTER #include #include +#include #include #include #include #include #include #include +#include +#include +#include +#include +#include +#include /* * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the @@ -1823,6 +1830,113 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, return ret; } +static unsigned long seccomp_install_pin(struct task_struct *target, + struct file *memfd_file, + unsigned long target_addr, size_t size, + unsigned long offset) +{ + struct mm_struct *mm; + unsigned long ret; + + mm = get_task_mm(target); + if (!mm) + return -ESRCH; + + /* + * Install a sealed, read-only mapping. A fixed request (@target_addr + * != 0) is MAP_FIXED_NOREPLACE: an existing mapping yields -EEXIST + * rather than being silently clobbered. A request of 0 lets the kernel + * pick a free area in the target mm. 
+ */ + ret = vm_mmap_seal_remote(mm, memfd_file, target_addr, size, + offset >> PAGE_SHIFT); + mmput(mm); + if (IS_ERR_VALUE(ret)) + return ret; + if (target_addr && ret != target_addr) + return -ENOMEM; + return ret; +} + +static long seccomp_notify_pin_install(struct seccomp_filter *filter, + struct seccomp_notif_pin_install __user *upin, + unsigned int size) +{ + struct seccomp_notif_pin_install pin; + struct seccomp_knotif *knotif; + struct task_struct *target; + struct file *memfd_file; + unsigned long addr; + int seals; + long ret; + + BUILD_BUG_ON(sizeof(pin) < SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER0); + BUILD_BUG_ON(sizeof(pin) != SECCOMP_NOTIFY_PIN_INSTALL_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&pin, sizeof(pin), upin, size); + if (ret) + return ret; + + if (pin.flags) + return -EINVAL; + if (!pin.size || !IS_ALIGNED(pin.target_addr, PAGE_SIZE) || + !IS_ALIGNED(pin.size, PAGE_SIZE) || !IS_ALIGNED(pin.offset, PAGE_SIZE)) + return -EINVAL; + if (pin.target_addr + pin.size < pin.target_addr) + return -EINVAL; + if (pin.offset + pin.size < pin.offset) + return -EINVAL; + + memfd_file = fget(pin.memfd); + if (!memfd_file) + return -EBADF; + + seals = memfd_get_seals(memfd_file); + if (seals < 0 || !(seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + ret = -EINVAL; + goto out_fput; + } + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out_fput; + + knotif = find_notification(filter, pin.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + target = knotif->task; + get_task_struct(target); + mutex_unlock(&filter->notify_lock); + + addr = seccomp_install_pin(target, memfd_file, + pin.target_addr, pin.size, pin.offset); + put_task_struct(target); + if (IS_ERR_VALUE(addr)) + ret = addr; + else if (put_user(addr, &upin->target_addr)) + /* Pin 
is installed (and sealed); we just can't report where. */ + ret = -EFAULT; + else + ret = 0; + goto out_fput; + +out_unlock: + mutex_unlock(&filter->notify_lock); +out_fput: + fput(memfd_file); + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1847,6 +1961,9 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, switch (EA_IOCTL(cmd)) { case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_PIN_INSTALL): + return seccomp_notify_pin_install(filter, buf, + _IOC_SIZE(cmd)); default: return -EINVAL; } -- 2.43.0