Hibernation via uswsusp (/dev/snapshot ioctls) has a race: between setting the resume swap area and allocating a swap slot, user-space is not yet frozen, so swapoff can run and cause an incorrect slot allocation. Fix this by keeping swap_type_of() as a static helper that requires swap_lock to be held, and introducing new interfaces that wrap it with proper locking and reference management: - get_hibernation_swap_type(): Lookup under swap_lock + acquire a swap device reference to block swapoff (used by uswsusp). - find_hibernation_swap_type(): Lookup under swap_lock only, no reference. Used by the sysfs path where user-space is already frozen, making swapoff impossible. - put_hibernation_swap_type(): Release the reference. Because the reference is held via get_swap_device_info(), swapoff will block at wait_for_completion_interruptible() until put_hibernation_swap_type() releases it. The wait is interruptible, so swapoff can be cancelled by a signal. Signed-off-by: Youngjun Park --- include/linux/swap.h | 4 +- kernel/power/swap.c | 2 +- kernel/power/user.c | 15 ++++++-- mm/swapfile.c | 92 ++++++++++++++++++++++++++++++++++++-------- 4 files changed, 92 insertions(+), 21 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977a5..cf8cfdaf34a7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -433,7 +433,9 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int swap_type_of(dev_t device, sector_t offset); +int get_hibernation_swap_type(dev_t device, sector_t offset); +int find_hibernation_swap_type(dev_t device, sector_t offset); +void put_hibernation_swap_type(int type); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2e64869bb5a0..cc4764149e8f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -341,7 +341,7 @@ static int
swsusp_swap_check(void) * This is called before saving the image. */ if (swsusp_resume_device) - res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + res = find_hibernation_swap_type(swsusp_resume_device, swsusp_resume_block); else res = find_first_swap(&swsusp_resume_device); if (res < 0) diff --git a/kernel/power/user.c b/kernel/power/user.c index 4401cfe26e5c..3e41544b99d5 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swap_type_of(swsusp_resume_device, 0); + data->swap = get_hibernation_swap_type(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); @@ -90,8 +90,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->free_bitmaps = !error; } } - if (error) + if (error) { + put_hibernation_swap_type(data->swap); hibernate_release(); + } data->frozen = false; data->ready = false; @@ -115,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); + put_hibernation_swap_type(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); @@ -235,11 +238,17 @@ static int snapshot_set_swap_area(struct snapshot_data *data, offset = swap_area.offset; } + /* + * Drop the reference held on the previous swap area, if any; it + * was taken by snapshot_open() or an earlier SNAPSHOT_SET_SWAP_AREA. + */ + put_hibernation_swap_type(data->swap); + /* * User space encodes device types as two-byte values, * so we need to recode them */ - data->swap = swap_type_of(swdev, offset); + data->swap = get_hibernation_swap_type(swdev, offset); if (data->swap < 0) return swdev ?
-ENODEV : -EINVAL; data->dev = swdev; diff --git a/mm/swapfile.c b/mm/swapfile.c index 71a7d6959f3e..7baa0f270cff 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -134,7 +134,7 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { - if (type >= MAX_SWAPFILES) + if (type < 0 || type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } @@ -2139,22 +2139,15 @@ void swap_free_hibernation_slot(swp_entry_t entry) put_swap_device(si); } -/* - * Find the swap type that corresponds to given device (if any). - * - * @offset - number of the PAGE_SIZE-sized block of the device, starting - * from 0, in which the swap header is expected to be located. - * - * This is needed for the suspend to disk (aka swsusp). - */ -int swap_type_of(dev_t device, sector_t offset) +static int swap_type_of(dev_t device, sector_t offset) { int type; + lockdep_assert_held(&swap_lock); + if (!device) return -1; - spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; @@ -2164,16 +2157,70 @@ int swap_type_of(dev_t device, sector_t offset) if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); - if (se->start_block == offset) { - spin_unlock(&swap_lock); + if (se->start_block == offset) return type; - } } } - spin_unlock(&swap_lock); return -ENODEV; } +/* + * Finds the swap type and safely acquires a reference to the swap device + * to prevent race conditions with swapoff. + * + * This should be used in environments like uswsusp where a race condition + * exists between configuring the resume device and allocating a swap slot. + * For sysfs hibernation where user-space is frozen (making swapoff + * impossible), use find_hibernation_swap_type() instead. 
+ * + * The caller must drop the reference using put_hibernation_swap_type(). + */ +int get_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + struct swap_info_struct *sis; + + spin_lock(&swap_lock); + type = swap_type_of(device, offset); + sis = swap_type_to_info(type); + if (!sis || !get_swap_device_info(sis)) + type = -1; + + spin_unlock(&swap_lock); + return type; +} + +/* + * Drops the reference to the swap device previously acquired by + * get_hibernation_swap_type(). + */ +void put_hibernation_swap_type(int type) +{ + struct swap_info_struct *sis; + + sis = swap_type_to_info(type); + if (!sis) + return; + + put_swap_device(sis); +} + +/* + * Simple lookup without acquiring a reference. Used by the sysfs + * hibernation path where user-space is already frozen, making + * swapoff impossible. + */ +int find_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + + spin_lock(&swap_lock); + type = swap_type_of(device, offset); + spin_unlock(&swap_lock); + + return type; +} + int find_first_swap(dev_t *device) { int type; @@ -2971,10 +3018,23 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) * spinlock) will be waited too. This makes it easy to * prevent folio_test_swapcache() and the following swap cache * operations from racing with swapoff. + * + * Note: if a hibernation session is actively holding a swap + * device reference, swapoff will block here until the reference + * is released via put_hibernation_swap_type() or the wait is + * interrupted by a signal. */ percpu_ref_kill(&p->users); synchronize_rcu(); - wait_for_completion(&p->comp); + err = wait_for_completion_interruptible(&p->comp); + if (err) { + percpu_ref_resurrect(&p->users); + synchronize_rcu(); + reinit_completion(&p->comp); + reinsert_swap_info(p); + goto out_dput; + } + flush_work(&p->discard_work); flush_work(&p->reclaim_work); -- 2.34.1