From: Zhihao Cheng For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..0742039c53a7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4157,9 +4157,8 @@ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; - unsigned max = blocksize - (offset & (blocksize - 1)); + unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between -- 2.46.1 From: Baokun Li For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0742039c53a7..4c04af7e51c9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4183,7 +4183,6 @@ static int ext4_block_zero_page_range(handle_t *handle, static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { - unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; @@ -4192,8 +4191,8 @@ static int ext4_block_truncate_page(handle_t *handle, if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + blocksize = i_blocksize(inode); + length = blocksize - (from & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } -- 2.46.1 From: Baokun Li Previously, ext4_rec_len_(to|from)_disk only performed complex rec_len conversions when PAGE_SIZE >= 65536 to reduce complexity. However, we are soon to support file system block sizes greater than page size, which makes these conditional checks unnecessary. Thus, these checks are now removed. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/ext4.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 24c414605b08..93c2bf4d125a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2464,28 +2464,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len, return (rec_len & ~EXT4_DIR_ROUND); } -/* - * If we ever get support for fs block sizes > page_size, we'll need - * to remove the #if statements in the next two functions... 
- */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); -#else - return len; -#endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); -#if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { @@ -2495,9 +2486,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -#else - return cpu_to_le16(len); -#endif } /* -- 2.46.1 From: Baokun Li Since the block size may be greater than the page size, when a hole extends beyond i_size, we need to align the hole's end upwards to the larger of PAGE_SIZE and blocksize. This is to prevent the issues seen in commit 2be4751b21ae ("ext4: fix 2nd xfstests 127 punch hole failure") from reappearing after BS > PS is supported. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4c04af7e51c9..a63513a3db53 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4401,7 +4401,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) * the page that contains i_size. */ if (end > inode->i_size) - end = round_up(inode->i_size, PAGE_SIZE); + end = round_up(inode->i_size, + umax(PAGE_SIZE, sb->s_blocksize)); if (end > max_end) end = max_end; length = end - offset; -- 2.46.1 From: Baokun Li The dioread_nolock related processes already support large folio, so dioread_nolock is enabled by default regardless of whether the blocksize is less than, equal to, or greater than PAGE_SIZE. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 894529f9b0cc..aa5aee4d1b63 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4383,8 +4383,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize <= PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) -- 2.46.1 From: Baokun Li This commit introduces the s_min_folio_order field to the ext4_sb_info structure. This field will store the minimum folio order required by the current filesystem, laying groundwork for future support of block sizes greater than PAGE_SIZE. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/ext4.h | 3 +++ fs/ext4/inode.c | 3 ++- fs/ext4/super.c | 10 +++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 93c2bf4d125a..bca6c3709673 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1677,6 +1677,9 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. 
*/ unsigned long s_last_trim_minblks; + /* minimum folio order of a page cache allocation */ + unsigned int s_min_folio_order; + /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a63513a3db53..889761ed51dd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5174,7 +5174,8 @@ void ext4_set_inode_mapping_order(struct inode *inode) if (!ext4_should_enable_large_folio(inode)) return; - mapping_set_folio_order_range(inode->i_mapping, 0, + mapping_set_folio_order_range(inode->i_mapping, + EXT4_SB(inode->i_sb)->s_min_folio_order, EXT4_MAX_PAGECACHE_ORDER(inode)); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index aa5aee4d1b63..d353e25a5b92 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5100,11 +5100,8 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, * If the default block size is not the same as the real block size, * we need to reload it. */ - if (sb->s_blocksize == blocksize) { - *lsb = logical_sb_block; - sbi->s_sbh = bh; - return 0; - } + if (sb->s_blocksize == blocksize) + goto success; /* * bh must be released before kill_bdev(), otherwise @@ -5135,6 +5132,9 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } + +success: + sbi->s_min_folio_order = get_order(blocksize); *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; -- 2.46.1 From: Baokun Li ext4_calculate_overhead() used a single page for its bitmap buffer, which worked fine when PAGE_SIZE >= block size. However, with block size greater than page size (BS > PS) support, the bitmap can exceed a single page. To address this, we now use __get_free_pages() to allocate multiple pages, sized to the block size, to properly support BS > PS. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d353e25a5b92..7338c708ea1d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4182,7 +4182,8 @@ int ext4_calculate_overhead(struct super_block *sb) unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_NOFS); + gfp_t gfp = GFP_NOFS | __GFP_ZERO; + char *buf = (char *)__get_free_pages(gfp, sbi->s_min_folio_order); if (!buf) return -ENOMEM; @@ -4207,7 +4208,7 @@ int ext4_calculate_overhead(struct super_block *sb) blks = count_overhead(sb, i, buf); overhead += blks; if (blks) - memset(buf, 0, PAGE_SIZE); + memset(buf, 0, sb->s_blocksize); cond_resched(); } @@ -4230,7 +4231,7 @@ int ext4_calculate_overhead(struct super_block *sb) } sbi->s_overhead = overhead; smp_wmb(); - free_page((unsigned long) buf); + free_pages((unsigned long)buf, sbi->s_min_folio_order); return 0; } -- 2.46.1 From: Baokun Li In ext4_readdir(), page_cache_sync_readahead() is used to readahead mapped physical blocks. With LBS support, this can lead to a negative right shift. To fix this, the page index is now calculated by first converting the physical block number (pblk) to a file position (pos) before converting it to a page index. Also, the correct number of pages to readahead is now passed. 
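For illustration, the index arithmetic described above can be modeled in a small userspace sketch; the 4K page size and 64K block size below are example values (not taken from the patch), and DEMO_PAGE_SHIFT stands in for the kernel's PAGE_SHIFT:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT	12	/* example: 4K pages */

int main(void)
{
	uint64_t pblk = 3;		/* example physical block number */
	unsigned int blkbits = 16;	/* example: 64K blocks, i.e. BS > PS */

	/*
	 * Old formula: pblk >> (PAGE_SHIFT - blkbits). With
	 * blkbits > PAGE_SHIFT the shift count is negative, which is
	 * undefined behaviour in C.
	 *
	 * New formula: go through the byte position first.
	 */
	uint64_t pos = pblk << blkbits;			/* pblk -> bytes */
	uint64_t index = pos >> DEMO_PAGE_SHIFT;	/* bytes -> page index */

	/* One block covers 2^(blkbits - PAGE_SHIFT) pages when BS > PS. */
	unsigned int nr = 1U << (blkbits - DEMO_PAGE_SHIFT);

	printf("pblk %llu -> page index %llu, readahead %u pages\n",
	       (unsigned long long)pblk, (unsigned long long)index, nr);
	return 0;
}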
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d4164c507a90..256fe2c1d4c1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) continue; } if (err > 0) { - pgoff_t index = map.m_pblk >> - (PAGE_SHIFT - inode->i_blkbits); + pgoff_t index = map.m_pblk << inode->i_blkbits >> + PAGE_SHIFT; if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, - &file->f_ra, file, - index, 1); + &file->f_ra, file, index, + 1 << EXT4_SB(sb)->s_min_folio_order); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { -- 2.46.1 From: Baokun Li No functional changes. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 20 +++++++++----------- fs/ext4/namei.c | 8 +++----- fs/ext4/verity.c | 2 +- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bca6c3709673..9b236f620b3a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -367,6 +367,7 @@ struct ext4_io_submit { blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) +#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ca5499e9412b..da640c88b863 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4562,7 +4562,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * allow a full retry cycle for any remaining allocations */ retries = 0; - epos = (loff_t)(map.m_lblk + ret) << blkbits; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 889761ed51dd..73c1da90b604 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -825,9 +825,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { - loff_t start_byte = - (loff_t)map->m_lblk << inode->i_blkbits; - loff_t length = (loff_t)map->m_len << inode->i_blkbits; + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, @@ -2225,7 +2224,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; - int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); @@ -2251,7 +2249,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, + mpd->map.m_lblk); } *map_bh = true; goto out; @@ -2261,7 +2260,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - io_end_size += (1 << blkbits); + io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); 
io_end_vec->size += io_end_size; @@ -2463,7 +2462,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); - io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3503,8 +3502,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = (u64) map->m_lblk << blkbits; - iomap->length = (u64) map->m_len << blkbits; + iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); + iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3678,7 +3677,6 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; - u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; @@ -3737,7 +3735,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) + else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2cd36f59c9e3..78cefb7cc9a7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1076,7 +1076,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + EXT4_LBLK_TO_B(dir, block) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; @@ -1630,7 +1630,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; @@ -1710,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { - struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; @@ -1729,8 +1728,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, goto errout; retval = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), - res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (retval == 1) goto success; brelse(bh); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce97..7a980a8059bd 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -302,7 +302,7 @@ static int ext4_get_verity_descriptor_location(struct inode *inode, end_lblk = le32_to_cpu(last_extent->ee_block) + ext4_ext_get_actual_len(last_extent); - desc_size_pos = (u64)end_lblk << inode->i_blkbits; + desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk); ext4_free_ext_path(path); if (desc_size_pos < sizeof(desc_size_disk)) -- 2.46.1 From: Baokun Li As BS > PS support is coming, all block number to page index (and vice-versa)
conversions must now go via bytes. Added EXT4_LBLK_TO_P() and EXT4_P_TO_LBLK() macros to simplify these conversions and handle both BS <= PS and BS > PS scenarios cleanly. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/ext4.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9b236f620b3a..8223ed29b343 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -369,6 +369,12 @@ struct ext4_io_submit { (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) +/* Translate a block number to a page index */ +#define EXT4_LBLK_TO_P(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ + PAGE_SHIFT) +/* Translate a page index to a block number */ +#define EXT4_P_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ + (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ -- 2.46.1 From: Baokun Li Currently, ext4_mb_load_buddy_gfp() uses blocks_per_page to calculate the folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. To support BS > PS, use bytes to compute folio index and offset within folio to get rid of blocks_per_page. Also, if buddy and bitmap land in the same folio, we get that folio’s ref instead of looking it up again before updating the buddy. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/mballoc.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6070d3c86678..3494c6fe5bfb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1642,17 +1642,15 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { - int blocks_per_page; int block; int pnum; - int poff; struct folio *folio; int ret; struct ext4_group_info *grp; @@ -1662,7 +1660,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, might_sleep(); mb_debug(sb, "load group %u\n", group); - blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; @@ -1690,8 +1687,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, * So for each group we need two blocks. */ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_P(inode, block); /* Avoid locking the folio in the fast path ... 
*/ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); @@ -1723,7 +1719,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + - (poff * sb->s_blocksize)); + offset_in_folio(folio, + EXT4_LBLK_TO_B(inode, block))); } folio_unlock(folio); } @@ -1739,12 +1736,18 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_P(inode, block); + /* buddy and bitmap are on the same folio? */ + if (folio_contains(folio, pnum)) { + folio_get(folio); + goto update_buddy; + } + /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) @@ -1779,9 +1782,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } +update_buddy: /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; - e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_buddy = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; -- 2.46.1 From: Baokun Li Currently, ext4_mb_get_buddy_page_lock() uses blocks_per_page to calculate folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. To support BS > PS, use bytes to compute folio index and offset within folio to get rid of blocks_per_page. Also, since ext4_mb_get_buddy_page_lock() already fully supports folio, rename it to ext4_mb_get_buddy_folio_lock(). Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/mballoc.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3494c6fe5bfb..d42d768a705a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1510,50 +1510,52 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) } /* - * Lock the buddy and bitmap pages. This make sure other parallel init_group - * on the same buddy page doesn't happen whild holding the buddy page lock. - * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. + * Lock the buddy and bitmap folios. This makes sure other parallel init_group + * on the same buddy folio doesn't happen while holding the buddy folio lock. + * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap + * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. */ -static int ext4_mb_get_buddy_page_lock(struct super_block *sb, +static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; - int block, pnum, poff; - int blocks_per_page; + int block, pnum; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; - blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks.
*/ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; + pnum = EXT4_LBLK_TO_P(inode, block); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_bitmap_folio = folio; - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); - if (blocks_per_page >= 2) { - /* buddy and bitmap are on the same page */ + block++; + pnum = EXT4_LBLK_TO_P(inode, block); + if (folio_contains(folio, pnum)) { + /* buddy and bitmap are on the same folio */ return 0; } - /* blocks_per_page == 1, hence we need another page for the buddy */ - folio = __filemap_get_folio(inode->i_mapping, block + 1, + /* we need another folio for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_buddy_folio = folio; return 0; } @@ -1592,14 +1594,14 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) /* * This ensures that we don't reinit the buddy cache - * page which map to the group from which we are already + * folio which maps to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have pinned buddy page to page cache. - * The call to ext4_mb_get_buddy_page_lock will mark the - * page accessed. + * would have pinned buddy folio to page cache. + * The call to ext4_mb_get_buddy_folio_lock will mark the + * folio accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); + ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group -- 2.46.1 From: Baokun Li Currently, ext4_mb_init_cache() uses blocks_per_page to calculate the folio index and offset. However, when blocksize is larger than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential division-by-zero bug. Since we now have the folio, we know its exact size. This allows us to convert {blocks, groups}_per_page to {blocks, groups}_per_folio, thus supporting block sizes greater than page size. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/mballoc.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d42d768a705a..31f4c7d65eb4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1329,26 +1329,25 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * block bitmap and buddy information. The information are * stored in the inode as * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. - * So for each group we take up 2 blocks. A page can - * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. - * So it can have information regarding groups_per_page which - * is blocks_per_page/2 + * So for each group we take up 2 blocks. A folio can + * contain blocks_per_folio (folio_size / blocksize) blocks.
+ * So it can have information regarding groups_per_folio which + * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups - * for this page; do not hold this lock when calling this routine! + * for this folio; do not hold this lock when calling this routine! */ - static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; - int blocks_per_page; - int groups_per_page; + int blocks_per_folio; + int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; @@ -1365,27 +1364,24 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); - blocks_per_page = PAGE_SIZE / blocksize; + blocks_per_folio = folio_size(folio) / blocksize; + WARN_ON_ONCE(!blocks_per_folio); + groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); mb_debug(sb, "init folio %lu\n", folio->index); - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* allocate buffer_heads to read bitmaps */ - if (groups_per_page > 1) { - i = sizeof(struct buffer_head *) * groups_per_page; + if (groups_per_folio > 1) { + i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; - first_group = folio->index * blocks_per_page / 2; - /* read all groups the folio covers into the cache */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + first_group = EXT4_P_TO_LBLK(inode, folio->index) / 2; + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; @@ -1393,7 +1389,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) if (!grinfo) continue; /* - * If page is uptodate then we came here after online resize + * If folio is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. @@ -1413,7 +1409,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) } /* wait for I/O completion */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) @@ -1423,8 +1419,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) err = err2; } - first_block = folio->index * blocks_per_page; - for (i = 0; i < blocks_per_page; i++) { + first_block = EXT4_P_TO_LBLK(inode, folio->index); + for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1501,7 +1497,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) out: if (bh) { - for (i = 0; i < groups_per_page; i++) + for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); -- 2.46.1 From: Baokun Li We use EXT4_BAD_INO for the buddy cache inode number. This inode is not accessed via __ext4_new_inode() or __ext4_iget(), meaning ext4_set_inode_mapping_order() is not called to set its folio order range. However, future block size greater than page size support requires this inode to support large folios, and the buddy cache code already handles BS > PS. Therefore, ext4_set_inode_mapping_order() is now explicitly called for this specific inode to set its folio order range. 
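For reference, the order range propagated here ultimately comes from sbi->s_min_folio_order = get_order(blocksize), set in the earlier ext4_load_super() patch. A userspace sketch of that mapping, where demo_get_order() is a simplified stand-in for the kernel's get_order() and 4K pages are assumed:

#include <stdio.h>

#define DEMO_PAGE_SHIFT	12	/* assume 4K pages */

/* Simplified stand-in for the kernel's get_order(). */
static int demo_get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> DEMO_PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long bs;

	/* Minimum folio order for a range of block sizes on 4K pages. */
	for (bs = 1024; bs <= 65536; bs <<= 1)
		printf("blocksize %6lu -> min folio order %d\n",
		       bs, demo_get_order(bs));
	return 0;
}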
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 31f4c7d65eb4..155c43ff2bc2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3493,6 +3493,8 @@ static int ext4_mb_init_backend(struct super_block *sb) * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + ext4_set_inode_mapping_order(sbi->s_buddy_cache); + for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); -- 2.46.1 From: Zhihao Cheng The ext4 multi-block allocator now fully supports folio objects. Update all variable names, function names, and comments to replace legacy 'page' terminology with 'folio', improving clarity and consistency. No functional changes. Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/mballoc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 155c43ff2bc2..cf07d1067f5f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -98,14 +98,14 @@ * block bitmap and buddy information. The information are stored in the * inode as: * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we - * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / - * blocksize) blocks. So it can have information regarding groups_per_page - * which is blocks_per_page/2 + * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / + * blocksize) blocks. So it can have information regarding groups_per_folio + * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. @@ -1556,7 +1556,7 @@ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, return 0; } -static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); @@ -1570,7 +1570,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack @@ -1618,7 +1618,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in - * the same page we don't need to force + * the same folio we don't need to force * init the buddy */ ret = 0; @@ -1634,7 +1634,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } err: - ext4_mb_put_buddy_page_lock(&e4b); + ext4_mb_put_buddy_folio_lock(&e4b); return ret; } @@ -2227,7 +2227,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_buddy = ret >> 16; /* - * take the page reference. We want the page to be pinned + * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. 
The reference is dropped @@ -2933,7 +2933,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac, if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; - /* This now checks without needing the buddy page */ + /* This now checks without needing the buddy folio */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) @@ -4725,7 +4725,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the - * pages in the ext4_allocation_context so + * folios in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; -- 2.46.1 From: Baokun Li Use the EXT4_P_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/readpage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f329daf6e5c7..8c8ec9d60b90 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -213,9 +213,8 @@ int ext4_mpage_readpages(struct inode *inode, { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; @@ -251,9 +250,8 @@ int ext4_mpage_readpages(struct inode *inode, blocks_per_folio = folio_size(folio) >> blkbits; first_hole = blocks_per_folio; - block_in_file = next_block = - (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + nr_pages * blocks_per_page; + block_in_file = next_block = EXT4_P_TO_LBLK(inode, folio->index); + last_block = EXT4_P_TO_LBLK(inode, folio->index + nr_pages); last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) -- 2.46.1 From: Baokun Li Use the EXT4_P_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE.
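Both directions of the byte-based conversion can be sketched in userspace as follows; 4K pages are assumed, and the helpers mirror EXT4_P_TO_LBLK()/EXT4_LBLK_TO_P() but are local to the demo:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT	12	/* assume 4K pages */

static uint64_t p_to_lblk(uint64_t pindex, unsigned int blkbits)
{
	/* page index -> byte offset -> logical block */
	return (pindex << DEMO_PAGE_SHIFT) >> blkbits;
}

static uint64_t lblk_to_p(uint64_t lblk, unsigned int blkbits)
{
	/* logical block -> byte offset -> page index */
	return (lblk << blkbits) >> DEMO_PAGE_SHIFT;
}

int main(void)
{
	/* bs = 1K (BS < PS): four blocks per page */
	printf("bs=1K:  page 3 starts at block %llu\n",
	       (unsigned long long)p_to_lblk(3, 10));
	/* bs = 64K (BS > PS): one block spans 16 pages */
	printf("bs=64K: page 3 lies in block %llu, block 2 starts at page %llu\n",
	       (unsigned long long)p_to_lblk(3, 16),
	       (unsigned long long)lblk_to_p(2, 16));
	return 0;
}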
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 73c1da90b604..d97ce88d6e0a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1162,8 +1162,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, unsigned block_start, block_end; sector_t block; int err = 0; - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned bbits; + unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; @@ -1172,12 +1171,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); + WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); - bbits = ilog2(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + block = EXT4_P_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { -- 2.46.1 From: Baokun Li Use the EXT4_P_TO_LBLK/EXT4_LBLK_TO_P macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d97ce88d6e0a..cbf04b473ae7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2289,15 +2289,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; - start = mpd->map.m_lblk >> bpp_bits; - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + start = EXT4_LBLK_TO_P(inode, mpd->map.m_lblk); + end = EXT4_LBLK_TO_P(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2308,7 +2307,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - lblk = folio->index << bpp_bits; + lblk = EXT4_P_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* -- 2.46.1 From: Baokun Li Use the EXT4_P_TO_LBLK/EXT4_LBLK_TO_P macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. 
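As a worked example of this kind of conversion, the folio index window spanned by a mapped extent (the start/end computation in the previous patch) behaves as follows in both regimes; a userspace sketch with 4K pages assumed and arbitrary example extents:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT	12	/* assume 4K pages */

static uint64_t lblk_to_p(uint64_t lblk, unsigned int blkbits)
{
	return (lblk << blkbits) >> DEMO_PAGE_SHIFT;
}

static void extent_window(uint64_t m_lblk, uint64_t m_len, unsigned int blkbits)
{
	/* Same shape as the start/end computation above. */
	uint64_t start = lblk_to_p(m_lblk, blkbits);
	uint64_t end = lblk_to_p(m_lblk + m_len - 1, blkbits);

	printf("blkbits=%u: extent [%llu,+%llu) -> pages %llu..%llu\n",
	       blkbits, (unsigned long long)m_lblk, (unsigned long long)m_len,
	       (unsigned long long)start, (unsigned long long)end);
}

int main(void)
{
	extent_window(4, 8, 10);	/* 1K blocks: several blocks per page */
	extent_window(4, 8, 16);	/* 64K blocks: each block spans 16 pages */
	return 0;
}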
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbf04b473ae7..ce48cc6780a3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2610,7 +2610,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; - int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; @@ -2649,7 +2648,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + EXT4_LBLK_TO_P(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ @@ -2727,8 +2726,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)folio->index) << - (PAGE_SHIFT - blkbits); + lblk = EXT4_P_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); -- 2.46.1 From: Zhihao Cheng Use the EXT4_P_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce48cc6780a3..b3fa29923a1d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4066,7 +4066,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = EXT4_P_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); if (!bh) -- 2.46.1 From: Zhihao Cheng There are several places assuming that block size <= PAGE_SIZE, modify them to support large block size (bs > ps). 
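The key point is that blocks_per_page must not be derived by a shift that rounds to zero; a userspace sketch of the guarded computation used in the patch (4K pages assumed; the old formula is shown alongside for comparison):

#include <stdio.h>

#define DEMO_PAGE_SHIFT	12	/* assume 4K pages */
#define DEMO_PAGE_SIZE	(1UL << DEMO_PAGE_SHIFT)

int main(void)
{
	unsigned int blkbits;

	for (blkbits = 10; blkbits <= 16; blkbits += 2) {
		unsigned long blocksize = 1UL << blkbits;
		/* As in the patch: default to one block per page unit ... */
		unsigned long blocks_per_page = 1;

		/* ... and only divide when the block is smaller than a page. */
		if (blocksize < DEMO_PAGE_SIZE)
			blocks_per_page = DEMO_PAGE_SIZE >> blkbits;

		printf("bs=%6lu: blocks_per_page=%lu (old formula gives %lu)\n",
		       blocksize, blocks_per_page,
		       DEMO_PAGE_SIZE >> blkbits);
	}
	return 0;
}

For 16K and 64K blocks the old formula yields 0, which is exactly the division-by-zero hazard described in the earlier mballoc patches.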
Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li --- fs/ext4/move_extent.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4b091c21908f..cb55cd9e7eeb 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -270,7 +270,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int i, err2, jblocks, retries = 0; int replaced_count = 0; int from; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -288,11 +287,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, return 0; } - orig_blk_offset = orig_page_offset * blocks_per_page + - data_offset_in_page; + orig_blk_offset = EXT4_P_TO_LBLK(orig_inode, orig_page_offset) + + data_offset_in_page; - donor_blk_offset = donor_page_offset * blocks_per_page + - data_offset_in_page; + donor_blk_offset = EXT4_P_TO_LBLK(donor_inode, donor_page_offset) + + data_offset_in_page; /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == @@ -565,7 +564,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; + int blocks_per_page = 1; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; @@ -608,6 +607,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (i_blocksize(orig_inode) < PAGE_SIZE) + blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); @@ -665,10 +667,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, if (o_end - o_start < cur_len) cur_len = o_end - o_start; - orig_page_index = o_start >> (PAGE_SHIFT - - orig_inode->i_blkbits); - donor_page_index = d_start >> (PAGE_SHIFT - - donor_inode->i_blkbits); + orig_page_index = EXT4_LBLK_TO_P(orig_inode, o_start); + donor_page_index = EXT4_LBLK_TO_P(donor_inode, d_start); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; -- 2.46.1 From: Baokun Li In __alloc_pages_slowpath(), allocating page units greater than order-1 with the __GFP_NOFAIL flag may trigger an unexpected WARN_ON. To avoid this, handle the case separately in grow_dev_folio(). This ensures that buffer_head-based filesystems will not encounter the warning when using __GFP_NOFAIL to read metadata after BS > PS support is enabled. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/buffer.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbed..2f5a7dd199b2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1031,6 +1031,35 @@ static sector_t folio_init_buffers(struct folio *folio, return end_block; } +static struct folio *blkdev_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp) +{ + struct folio *folio; + unsigned int min_order = mapping_min_folio_order(mapping); + + /* + * Allocating page units greater than order-1 with __GFP_NOFAIL in + * __alloc_pages_slowpath() can trigger an unexpected WARN_ON. + * Handle this case separately to suppress the warning. 
+ */ + if (min_order <= 1) + return __filemap_get_folio(mapping, index, fgp_flags, gfp); + + while (1) { + folio = __filemap_get_folio(mapping, index, fgp_flags, + gfp & ~__GFP_NOFAIL); + if (!IS_ERR(folio) || !(gfp & __GFP_NOFAIL)) + return folio; + + if (PTR_ERR(folio) != -ENOMEM && PTR_ERR(folio) != -EAGAIN) + return folio; + + memalloc_retry_wait(gfp); + } + + return folio; +} + /* * Create the page-cache folio that contains the requested block. * @@ -1047,8 +1076,8 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block = 0; - folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + folio = blkdev_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return false; -- 2.46.1 From: Baokun Li In __alloc_pages_slowpath(), allocating page units larger than order-1 with __GFP_NOFAIL may trigger an unexpected WARN_ON. To prevent this, handle the case explicitly in jbd2_alloc(), ensuring that the warning does not occur after enabling BS > PS support. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/jbd2/journal.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d480b94117cd..9185f9e2b201 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2761,14 +2761,36 @@ static struct kmem_cache *get_slab(size_t size) void *jbd2_alloc(size_t size, gfp_t flags) { void *ptr; + int order; BUG_ON(size & (size-1)); /* Must be a power of 2 */ - if (size < PAGE_SIZE) + if (size < PAGE_SIZE) { ptr = kmem_cache_alloc(get_slab(size), flags); - else - ptr = (void *)__get_free_pages(flags, get_order(size)); + goto out; + } + + /* + * Allocating page units greater than order-1 with __GFP_NOFAIL in + * __alloc_pages_slowpath() can trigger an unexpected WARN_ON. + * Handle this case separately to suppress the warning. + */ + order = get_order(size); + if (order <= 1) { + ptr = (void *)__get_free_pages(flags, order); + goto out; + } + while (1) { + ptr = (void *)__get_free_pages(flags & ~__GFP_NOFAIL, order); + if (ptr) + break; + if (!(flags & __GFP_NOFAIL)) + break; + memalloc_retry_wait(flags); + } + +out: /* Check alignment; SLUB has gotten this wrong in the past, * and this can lead to user data corruption! */ BUG_ON(((unsigned long) ptr) & (size-1)); -- 2.46.1 From: Baokun Li Supporting a block size greater than the page size (BS > PS) requires support for large folios. However, several features (e.g., verity, encrypt) and mount options (e.g., data=journal) do not yet support large folios. To prevent conflicts, this patch adds checks at mount time to prohibit these features and options from being used when BS > PS. Since the data mode cannot be changed on remount, there is no need to check on remount. A new mount flag, EXT4_MF_LARGE_FOLIO, is introduced. This flag is set after the checks pass, indicating that the filesystem has no features or mount options incompatible with large folios. Subsequent checks can simply test for this flag to avoid redundant verifications. 
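To illustrate the intended flow, here is a condensed userspace model of the gating described above; the struct fields and helper names are local stand-ins that mirror the patch's logic, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct demo_sb {
	bool data_journal;	/* data=journal mount option */
	bool verity;		/* verity feature */
	bool encrypt;		/* encrypt feature */
	unsigned long blocksize;
	unsigned long page_size;
	bool mf_large_folio;	/* models EXT4_MF_LARGE_FOLIO */
};

/* Mount time: set the flag once, or refuse BS > PS outright. */
static int demo_check_large_folio(struct demo_sb *sb)
{
	const char *why = NULL;

	if (sb->data_journal)
		why = "data=journal";
	else if (sb->verity)
		why = "verity";
	else if (sb->encrypt)
		why = "encrypt";

	if (!why)
		sb->mf_large_folio = true;
	else if (sb->blocksize > sb->page_size)
		return -1;	/* -EINVAL in the patch */
	return 0;
}

/* Inode time: later checks only need to test the flag. */
static bool demo_should_enable_large_folio(const struct demo_sb *sb)
{
	return sb->mf_large_folio;
}

int main(void)
{
	struct demo_sb sb = { .verity = true, .blocksize = 65536,
			      .page_size = 4096 };

	printf("mount: %s\n", demo_check_large_folio(&sb) ? "rejected" : "ok");
	printf("large folio: %d\n", demo_should_enable_large_folio(&sb));
	return 0;
}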
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 10 ++++------ fs/ext4/super.c | 26 ++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8223ed29b343..f1163deb0812 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1859,7 +1859,8 @@ static inline int ext4_get_resgid(struct ext4_super_block *es) enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ + EXT4_MF_JOURNAL_DESTROY,/* Journal is in process of destroying */ + EXT4_MF_LARGE_FOLIO, /* large folio is supported */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3fa29923a1d..04f9380d4211 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5143,14 +5143,12 @@ static bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; - if (!S_ISREG(inode->i_mode)) - return false; - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + if (!ext4_test_mount_flag(sb, EXT4_MF_LARGE_FOLIO)) return false; - if (ext4_has_feature_verity(sb)) + + if (!S_ISREG(inode->i_mode)) return false; - if (ext4_has_feature_encrypt(sb)) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return false; return true; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7338c708ea1d..fdc006a973aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5034,6 +5034,28 @@ static const char *ext4_has_journal_option(struct super_block *sb) return NULL; } +static int ext4_check_large_folio(struct super_block *sb) +{ + const char *err_str = NULL; + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + err_str = "data=journal"; + else if (ext4_has_feature_verity(sb)) + err_str = "verity"; + else if (ext4_has_feature_encrypt(sb)) + err_str = "encrypt"; + + if (!err_str) { + ext4_set_mount_flag(sb, EXT4_MF_LARGE_FOLIO); + } else if (sb->s_blocksize > PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s", + sb->s_blocksize, PAGE_SIZE, err_str); + return -EINVAL; + } + + return 0; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5310,6 +5332,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); + err = ext4_check_large_folio(sb); + if (err < 0) + goto failed_mount; + err = ext4_encoding_init(sb, es); if (err) goto failed_mount; -- 2.46.1 From: Baokun Li Since the block device (see commit 3c20917120ce ("block/bdev: enable large folio support for large logical block sizes")) and the page cache (see commit ab95d23bab220ef8 ("filemap: allocate mapping_min_order folios in the page cache")) have the ability to use a minimum order when allocating folios, and ext4 has supported large folios since commit 7ac67301e82f ("ext4: enable large folio for regular file"), now add support for block_size > PAGE_SIZE in ext4. set_blocksize() -> bdev_validate_blocksize() already validates the block size, so ext4_load_super() does not need to perform additional checks. Here we only need to enable large folios by default when s_min_folio_order is greater than 0 and add the FS_LBS bit to fs_flags. In addition, mark this feature as experimental.
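For context, a userspace model of what such a block size validity check looks like; the 512..64K bounds here are an illustrative assumption only, as the authoritative checks live in bdev_validate_blocksize():

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: power of two within an assumed 512..64K range. */
static bool demo_blocksize_valid(unsigned long bs)
{
	return bs >= 512 && bs <= 65536 && (bs & (bs - 1)) == 0;
}

int main(void)
{
	unsigned long sizes[] = { 512, 4096, 65536, 3000, 131072 };

	for (int i = 0; i < 5; i++)
		printf("bs=%6lu valid=%d\n", sizes[i],
		       demo_blocksize_valid(sizes[i]));
	return 0;
}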
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi --- fs/ext4/inode.c | 3 +++ fs/ext4/super.c | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 04f9380d4211..ba6cf05860ae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,6 +5146,9 @@ static bool ext4_should_enable_large_folio(struct inode *inode) if (!ext4_test_mount_flag(sb, EXT4_MF_LARGE_FOLIO)) return false; + if (EXT4_SB(sb)->s_min_folio_order) + return true; + if (!S_ISREG(inode->i_mode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fdc006a973aa..4c0bd79bdf68 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5053,6 +5053,9 @@ static int ext4_check_large_folio(struct super_block *sb) return -EINVAL; } + if (sb->s_blocksize > PAGE_SIZE) + ext4_msg(sb, KERN_NOTICE, "EXPERIMENTAL bs(%lu) > ps(%lu) enabled.", + sb->s_blocksize, PAGE_SIZE); return 0; } @@ -7432,7 +7435,8 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("ext4"); -- 2.46.1
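As a closing illustration, the rec_len encoding made unconditional earlier in this series can be exercised round-trip in userspace. The helpers below mirror ext4_rec_len_to_disk()/ext4_rec_len_from_disk() with demo-local names; the blocksize == 65536 branch, elided by the hunk boundary in that patch, follows the upstream helper:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define DEMO_MAX_REC_LEN ((1 << 16) - 1)	/* EXT4_MAX_REC_LEN */

static uint16_t rec_len_to_disk(unsigned int len, unsigned int blocksize)
{
	/* Mirrors the BUG_ON() in ext4_rec_len_to_disk(). */
	assert(len <= blocksize && blocksize <= (1 << 18) && !(len & 3));
	if (len < 65536)
		return len;
	if (len == blocksize)
		return blocksize == 65536 ? DEMO_MAX_REC_LEN : 0;
	/* Fold bits 16..17 into the low two bits of the stored value. */
	return (len & 65532) | ((len >> 16) & 3);
}

static unsigned int rec_len_from_disk(uint16_t dlen, unsigned int blocksize)
{
	unsigned int len = dlen;

	if (len == DEMO_MAX_REC_LEN || len == 0)
		return blocksize;
	return (len & 65532) | ((len & 3) << 16);
}

int main(void)
{
	unsigned int blocksize = 1 << 17;	/* 128K block, BS > PS */
	unsigned int lens[] = { 12, 65532, 65536, 100000, 1 << 17 };

	for (int i = 0; i < 5; i++) {
		unsigned int len = lens[i] & ~3U;	/* keep 4-byte aligned */
		uint16_t disk = rec_len_to_disk(len, blocksize);

		printf("len %6u -> disk 0x%04x -> len %6u\n",
		       len, (unsigned)disk,
		       rec_len_from_disk(disk, blocksize));
	}
	return 0;
}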