From: Zhihao Cheng

For bs <= ps scenarios, calculating the offset within the block is
sufficient. For bs > ps, an initial page offset calculation can lead
to incorrect behavior. Thus, this redundant calculation has been
removed.

Signed-off-by: Zhihao Cheng
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 783c883d4d5e..d027441a95a9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4165,9 +4165,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	struct inode *inode = mapping->host;
-	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize = inode->i_sb->s_blocksize;
-	unsigned max = blocksize - (offset & (blocksize - 1));
+	unsigned int max = blocksize - (from & (blocksize - 1));
 
 	/*
 	 * correct length if it does not fall between
--
2.46.1

From: Baokun Li

For bs <= ps scenarios, calculating the offset within the block is
sufficient. For bs > ps, an initial page offset calculation can lead
to incorrect behavior. Thus, this redundant calculation has been
removed.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d027441a95a9..f7ca48729738 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4191,7 +4191,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
 static int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned length;
 	unsigned blocksize;
 	struct inode *inode = mapping->host;
@@ -4200,8 +4199,8 @@ static int ext4_block_truncate_page(handle_t *handle,
 	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
 		return 0;
 
-	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
+	blocksize = i_blocksize(inode);
+	length = blocksize - (from & (blocksize - 1));
 
 	return ext4_block_zero_page_range(handle, mapping, from, length);
 }
--
2.46.1

From: Baokun Li

Previously, ext4_rec_len_(to|from)_disk only performed complex rec_len
conversions when PAGE_SIZE >= 65536 to reduce complexity. However, we
are soon to support file system block sizes greater than page size,
which makes these conditional checks unnecessary. Thus, these checks
are now removed.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/ext4.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9f127aedbaee..3d18e6bf43cf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2475,28 +2475,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len,
 	return (rec_len & ~EXT4_DIR_ROUND);
 }
 
-/*
- * If we ever get support for fs block sizes > page_size, we'll need
- * to remove the #if statements in the next two functions...
- */
 static inline unsigned int
 ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 {
 	unsigned len = le16_to_cpu(dlen);
 
-#if (PAGE_SIZE >= 65536)
 	if (len == EXT4_MAX_REC_LEN || len == 0)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
-#else
-	return len;
-#endif
 }
 
 static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
-#if (PAGE_SIZE >= 65536)
 	if (len < 65536)
 		return cpu_to_le16(len);
 	if (len == blocksize) {
@@ -2506,9 +2497,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 		return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-#else
-	return cpu_to_le16(len);
-#endif
 }
 
 /*
--
2.46.1
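To make the rec_len encoding above concrete, here is a small user-space
sketch (not part of the series; __le16 byte order is ignored and kernel
types are simplified) showing how the now-unconditional conversion
round-trips record lengths of 64KiB and above by folding bits 16..17
into the low two bits of the on-disk field:

#include <assert.h>

#define EXT4_MAX_REC_LEN ((1 << 16) - 1)

static unsigned int rec_len_from_disk(unsigned short dlen, unsigned blocksize)
{
	unsigned int len = dlen;

	if (len == EXT4_MAX_REC_LEN || len == 0)
		return blocksize;
	/* low 2 bits carry bits 16..17 of the real length */
	return (len & 65532) | ((len & 3) << 16);
}

static unsigned short rec_len_to_disk(unsigned int len, unsigned blocksize)
{
	if (len < 65536)
		return (unsigned short)len;
	if (len == blocksize)
		return (blocksize == 65536) ? EXT4_MAX_REC_LEN : 0;
	return (unsigned short)((len & 65532) | ((len >> 16) & 3));
}

int main(void)
{
	/* a 65536-byte record in a 65536-byte block encodes as MAX_REC_LEN */
	assert(rec_len_to_disk(65536, 65536) == EXT4_MAX_REC_LEN);
	assert(rec_len_from_disk(EXT4_MAX_REC_LEN, 65536) == 65536);
	/* a block-sized record in a 128K block encodes as 0 */
	assert(rec_len_to_disk(131072, 131072) == 0);
	assert(rec_len_from_disk(0, 131072) == 131072);
	/* 70000 = 0x11170: bit 16 folds into the low bits and back */
	assert(rec_len_from_disk(rec_len_to_disk(70000, 131072), 131072) == 70000);
	return 0;
}

The special-case values (MAX_REC_LEN for a 64K block-sized record, 0 for
larger block-sized records) are why the decode side must check them
before applying the bit-folding.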
From: Baokun Li

When preparing for bs > ps support, clean up unnecessary PAGE_SIZE
references in ext4_punch_hole(). Previously, when a hole extended
beyond i_size, we aligned the hole end upwards to PAGE_SIZE to handle
partial folio invalidation. Now that truncate_inode_pages_range()
already handles partial folio invalidation correctly, this alignment
is no longer required. However, to save pointless tail block zeroing,
we still keep rounding up to the block size here.

In addition, as Honza pointed out, when the hole end equals i_size, it
should also be rounded up to the block size. This patch fixes that as
well.

Suggested-by: Jan Kara
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
---
 fs/ext4/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f7ca48729738..6fec3aa2268a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4406,10 +4406,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 
 	/*
 	 * If the hole extends beyond i_size, set the hole to end after
-	 * the page that contains i_size.
+	 * the block that contains i_size to save pointless tail block zeroing.
 	 */
-	if (end > inode->i_size)
-		end = round_up(inode->i_size, PAGE_SIZE);
+	if (end >= inode->i_size)
+		end = round_up(inode->i_size, sb->s_blocksize);
 	if (end > max_end)
 		end = max_end;
 	length = end - offset;
--
2.46.1

From: Baokun Li

The dioread_nolock code paths already support large folios, so
dioread_nolock is now enabled by default regardless of whether the
blocksize is less than, equal to, or greater than PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 760c9d7588be..a9fa824487f9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4390,8 +4390,7 @@ static void ext4_set_def_opts(struct super_block *sb,
 	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
 		set_opt(sb, DELALLOC);
 
-	if (sb->s_blocksize <= PAGE_SIZE)
-		set_opt(sb, DIOREAD_NOLOCK);
+	set_opt(sb, DIOREAD_NOLOCK);
 }
 
 static int ext4_handle_clustersize(struct super_block *sb)
--
2.46.1
From: Baokun Li

This commit introduces the s_min_folio_order field to the ext4_sb_info
structure. This field will store the minimum folio order required by
the current filesystem, laying the groundwork for future support of
block sizes greater than PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/ext4.h  |  3 +++
 fs/ext4/inode.c |  3 ++-
 fs/ext4/super.c | 10 +++++-----
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3d18e6bf43cf..6fe8cc3bf9a5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1688,6 +1688,9 @@ struct ext4_sb_info {
 	/* record the last minlen when FITRIM is called. */
 	unsigned long s_last_trim_minblks;
 
+	/* minimum folio order of a page cache allocation */
+	unsigned int s_min_folio_order;
+
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_csum_seed;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6fec3aa2268a..9faa0cf77075 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5181,7 +5181,8 @@ void ext4_set_inode_mapping_order(struct inode *inode)
 	if (!ext4_should_enable_large_folio(inode))
 		return;
 
-	mapping_set_folio_order_range(inode->i_mapping, 0,
+	mapping_set_folio_order_range(inode->i_mapping,
+				      EXT4_SB(inode->i_sb)->s_min_folio_order,
 				      EXT4_MAX_PAGECACHE_ORDER(inode));
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9fa824487f9..a6314a3de51d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5107,11 +5107,8 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
 	 * If the default block size is not the same as the real block size,
 	 * we need to reload it.
 	 */
-	if (sb->s_blocksize == blocksize) {
-		*lsb = logical_sb_block;
-		sbi->s_sbh = bh;
-		return 0;
-	}
+	if (sb->s_blocksize == blocksize)
+		goto success;
 
 	/*
 	 * bh must be released before kill_bdev(), otherwise
@@ -5142,6 +5139,9 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
 		ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
 		goto out;
 	}
+
+success:
+	sbi->s_min_folio_order = get_order(blocksize);
 	*lsb = logical_sb_block;
 	sbi->s_sbh = bh;
 	return 0;
--
2.46.1

From: Baokun Li

ext4_calculate_overhead() used a single page for its bitmap buffer,
which worked fine when PAGE_SIZE >= block size. However, with block
size greater than page size (BS > PS) support, the bitmap can exceed
a single page. To address this, we now use kvmalloc() to allocate
memory of the filesystem block size, to properly support BS > PS.

Suggested-by: Jan Kara
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
---
 fs/ext4/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6314a3de51d..0d32370a459a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4189,7 +4189,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
 	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 	ext4_fsblk_t overhead = 0;
-	char *buf = (char *) get_zeroed_page(GFP_NOFS);
+	char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO);
 
 	if (!buf)
 		return -ENOMEM;
@@ -4214,7 +4214,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 		blks = count_overhead(sb, i, buf);
 		overhead += blks;
 		if (blks)
-			memset(buf, 0, PAGE_SIZE);
+			memset(buf, 0, sb->s_blocksize);
 		cond_resched();
 	}
 
@@ -4237,7 +4237,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 	}
 	sbi->s_overhead = overhead;
 	smp_wmb();
-	free_page((unsigned long) buf);
+	kvfree(buf);
 	return 0;
 }
 
--
2.46.1
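The s_min_folio_order value set two patches above is simply
get_order(blocksize): 0 whenever bs <= ps, and log2(bs/ps) otherwise.
A small user-space model (not from the series; assumes 4K pages and
reimplements get_order() for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long bs;

	for (bs = 1024; bs <= 65536; bs <<= 1)
		printf("blocksize %6lu -> min folio order %u (folio >= %lu bytes)\n",
		       bs, get_order(bs), PAGE_SIZE << get_order(bs));
	return 0;
}

For a 64K block size on 4K pages this prints order 4, i.e. every folio
in such a mapping must span at least 16 pages — one full block.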
From: Baokun Li

In ext4_readdir(), page_cache_sync_readahead() is used to readahead
mapped physical blocks. With LBS support, this can lead to a negative
right shift.

To fix this, the page index is now calculated by first converting the
physical block number (pblk) to a file position (pos) before converting
it to a page index. Also, the correct number of pages to readahead is
now passed.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/dir.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d4164c507a90..256fe2c1d4c1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			continue;
 		}
 		if (err > 0) {
-			pgoff_t index = map.m_pblk >>
-					(PAGE_SHIFT - inode->i_blkbits);
+			pgoff_t index = map.m_pblk << inode->i_blkbits >>
+							PAGE_SHIFT;
 			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_mapping,
-					&file->f_ra, file,
-					index, 1);
+					&file->f_ra, file, index,
+					1 << EXT4_SB(sb)->s_min_folio_order);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
 			if (IS_ERR(bh)) {
--
2.46.1
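The hazard the patch above removes is easy to see in isolation. A
user-space sketch (not from the series; 4K pages assumed, pblk value
hypothetical) contrasting the old shift with the byte-based conversion:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long pblk = 3;	/* hypothetical physical block */
	unsigned int blkbits = 16;	/* 64K blocks, i.e. bs > ps */

	/* old formula: shift count would be 12 - 16 == -4, undefined:
	 *   pgoff_t index = pblk >> (PAGE_SHIFT - blkbits);
	 */

	/* new formula: block -> byte position -> page index */
	unsigned long long pos = pblk << blkbits;
	unsigned long index = pos >> PAGE_SHIFT;

	printf("pblk %llu, bs=64K -> byte pos %llu -> page index %lu\n",
	       pblk, pos, index);
	return 0;
}

For bs <= ps both formulas agree; going through bytes simply makes the
bs > ps case well-defined as well.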
From: Baokun Li

Introduce the EXT4_LBLK_TO_B() helper to convert a logical block number
to a byte offset and use it to replace the open-coded shifts.
No functional changes.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/extents.c |  2 +-
 fs/ext4/inode.c   | 20 +++++++++-----------
 fs/ext4/namei.c   |  8 +++-----
 fs/ext4/verity.c  |  2 +-
 5 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6fe8cc3bf9a5..c00ce6db69f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -368,6 +368,7 @@ struct ext4_io_submit {
 				       blkbits))
 #define EXT4_B_TO_LBLK(inode, offset) \
 	(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
+#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
 
 /* Translate a block number to a cluster number */
 #define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c7d219e6c6d8..13c3cfeb13bc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4562,7 +4562,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 		 * allow a full retry cycle for any remaining allocations
 		 */
 		retries = 0;
-		epos = (loff_t)(map.m_lblk + ret) << blkbits;
+		epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
 		inode_set_ctime_current(inode);
 		if (new_size) {
 			if (epos > new_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9faa0cf77075..1153a26ff963 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -831,9 +831,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
 		    !ext4_is_quota_file(inode) &&
 		    ext4_should_order_data(inode)) {
-			loff_t start_byte =
-				(loff_t)map->m_lblk << inode->i_blkbits;
-			loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+			loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk);
+			loff_t length = EXT4_LBLK_TO_B(inode, map->m_len);
 
 			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
 				ret = ext4_jbd2_inode_add_wait(handle, inode,
@@ -2233,7 +2232,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
 	ext4_lblk_t lblk = *m_lblk;
 	ext4_fsblk_t pblock = *m_pblk;
 	int err = 0;
-	int blkbits = mpd->inode->i_blkbits;
 	ssize_t io_end_size = 0;
 	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
 
@@ -2259,7 +2257,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
 					err = PTR_ERR(io_end_vec);
 					goto out;
 				}
-				io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+				io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode,
+								    mpd->map.m_lblk);
 			}
 			*map_bh = true;
 			goto out;
@@ -2269,7 +2268,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
 			bh->b_blocknr = pblock++;
 		}
 		clear_buffer_unwritten(bh);
-		io_end_size += (1 << blkbits);
+		io_end_size += i_blocksize(mpd->inode);
 	} while (lblk++, (bh = bh->b_this_page) != head);
 
 	io_end_vec->size += io_end_size;
@@ -2471,7 +2470,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 	io_end_vec = ext4_alloc_io_end_vec(io_end);
 	if (IS_ERR(io_end_vec))
 		return PTR_ERR(io_end_vec);
-	io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
+	io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
 	do {
 		err = mpage_map_one_extent(handle, mpd);
 		if (err < 0) {
@@ -3511,8 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 		iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
 	else
 		iomap->bdev = inode->i_sb->s_bdev;
-	iomap->offset = (u64) map->m_lblk << blkbits;
-	iomap->length = (u64) map->m_len << blkbits;
+	iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
+	iomap->length = EXT4_LBLK_TO_B(inode, map->m_len);
 
 	if ((map->m_flags & EXT4_MAP_MAPPED) &&
 	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3686,7 +3685,6 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
 			    unsigned int flags)
 {
 	handle_t *handle;
-	u8 blkbits = inode->i_blkbits;
 	int ret, dio_credits, m_flags = 0, retries = 0;
 	bool force_commit = false;
 
@@ -3745,7 +3743,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
 	 * i_disksize out to i_size. This could be beyond where direct I/O is
	 * happening and thus expose allocated blocks to direct I/O reads.
 	 */
-	else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+	else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode))
 		m_flags = EXT4_GET_BLOCKS_CREATE;
 	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 045616033515..c4b5e252af0e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1076,7 +1076,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
 		if (ext4_check_dir_entry(dir, NULL, de, bh,
 				bh->b_data, bh->b_size,
-				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+				EXT4_LBLK_TO_B(dir, block)
 					+ ((char *)de - bh->b_data))) {
 			/* silently ignore the rest of the block */
 			break;
@@ -1630,7 +1630,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
 		}
 		set_buffer_verified(bh);
 		i = search_dirblock(bh, dir, fname,
-			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+			    EXT4_LBLK_TO_B(dir, block), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
 			ret = bh;
@@ -1710,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 			struct ext4_filename *fname,
 			struct ext4_dir_entry_2 **res_dir)
 {
-	struct super_block * sb = dir->i_sb;
 	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
@@ -1729,8 +1728,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 			goto errout;
 
 		retval = search_dirblock(bh, dir, fname,
-					 block << EXT4_BLOCK_SIZE_BITS(sb),
-					 res_dir);
+					 EXT4_LBLK_TO_B(dir, block), res_dir);
 		if (retval == 1)
 			goto success;
 		brelse(bh);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index b0acb0c50313..415d9c4d8a32 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -302,7 +302,7 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
 
 	end_lblk = le32_to_cpu(last_extent->ee_block) +
 		   ext4_ext_get_actual_len(last_extent);
-	desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+	desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk);
 	ext4_free_ext_path(path);
 
 	if (desc_size_pos < sizeof(desc_size_disk))
--
2.46.1
From: Baokun Li

As BS > PS support is coming, all block number to page index (and
vice-versa) conversions must now go via bytes. Added EXT4_LBLK_TO_PG()
and EXT4_PG_TO_LBLK() macros to simplify these conversions and handle
both BS <= PS and BS > PS scenarios cleanly.

Suggested-by: Jan Kara
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
---
 fs/ext4/ext4.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c00ce6db69f0..4bc0b2b7288a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -370,6 +370,12 @@ struct ext4_io_submit {
 	(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
 #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
 
+/* Translate a block number to a page index */
+#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \
+				      PAGE_SHIFT)
+/* Translate a page index to a block number */
+#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \
+				      (inode)->i_blkbits)
 /* Translate a block number to a cluster number */
 #define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)
 /* Translate a cluster number to a block number */
--
2.46.1
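Both macros are compositions of two well-defined shifts, so they work
in either regime. A user-space sketch (not from the series; 4K pages
assumed, "blkbits" standing in for inode->i_blkbits) with one example
on each side of the bs/ps boundary:

#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long lblk_to_pg(unsigned long long lblk, unsigned blkbits)
{
	return (lblk << blkbits) >> PAGE_SHIFT;	/* EXT4_LBLK_TO_PG */
}

static unsigned long long pg_to_lblk(unsigned long pnum, unsigned blkbits)
{
	return ((unsigned long long)pnum << PAGE_SHIFT) >> blkbits; /* EXT4_PG_TO_LBLK */
}

int main(void)
{
	/* bs < ps: 1K blocks, four blocks per page */
	printf("bs=1K:  lblk 8 -> page %lu, page 2 -> lblk %llu\n",
	       lblk_to_pg(8, 10), pg_to_lblk(2, 10));
	/* bs > ps: 64K blocks, sixteen pages per block */
	printf("bs=64K: lblk 1 -> page %lu, page 16 -> lblk %llu\n",
	       lblk_to_pg(1, 16), pg_to_lblk(16, 16));
	return 0;
}

With 1K blocks, lblk 8 maps to page 2 and back; with 64K blocks, lblk 1
maps to page 16 and back — no negative shift in either direction.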
From: Baokun Li

Currently, ext4_mb_load_buddy_gfp() uses blocks_per_page to calculate
the folio index and offset. However, when blocksize is larger than
PAGE_SIZE, blocks_per_page becomes zero, leading to a potential
division-by-zero bug. To support BS > PS, use bytes to compute folio
index and offset within folio to get rid of blocks_per_page.

Also, if buddy and bitmap land in the same folio, we get that folio's
ref instead of looking it up again before updating the buddy.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/mballoc.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9087183602e4..143d6ff1fdef 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1642,17 +1642,15 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
 /*
  * Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
  * calling this routine!
  */
 static noinline_for_stack int
 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
		       struct ext4_buddy *e4b, gfp_t gfp)
 {
-	int blocks_per_page;
 	int block;
 	int pnum;
-	int poff;
 	struct folio *folio;
 	int ret;
 	struct ext4_group_info *grp;
@@ -1662,7 +1660,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 	might_sleep();
 	mb_debug(sb, "load group %u\n", group);
 
-	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 	grp = ext4_get_group_info(sb, group);
 	if (!grp)
 		return -EFSCORRUPTED;
@@ -1690,8 +1687,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 	 * So for each group we need two blocks.
 	 */
 	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
+	pnum = EXT4_LBLK_TO_PG(inode, block);
 
 	/* Avoid locking the folio in the fast path ... */
 	folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
@@ -1723,7 +1719,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 				goto err;
 			}
 			mb_cmp_bitmaps(e4b, folio_address(folio) +
-					    (poff * sb->s_blocksize));
+					    offset_in_folio(folio,
+						EXT4_LBLK_TO_B(inode, block)));
 		}
 		folio_unlock(folio);
 	}
@@ -1739,12 +1736,18 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 
 	/* Folios marked accessed already */
 	e4b->bd_bitmap_folio = folio;
-	e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
+	e4b->bd_bitmap = folio_address(folio) +
+			 offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
 
 	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
+	pnum = EXT4_LBLK_TO_PG(inode, block);
+	/* buddy and bitmap are on the same folio? */
+	if (folio_contains(folio, pnum)) {
+		folio_get(folio);
+		goto update_buddy;
+	}
 
+	/* we need another folio for the buddy */
 	folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
 	if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
 		if (!IS_ERR(folio))
@@ -1779,9 +1782,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 		goto err;
 	}
 
+update_buddy:
 	/* Folios marked accessed already */
 	e4b->bd_buddy_folio = folio;
-	e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);
+	e4b->bd_buddy = folio_address(folio) +
+			offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
 
 	return 0;
 
--
2.46.1
From: Baokun Li

Currently, ext4_mb_get_buddy_page_lock() uses blocks_per_page to
calculate folio index and offset. However, when blocksize is larger
than PAGE_SIZE, blocks_per_page becomes zero, leading to a potential
division-by-zero bug. To support BS > PS, use bytes to compute folio
index and offset within folio to get rid of blocks_per_page.

Also, since ext4_mb_get_buddy_page_lock() already fully supports
folios, rename it to ext4_mb_get_buddy_folio_lock().

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/mballoc.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 143d6ff1fdef..b454a41dd6c1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1510,50 +1510,52 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 }
 
 /*
- * Lock the buddy and bitmap pages. This make sure other parallel init_group
- * on the same buddy page doesn't happen whild holding the buddy page lock.
- * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
- * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
+ * Lock the buddy and bitmap folios. This makes sure other parallel init_group
+ * on the same buddy folio doesn't happen while holding the buddy folio lock.
+ * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap
+ * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0.
  */
-static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+static int ext4_mb_get_buddy_folio_lock(struct super_block *sb,
 		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
 {
 	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
-	int block, pnum, poff;
-	int blocks_per_page;
+	int block, pnum;
 	struct folio *folio;
 
 	e4b->bd_buddy_folio = NULL;
 	e4b->bd_bitmap_folio = NULL;
 
-	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 	/*
 	 * the buddy cache inode stores the block bitmap
 	 * and buddy information in consecutive blocks.
 	 * So for each group we need two blocks.
 	 */
 	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
+	pnum = EXT4_LBLK_TO_PG(inode, block);
 	folio = __filemap_get_folio(inode->i_mapping, pnum,
 			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 	BUG_ON(folio->mapping != inode->i_mapping);
+	WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
 	e4b->bd_bitmap_folio = folio;
-	e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
+	e4b->bd_bitmap = folio_address(folio) +
+			 offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
 
-	if (blocks_per_page >= 2) {
-		/* buddy and bitmap are on the same page */
+	block++;
+	pnum = EXT4_LBLK_TO_PG(inode, block);
+	if (folio_contains(folio, pnum)) {
+		/* buddy and bitmap are on the same folio */
 		return 0;
 	}
 
-	/* blocks_per_page == 1, hence we need another page for the buddy */
-	folio = __filemap_get_folio(inode->i_mapping, block + 1,
+	/* we need another folio for the buddy */
+	folio = __filemap_get_folio(inode->i_mapping, pnum,
 			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 	BUG_ON(folio->mapping != inode->i_mapping);
+	WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
 	e4b->bd_buddy_folio = folio;
 	return 0;
 }
@@ -1592,14 +1594,14 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
 
 	/*
 	 * This ensures that we don't reinit the buddy cache
-	 * page which map to the group from which we are already
+	 * folio which map to the group from which we are already
 	 * allocating. If we are looking at the buddy cache we would
 	 * have taken a reference using ext4_mb_load_buddy and that
-	 * would have pinned buddy page to page cache.
-	 * The call to ext4_mb_get_buddy_page_lock will mark the
-	 * page accessed.
+	 * would have pinned buddy folio to page cache.
+	 * The call to ext4_mb_get_buddy_folio_lock will mark the
+	 * folio accessed.
 	 */
-	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
+	ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp);
 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
 		/*
 		 * somebody initialized the group
--
2.46.1
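The buddy-cache layout the two patches above index into is simple: for
group g, the bitmap lives at logical block 2*g and the buddy at
2*g + 1. A user-space sketch (not from the series; 4K pages assumed,
and folios assumed naturally aligned to their size, which holds once
the minimum order is enforced) showing where both cases land:

#include <stdio.h>

#define PAGE_SHIFT 12

static void locate(unsigned group, unsigned blkbits, unsigned long folio_size)
{
	unsigned long bitmap_blk = 2UL * group;
	unsigned long buddy_blk = bitmap_blk + 1;
	unsigned long long bitmap_pos = (unsigned long long)bitmap_blk << blkbits;
	unsigned long long buddy_pos = (unsigned long long)buddy_blk << blkbits;

	printf("group %u, bs=%u: bitmap page %llu off %llu, buddy page %llu off %llu\n",
	       group, 1U << blkbits,
	       bitmap_pos >> PAGE_SHIFT, bitmap_pos % folio_size,
	       buddy_pos >> PAGE_SHIFT, buddy_pos % folio_size);
}

int main(void)
{
	locate(3, 10, 4096);	/* 1K blocks: both blocks share one page  */
	locate(3, 16, 65536);	/* 64K blocks: each block gets its own folio */
	return 0;
}

With 1K blocks, group 3's pair sits at offsets 2048 and 3072 of page 1
(the folio_contains() fast path); with 64K blocks, the pair lands in
two distinct block-sized folios, which is why a second lookup — or a
folio_get() when the folio is large enough — is needed.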
From: Baokun Li

Currently, ext4_mb_init_cache() uses blocks_per_page to calculate the
folio index and offset. However, when blocksize is larger than
PAGE_SIZE, blocks_per_page becomes zero, leading to a potential
division-by-zero bug.

Since we now have the folio, we know its exact size. This allows us to
convert {blocks, groups}_per_page to {blocks, groups}_per_folio, thus
supporting block sizes greater than page size.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/mballoc.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b454a41dd6c1..3f10c64ab2b1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1329,26 +1329,25 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 * block bitmap and buddy information. The information are
 * stored in the inode as
 *
- * {                        page                        }
+ * {                        folio                       }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
- * So for each group we take up 2 blocks. A page can
- * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
- * So it can have information regarding groups_per_page which
- * is blocks_per_page/2
+ * So for each group we take up 2 blocks. A folio can
+ * contain blocks_per_folio (folio_size / blocksize) blocks.
+ * So it can have information regarding groups_per_folio which
+ * is blocks_per_folio/2
 *
 * Locking note: This routine takes the block group lock of all groups
- * for this page; do not hold this lock when calling this routine!
+ * for this folio; do not hold this lock when calling this routine!
 */
-
 static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 {
 	ext4_group_t ngroups;
 	unsigned int blocksize;
-	int blocks_per_page;
-	int groups_per_page;
+	int blocks_per_folio;
+	int groups_per_folio;
 	int err = 0;
 	int i;
 	ext4_group_t first_group, group;
@@ -1365,27 +1364,24 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 	sb = inode->i_sb;
 	ngroups = ext4_get_groups_count(sb);
 	blocksize = i_blocksize(inode);
-	blocks_per_page = PAGE_SIZE / blocksize;
+	blocks_per_folio = folio_size(folio) / blocksize;
+	WARN_ON_ONCE(!blocks_per_folio);
+	groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2);
 
 	mb_debug(sb, "init folio %lu\n", folio->index);
 
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-
 	/* allocate buffer_heads to read bitmaps */
-	if (groups_per_page > 1) {
-		i = sizeof(struct buffer_head *) * groups_per_page;
+	if (groups_per_folio > 1) {
+		i = sizeof(struct buffer_head *) * groups_per_folio;
 		bh = kzalloc(i, gfp);
 		if (bh == NULL)
 			return -ENOMEM;
 	} else
 		bh = &bhs;
 
-	first_group = folio->index * blocks_per_page / 2;
-	/* read all groups the folio covers into the cache */
-	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+	first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2;
+	for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
 		if (group >= ngroups)
 			break;
 
@@ -1393,7 +1389,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 		if (!grinfo)
 			continue;
 		/*
-		 * If page is uptodate then we came here after online resize
+		 * If folio is uptodate then we came here after online resize
 		 * which added some new uninitialized group info structs, so
 		 * we must skip all initialized uptodate buddies on the folio,
 		 * which may be currently in use by an allocating task.
@@ -1413,7 +1409,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 	}
 
 	/* wait for I/O completion */
-	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+	for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
 		int err2;
 
 		if (!bh[i])
@@ -1423,8 +1419,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 			err = err2;
 	}
 
-	first_block = folio->index * blocks_per_page;
-	for (i = 0; i < blocks_per_page; i++) {
+	first_block = EXT4_PG_TO_LBLK(inode, folio->index);
+	for (i = 0; i < blocks_per_folio; i++) {
 		group = (first_block + i) >> 1;
 		if (group >= ngroups)
 			break;
@@ -1501,7 +1497,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 
 out:
 	if (bh) {
-		for (i = 0; i < groups_per_page; i++)
+		for (i = 0; i < groups_per_folio; i++)
 			brelse(bh[i]);
 		if (bh != &bhs)
 			kfree(bh);
--
2.46.1
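The DIV_ROUND_UP form above subsumes the old ">> 1 then force to 1"
fixup and also covers bs > ps, where a folio may hold only one block —
half of a group's bitmap/buddy pair. A quick user-space check (not from
the series; folio/block sizes are illustrative):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	static const unsigned long cases[][2] = {
		{ 4096, 1024 },		/* bs < ps: 4 blocks, 2 groups */
		{ 4096, 4096 },		/* bs == ps: 1 block, 1 group  */
		{ 65536, 65536 },	/* bs > ps: 1 block per folio  */
	};

	for (unsigned i = 0; i < 3; i++) {
		unsigned long fsz = cases[i][0], bs = cases[i][1];
		unsigned long bpf = fsz / bs;	/* blocks_per_folio */

		printf("folio %6lu, bs %6lu -> blocks_per_folio %lu, groups_per_folio %lu\n",
		       fsz, bs, bpf, DIV_ROUND_UP(bpf, 2));
	}
	return 0;
}

All three cases produce the same answers the old code computed for
bs <= ps, while the bs == folio-size case now yields 1 group per folio
instead of a division by zero.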
From: Baokun Li

We use EXT4_BAD_INO for the buddy cache inode number. This inode is
not accessed via __ext4_new_inode() or __ext4_iget(), meaning
ext4_set_inode_mapping_order() is not called to set its folio order
range.

However, future block size greater than page size support requires
this inode to support large folios, and the buddy cache code already
handles BS > PS. Therefore, ext4_set_inode_mapping_order() is now
explicitly called for this specific inode to set its folio order
range.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/mballoc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3f10c64ab2b1..102c6439eb11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3493,6 +3493,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	 * this will avoid confusion if it ever shows up during debugging. */
 	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+	ext4_set_inode_mapping_order(sbi->s_buddy_cache);
+
 	for (i = 0; i < ngroups; i++) {
 		cond_resched();
 		desc = ext4_get_group_desc(sb, i, NULL);
--
2.46.1
From: Zhihao Cheng

The ext4 multi-block allocator now fully supports folio objects.
Update all variable names, function names, and comments to replace
legacy 'page' terminology with 'folio', improving clarity and
consistency. No functional changes.

Signed-off-by: Zhihao Cheng
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/mballoc.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 102c6439eb11..0fbd4c5fffc0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -98,14 +98,14 @@
 * block bitmap and buddy information. The information are stored in the
 * inode as:
 *
- *  {                        page                        }
+ *  {                        folio                       }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information. So for each group we
- * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
- * blocksize) blocks.  So it can have information regarding groups_per_page
- * which is blocks_per_page/2
+ * take up 2 blocks. A folio can contain blocks_per_folio (folio_size /
+ * blocksize) blocks.  So it can have information regarding groups_per_folio
+ * which is blocks_per_folio/2
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
@@ -1556,7 +1556,7 @@ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb,
 	return 0;
 }
 
-static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_folio) {
 		folio_unlock(e4b->bd_bitmap_folio);
@@ -1570,7 +1570,7 @@ static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b)
 
 /*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
 * calling this routine!
 */
 static noinline_for_stack
@@ -1618,7 +1618,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
 		if (e4b.bd_buddy_folio == NULL) {
 			/*
 			 * If both the bitmap and buddy are in
-			 * the same page we don't need to force
+			 * the same folio we don't need to force
 			 * init the buddy
 			 */
 			ret = 0;
@@ -1634,7 +1634,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
 		goto err;
 	}
 err:
-	ext4_mb_put_buddy_page_lock(&e4b);
+	ext4_mb_put_buddy_folio_lock(&e4b);
 	return ret;
 }
 
@@ -2227,7 +2227,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->ac_buddy = ret >> 16;
 
 	/*
-	 * take the page reference. We want the page to be pinned
+	 * take the folio reference. We want the folio to be pinned
 	 * so that we don't get a ext4_mb_init_cache_call for this
 	 * group until we update the bitmap. That would mean we
 	 * double allocate blocks. The reference is dropped
@@ -2933,7 +2933,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
 	if (cr < CR_ANY_FREE &&
 	    spin_is_locked(ext4_group_lock_ptr(sb, group)))
 		return 0;
-	/* This now checks without needing the buddy page */
+	/* This now checks without needing the buddy folio */
 	ret = ext4_mb_good_group_nolock(ac, group, cr);
 	if (ret <= 0) {
 		if (!ac->ac_first_err)
@@ -4725,7 +4725,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 			      "ext4: mb_load_buddy failed (%d)", err))
 			/*
 			 * This should never happen since we pin the
-			 * pages in the ext4_allocation_context so
+			 * folios in the ext4_allocation_context so
 			 * ext4_mb_load_buddy() should never fail.
 			 */
 			return;
--
2.46.1

From: Baokun Li

Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to
avoid negative left shifts after supporting blocksize greater than
PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/readpage.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f329daf6e5c7..e7f2350c725b 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -213,9 +213,7 @@ int ext4_mpage_readpages(struct inode *inode,
 {
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
-
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
 	sector_t next_block;
 	sector_t block_in_file;
@@ -251,9 +249,8 @@ int ext4_mpage_readpages(struct inode *inode,
 		blocks_per_folio = folio_size(folio) >> blkbits;
 		first_hole = blocks_per_folio;
 
-		block_in_file = next_block =
-			(sector_t)folio->index << (PAGE_SHIFT - blkbits);
-		last_block = block_in_file + nr_pages * blocks_per_page;
+		block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index);
+		last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages);
 		last_block_in_file = (ext4_readpage_limit(inode) +
 				      blocksize - 1) >> blkbits;
 		if (last_block > last_block_in_file)
--
2.46.1
From: Baokun Li

Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to
avoid negative left shifts after supporting blocksize greater than
PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1153a26ff963..80c2860abed2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1168,8 +1168,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	unsigned block_start, block_end;
 	sector_t block;
 	int err = 0;
-	unsigned blocksize = inode->i_sb->s_blocksize;
-	unsigned bbits;
+	unsigned int blocksize = i_blocksize(inode);
 	struct buffer_head *bh, *head, *wait[2];
 	int nr_wait = 0;
 	int i;
@@ -1178,12 +1177,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	BUG_ON(!folio_test_locked(folio));
 	BUG_ON(to > folio_size(folio));
 	BUG_ON(from > to);
+	WARN_ON_ONCE(blocksize > folio_size(folio));
 
 	head = folio_buffers(folio);
 	if (!head)
 		head = create_empty_buffers(folio, blocksize, 0);
-	bbits = ilog2(blocksize);
-	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
+	block = EXT4_PG_TO_LBLK(inode, folio->index);
 
 	for (bh = head, block_start = 0; bh != head || !block_start;
 	     block++, block_start = block_end, bh = bh->b_this_page) {
--
2.46.1

From: Baokun Li

Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the
conversion between folio indexes and blocks to avoid negative
left/right shifts after supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 80c2860abed2..1ac7ca9479eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2297,15 +2297,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	struct folio_batch fbatch;
 	unsigned nr, i;
 	struct inode *inode = mpd->inode;
-	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
 	pgoff_t start, end;
 	ext4_lblk_t lblk;
 	ext4_fsblk_t pblock;
 	int err;
 	bool map_bh = false;
 
-	start = mpd->map.m_lblk >> bpp_bits;
-	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk);
+	end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1);
 	pblock = mpd->map.m_pblk;
 
 	folio_batch_init(&fbatch);
@@ -2316,7 +2315,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 		for (i = 0; i < nr; i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			lblk = folio->index << bpp_bits;
+			lblk = EXT4_PG_TO_LBLK(inode, folio->index);
 			err = mpage_process_folio(mpd, folio, &lblk, &pblock,
 						  &map_bh);
 			/*
--
2.46.1
From: Baokun Li

Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the
conversion between folio indexes and blocks to avoid negative
left/right shifts after supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1ac7ca9479eb..c09859786563 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2618,7 +2618,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
 	xa_mark_t tag;
 	int i, err = 0;
-	int blkbits = mpd->inode->i_blkbits;
 	ext4_lblk_t lblk;
 	struct buffer_head *head;
 	handle_t *handle = NULL;
@@ -2657,7 +2656,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			 */
 			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
 			    mpd->wbc->nr_to_write <=
-			    mpd->map.m_len >> (PAGE_SHIFT - blkbits))
+			    EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len))
 				goto out;
 
 			/* If we can't merge this page, we are done. */
@@ -2735,8 +2734,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 				mpage_folio_done(mpd, folio);
 			} else {
 				/* Add all dirty buffers to mpd */
-				lblk = ((ext4_lblk_t)folio->index) <<
-					(PAGE_SHIFT - blkbits);
+				lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index);
 				head = folio_buffers(folio);
 				err = mpage_process_page_bufs(mpd, head, head,
 						lblk);
--
2.46.1

From: Zhihao Cheng

Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to
avoid negative left shifts after supporting blocksize greater than
PAGE_SIZE.

Signed-off-by: Zhihao Cheng
Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
Reviewed-by: Jan Kara
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c09859786563..22d215f90c64 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4074,7 +4074,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 
 	blocksize = inode->i_sb->s_blocksize;
 
-	iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+	iblock = EXT4_PG_TO_LBLK(inode, folio->index);
 
 	bh = folio_buffers(folio);
 	if (!bh)
--
2.46.1

From: Baokun Li

Currently, ext4_set_inode_mapping_order() does not set max folio order
for files with the data journalling flag. For files that already have
large folios enabled, ext4_inode_journal_mode() ignores the data
journalling flag once max folio order is set.

This is not because data journalling cannot work with large folios,
but because credit estimates will go through the roof if there are too
many blocks per folio. Since the real constraint is blocks-per-folio,
to support data=journal under LBS, we now set max folio order to be
equal to min folio order for files with the data journalling flag.
When LBS is disabled, the max folio order remains unset as before.

Additionally, the max_order check in ext4_inode_journal_mode() is
removed, and the mapping order is reset in
ext4_change_inode_journal_flag().
Suggested-by: Jan Kara
Signed-off-by: Baokun Li
---
 fs/ext4/ext4_jbd2.c |  3 +--
 fs/ext4/inode.c     | 14 ++++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index a0e66bc10093..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
 	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
 	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
-	     !test_opt(inode->i_sb, DELALLOC) &&
-	     !mapping_large_folio_support(inode->i_mapping))) {
+	     !test_opt(inode->i_sb, DELALLOC))) {
 		/* We do not support data journalling for encrypted data */
 		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
 			return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22d215f90c64..517701024d18 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
 
 	if (!S_ISREG(inode->i_mode))
 		return false;
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return false;
 	if (ext4_has_feature_verity(sb))
 		return false;
 	if (ext4_has_feature_encrypt(sb))
@@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
 	umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
 void ext4_set_inode_mapping_order(struct inode *inode)
 {
+	u32 max_order;
+
 	if (!ext4_should_enable_large_folio(inode))
 		return;
 
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
+	else
+		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
+
 	mapping_set_folio_order_range(inode->i_mapping,
 				      EXT4_SB(inode->i_sb)->s_min_folio_order,
-				      EXT4_MAX_PAGECACHE_ORDER(inode));
+				      max_order);
 }
 
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -6585,6 +6590,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	}
 	ext4_set_aops(inode);
+	ext4_set_inode_mapping_order(inode);
 
 	jbd2_journal_unlock_updates(journal);
 	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
--
2.46.1
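A rough back-of-the-envelope (not from the series; block size and
orders are illustrative) of why the blocks-per-folio constraint exists:
in data=journal mode, writing back one dirty folio can journal every
block it covers, so the handle credit estimate scales with
folio_size / blocksize:

#include <stdio.h>

int main(void)
{
	unsigned long bs = 4096;
	unsigned int orders[] = { 0, 4, 11 };	/* example folio orders */

	for (unsigned i = 0; i < 3; i++) {
		unsigned long folio_bytes = 4096UL << orders[i];

		printf("order %2u: folio %8lu bytes -> up to %4lu data blocks journalled\n",
		       orders[i], folio_bytes, folio_bytes / bs);
	}
	return 0;
}

At order 11 a single folio could journal 2048 blocks — the same figure
as the general EXT4_MAX_PAGECACHE_ORDER() cap — which is why data
journalling pins max_order to min_order (one block-sized folio) instead
of disabling large folios outright.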
Cc: Eric Biggers
Signed-off-by: Baokun Li
---
 fs/ext4/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 517701024d18..b95826e4a419 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5152,8 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
 
 	if (!S_ISREG(inode->i_mode))
 		return false;
-	if (ext4_has_feature_verity(sb))
-		return false;
 	if (ext4_has_feature_encrypt(sb))
 		return false;
 
@@ -5175,7 +5173,8 @@ void ext4_set_inode_mapping_order(struct inode *inode)
 		return;
 
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) ||
+	    ext4_has_feature_verity(inode->i_sb))
 		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
 	else
 		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
--
2.46.1

From: Baokun Li

Supporting a block size greater than the page size (BS > PS) requires
support for large folios. However, several features (e.g., encrypt) do
not yet support large folios. To prevent conflicts, this patch adds
checks at mount time to prohibit these features from being used when
BS > PS. Since these features cannot be changed on remount, there is
no need to check on remount.

This patch adds s_max_folio_order, initialized during mount according
to filesystem features and mount options. If s_max_folio_order is 0,
large folios are disabled. With this in place,
ext4_set_inode_mapping_order() can be simplified by checking
s_max_folio_order, avoiding redundant checks.

Signed-off-by: Baokun Li
---
 fs/ext4/ext4.h  |  4 +++-
 fs/ext4/inode.c | 39 ++++++++++-----------------------------
 fs/ext4/super.c | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4bc0b2b7288a..79dc231d6e22 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1696,7 +1696,9 @@ struct ext4_sb_info {
 	unsigned long s_last_trim_minblks;
 
 	/* minimum folio order of a page cache allocation */
-	unsigned int s_min_folio_order;
+	u16 s_min_folio_order;
+	/* supported maximum folio order, 0 means not supported */
+	u16 s_max_folio_order;
 
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_csum_seed;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b95826e4a419..d53dc5b794d4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5146,42 +5146,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
 	return -EFSCORRUPTED;
 }
 
-static bool ext4_should_enable_large_folio(struct inode *inode)
+void ext4_set_inode_mapping_order(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	u16 min_order, max_order;
 
-	if (!S_ISREG(inode->i_mode))
-		return false;
-	if (ext4_has_feature_encrypt(sb))
-		return false;
-
-	return true;
-}
-
-/*
- * Limit the maximum folio order to 2048 blocks to prevent overestimation
- * of reserve handle credits during the folio writeback in environments
- * where the PAGE_SIZE exceeds 4KB.
- */
-#define EXT4_MAX_PAGECACHE_ORDER(i)	\
-	umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
-void ext4_set_inode_mapping_order(struct inode *inode)
-{
-	u32 max_order;
+	max_order = EXT4_SB(sb)->s_max_folio_order;
+	if (!max_order)
+		return;
 
-	if (!ext4_should_enable_large_folio(inode))
+	min_order = EXT4_SB(sb)->s_min_folio_order;
+	if (!min_order && !S_ISREG(inode->i_mode))
 		return;
 
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) ||
-	    ext4_has_feature_verity(inode->i_sb))
-		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
-	else
-		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+		max_order = min_order;
 
-	mapping_set_folio_order_range(inode->i_mapping,
-				      EXT4_SB(inode->i_sb)->s_min_folio_order,
-				      max_order);
+	mapping_set_folio_order_range(inode->i_mapping, min_order, max_order);
 }
 
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0d32370a459a..6735152dd219 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5040,6 +5040,43 @@ static const char *ext4_has_journal_option(struct super_block *sb)
 	return NULL;
 }
 
+/*
+ * Limit the maximum folio order to 2048 blocks to prevent overestimation
+ * of reserve handle credits during the folio writeback in environments
+ * where the PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(sb)	\
+	umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT))
+static void ext4_set_max_mapping_order(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		sbi->s_max_folio_order = sbi->s_min_folio_order;
+	else if (ext4_has_feature_verity(sb))
+		sbi->s_max_folio_order = sbi->s_min_folio_order;
+	else
+		sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb);
+}
+
+static int ext4_check_large_folio(struct super_block *sb)
+{
+	const char *err_str = NULL;
+
+	if (ext4_has_feature_encrypt(sb))
+		err_str = "encrypt";
+
+	if (!err_str) {
+		ext4_set_max_mapping_order(sb);
+	} else if (sb->s_blocksize > PAGE_SIZE) {
+		ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s",
+			 sb->s_blocksize, PAGE_SIZE, err_str);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
 			   int silent)
 {
@@ -5316,6 +5353,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 
 	ext4_apply_options(fc, sb);
 
+	err = ext4_check_large_folio(sb);
+	if (err < 0)
+		goto failed_mount;
+
 	err = ext4_encoding_init(sb, es);
 	if (err)
 		goto failed_mount;
--
2.46.1
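The EXT4_MAX_PAGECACHE_ORDER() cap moved above is block-size
independent by construction: "11 + blkbits - PAGE_SHIFT" always limits
a folio to 2^11 = 2048 blocks. A user-space check (not from the
series; 4K pages assumed, the MAX_PAGECACHE_ORDER value here is a
hypothetical stand-in for the kernel's config-dependent constant):

#include <stdio.h>

#define PAGE_SHIFT 12
#define MAX_PAGECACHE_ORDER 25	/* hypothetical upper bound */
#define umin(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	for (unsigned blkbits = 10; blkbits <= 16; blkbits += 2) {
		unsigned max_order = umin(MAX_PAGECACHE_ORDER,
					  11 + blkbits - PAGE_SHIFT);

		printf("bs=%6u -> max folio order %2u (%lu blocks per folio max)\n",
		       1U << blkbits, max_order,
		       (1UL << (max_order + PAGE_SHIFT)) >> blkbits);
	}
	return 0;
}

Every block size from 1K to 64K resolves to exactly 2048 blocks per
maximal folio, so the credit estimate stays bounded regardless of
bs/ps.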
From: Baokun Li

Since the block device layer (see commit 3c20917120ce ("block/bdev:
enable large folio support for large logical block sizes")) and the
page cache (see commit ab95d23bab220ef8 ("filemap: allocate
mapping_min_order folios in the page cache")) can enforce a minimum
order when allocating folios, and ext4 has supported large folios
since commit 7ac67301e82f ("ext4: enable large folio for regular
file"), now add support for block_size > PAGE_SIZE in ext4.

set_blocksize() -> bdev_validate_blocksize() already validates the
block size, so ext4_load_super() does not need to perform additional
checks. Here we only need to add the FS_LBS bit to fs_flags.

In addition, allocation failures for large folios may trigger
warn_alloc() warnings. Therefore, as with XFS, mark this feature as
experimental.

Signed-off-by: Baokun Li
Reviewed-by: Zhang Yi
---
 fs/ext4/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6735152dd219..1fbbae5a0426 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5074,6 +5074,9 @@ static int ext4_check_large_folio(struct super_block *sb)
 		return -EINVAL;
 	}
 
+	if (sb->s_blocksize > PAGE_SIZE)
+		ext4_msg(sb, KERN_NOTICE, "EXPERIMENTAL bs(%lu) > ps(%lu) enabled.",
+			 sb->s_blocksize, PAGE_SIZE);
 	return 0;
 }
 
@@ -7453,7 +7456,8 @@ static struct file_system_type ext4_fs_type = {
 	.init_fs_context	= ext4_init_fs_context,
 	.parameters		= ext4_param_specs,
 	.kill_sb		= ext4_kill_sb,
-	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+				  FS_LBS,
 };
 
 MODULE_ALIAS_FS("ext4");
--
2.46.1