EXT4_MB_GRP_TEST_AND_SET_READ uses the test_and_set_bit() function, which issues an atomic write. This can cause high overhead due to cache contention when multiple threads iterate over groups in a tight loop, as is the case for ext4_mb_prefetch(). We have seen this to be a problem on Kunpeng 920b CPUs, which use a single ARM LSE instruction for this purpose. Avoid the problem by first checking the bit with test_bit_acquire() and only falling back to test_and_set_bit() when the bit is not yet set. This change significantly reduces the cost of fallocate() operations that trigger linear group scans on large multicore machines where test_and_set_bit() issues an atomic write operation unconditionally. Signed-off-by: Bohdan Trach --- fs/ext4/ext4.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 56b82d4a15d7..0713207811a6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3551,7 +3551,17 @@ struct ext4_group_info { #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ - (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) + (ext4_mb_grp_test_and_set_read((grp))) + +static inline int ext4_mb_grp_test_and_set_read(struct ext4_group_info *grp) +{ + int r = test_bit_acquire(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state); + + if (!r) + return test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state); + else + return r; +} #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 -- 2.43.0