From: John Garry This is in preparation for adding atomic write support for buffered IO. Since the limits reported by FS for atomic write buffered IO could be different from direct IO, rename STATX_WRITE_ATOMIC -> STATX_WRITE_ATOMIC_DIO and STATX_ATTR_WRITE_ATOMIC -> STATX_ATTR_WRITE_ATOMIC_DIO, to make it clear that they are only relevant to direct IO. Later we will add a separate flag for reporting atomic write with buffered IO Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: John Garry --- Documentation/filesystems/ext4/atomic_writes.rst | 4 ++-- block/bdev.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/stat.c | 8 ++++---- fs/xfs/xfs_iops.c | 2 +- include/trace/misc/fs.h | 2 +- include/uapi/linux/stat.h | 8 ++++++-- tools/include/uapi/linux/stat.h | 8 ++++++-- tools/perf/trace/beauty/include/uapi/linux/stat.h | 8 ++++++-- 9 files changed, 29 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/ext4/atomic_writes.rst b/Documentation/filesystems/ext4/atomic_writes.rst index ae8995740aa8..108c9e9cb977 100644 --- a/Documentation/filesystems/ext4/atomic_writes.rst +++ b/Documentation/filesystems/ext4/atomic_writes.rst @@ -189,7 +189,7 @@ The write must be aligned to the filesystem's block size and not exceed the filesystem's maximum atomic write unit size. See ``generic_atomic_write_valid()`` for more details. -``statx()`` system call with ``STATX_WRITE_ATOMIC`` flag can provide following +``statx()`` system call with ``STATX_WRITE_ATOMIC_DIO`` flag can provide following details: * ``stx_atomic_write_unit_min``: Minimum size of an atomic write request. @@ -198,7 +198,7 @@ details: separate memory buffers that can be gathered into a write operation (e.g., the iovcnt parameter for IOV_ITER). Currently, this is always set to one. -The STATX_ATTR_WRITE_ATOMIC flag in ``statx->attributes`` is set if atomic +The STATX_ATTR_WRITE_ATOMIC_DIO flag in ``statx->attributes`` is set if atomic writes are supported. .. _atomic_write_bdev_support: diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..3bc90d5feb4c 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1308,7 +1308,7 @@ void sync_bdevs(bool wait) } /* - * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. + * Handle STATX_{DIOALIGN, WRITE_ATOMIC_DIO} for block devices. 
*/ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { @@ -1330,7 +1330,7 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) stat->result_mask |= STATX_DIOALIGN; } - if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { + if (request_mask & STATX_WRITE_ATOMIC_DIO && bdev_can_atomic_write(bdev)) { struct request_queue *bd_queue = bdev->bd_queue; generic_fill_statx_atomic_writes(stat, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..9555149a8ba6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6097,7 +6097,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } - if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + if ((request_mask & STATX_WRITE_ATOMIC_DIO) && S_ISREG(inode->i_mode)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int awu_min = 0, awu_max = 0; diff --git a/fs/stat.c b/fs/stat.c index 6c79661e1b96..7eb2a247ab67 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -138,7 +138,7 @@ EXPORT_SYMBOL(generic_fill_statx_attr); * @unit_max: Maximum supported atomic write length in bytes * @unit_max_opt: Optimised maximum supported atomic write length in bytes * - * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from + * Fill in the STATX{_ATTR}_WRITE_ATOMIC_DIO flags in the kstat structure from * atomic write unit_min and unit_max values. */ void generic_fill_statx_atomic_writes(struct kstat *stat, @@ -147,10 +147,10 @@ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_max_opt) { /* Confirm that the request type is known */ - stat->result_mask |= STATX_WRITE_ATOMIC; + stat->result_mask |= STATX_WRITE_ATOMIC_DIO; /* Confirm that the file attribute type is known */ - stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC_DIO; if (unit_min) { stat->atomic_write_unit_min = unit_min; @@ -160,7 +160,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat, stat->atomic_write_segments_max = 1; /* Confirm atomic writes are actually supported */ - stat->attributes |= STATX_ATTR_WRITE_ATOMIC; + stat->attributes |= STATX_ATTR_WRITE_ATOMIC_DIO; } } EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index caff0125faea..f41fcdd3043b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -741,7 +741,7 @@ xfs_vn_getattr( case S_IFREG: if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) xfs_report_dioalign(ip, stat); - if (request_mask & STATX_WRITE_ATOMIC) + if (request_mask & STATX_WRITE_ATOMIC_DIO) xfs_report_atomic_write(ip, stat); fallthrough; default: diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h index 7ead1c61f0cb..19ea9339b9bd 100644 --- a/include/trace/misc/fs.h +++ b/include/trace/misc/fs.h @@ -161,5 +161,5 @@ { STATX_DIOALIGN, "DIOALIGN" }, \ { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \ { STATX_SUBVOL, "SUBVOL" }, \ - { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \ + { STATX_WRITE_ATOMIC_DIO, "WRITE_ATOMIC_DIO" }, \ { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" }) diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 1686861aae20..57f558be933e 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -217,7 +217,9 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ -#define 
STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ +#define STATX_WRITE_ATOMIC_DIO 0x00010000U /* Want/got dio atomic_write_* fields */ +/* Old name kept for backward compatibility */ +#define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -254,7 +256,9 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ -#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ +#define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write operations */ +/* Old name kept for backward compatibility */ +#define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 1686861aae20..57f558be933e 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -217,7 +217,9 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ -#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ +#define STATX_WRITE_ATOMIC_DIO 0x00010000U /* Want/got dio atomic_write_* fields */ +/* Old name kept for backward compatibility */ +#define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -254,7 +256,9 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ -#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ +#define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write operations */ +/* Old name kept for backward compatibility */ +#define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 1686861aae20..57f558be933e 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -217,7 +217,9 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ -#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ +#define STATX_WRITE_ATOMIC_DIO 0x00010000U /* Want/got dio atomic_write_* fields */ +/* Old name kept for backward compatibility */ +#define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -254,7 +256,9 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* 
File is currently in DAX state */ -#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ +#define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write operations */ +/* Old name kept for backward compatibility */ +#define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO #endif /* _UAPI_LINUX_STAT_H */ -- 2.51.0 From: John Garry Add page flag PG_atomic, meaning that a folio needs to be written back atomically. This will be used for handling RWF_ATOMIC buffered IO in upcoming patches. Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: John Garry --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0091ad1986bf..bdce0f58a77a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -111,6 +111,7 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ + PG_atomic, /* Page is marked atomic for buffered atomic writes */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -644,6 +645,10 @@ FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) +FOLIO_FLAG(atomic, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(atomic, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(atomic, FOLIO_HEAD_PAGE) + #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index aa441f593e9a..a8294f6146a5 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -159,7 +159,8 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT); DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable), \ - DEF_PAGEFLAG_NAME(dropbehind) \ + DEF_PAGEFLAG_NAME(dropbehind), \ + DEF_PAGEFLAG_NAME(atomic) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ -- 2.51.0 Extend the statx system call to return additional info for buffered atomic write support for a file. Currently only direct IO is supported. New flags STATX_WRITE_ATOMIC_BUF and STATX_ATTR_WRITE_ATOMIC_BUF indicate whether the file recognizes and supports buffered atomic writes. Structure statx members stx_atomic_write_unit_{min, max, segments_max} will be reused for buffered atomic writes. Flags STATX_WRITE_ATOMIC_DIO and STATX_WRITE_ATOMIC_BUF are mutually exclusive. With both flags set, statx will ignore the request and neither flag will be set in statx.result_mask. Also, make sure ext4 and xfs report atomic write unit min and max of 0 when the new flag is passed.
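For illustration (not part of the kernel change), userspace could query the new buffered limits roughly as below. This is a minimal sketch that assumes the uapi definitions added above are visible and that the libc struct statx already carries the stx_atomic_write_* fields (the statx() wrapper needs glibc 2.28+):

#include <fcntl.h>	/* AT_FDCWD */
#include <stdio.h>
#include <sys/stat.h>	/* statx() */

static void report_buffered_atomic(const char *path)
{
	struct statx stx;

	/*
	 * Request only the buffered flag; combining it with
	 * STATX_WRITE_ATOMIC_DIO would make the kernel ignore both.
	 */
	if (statx(AT_FDCWD, path, 0, STATX_WRITE_ATOMIC_BUF, &stx) != 0) {
		perror("statx");
		return;
	}

	/* stx_mask confirms the kernel understood the request */
	if (!(stx.stx_mask & STATX_WRITE_ATOMIC_BUF)) {
		printf("%s: buffered atomic write info not reported\n", path);
		return;
	}

	if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC_BUF)
		printf("%s: buffered atomic writes: unit min %u, unit max %u, max segments %u\n",
		       path, stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max,
		       stx.stx_atomic_write_segments_max);
	else
		printf("%s: buffered atomic writes not supported\n", path);
}

Since generic_fill_statx_atomic_writes() only sets the attribute when unit_min is non-zero, a reported unit min/max of 0 equally means the file cannot do buffered atomic writes.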
Co-developed-by: John Garry Signed-off-by: John Garry Signed-off-by: Ojaswin Mujoo --- block/bdev.c | 3 +- fs/ext4/inode.c | 7 +- fs/stat.c | 33 +++-- fs/xfs/xfs_file.c | 9 +- fs/xfs/xfs_iops.c | 121 ++++++++++-------- fs/xfs/xfs_iops.h | 6 +- include/linux/fs.h | 3 +- include/trace/misc/fs.h | 1 + include/uapi/linux/stat.h | 2 + tools/include/uapi/linux/stat.h | 2 + .../trace/beauty/include/uapi/linux/stat.h | 2 + 11 files changed, 119 insertions(+), 70 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 3bc90d5feb4c..8f0eab0a1ecf 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1335,8 +1335,7 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), - queue_atomic_write_unit_max_bytes(bd_queue), - 0); + queue_atomic_write_unit_max_bytes(bd_queue), 0, true); } stat->blksize = bdev_io_min(bdev); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9555149a8ba6..0d5013993fba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6106,8 +6106,11 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, awu_max = sbi->s_awu_max; } - generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); - } + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0, + true); + } else if (request_mask & STATX_WRITE_ATOMIC_BUF) + /* Atomic writes for buferred IO not supported yet */ + generic_fill_statx_atomic_writes(stat, 0, 0, 0, false); flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) diff --git a/fs/stat.c b/fs/stat.c index 7eb2a247ab67..8ba3993dcd09 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -137,20 +137,27 @@ EXPORT_SYMBOL(generic_fill_statx_attr); * @unit_min: Minimum supported atomic write length in bytes * @unit_max: Maximum supported atomic write length in bytes * @unit_max_opt: Optimised maximum supported atomic write length in bytes + * @is_dio: Is the stat request for dio * - * Fill in the STATX{_ATTR}_WRITE_ATOMIC_DIO flags in the kstat structure from - * atomic write unit_min and unit_max values. + * Fill in the STATX{_ATTR}_WRITE_ATOMIC_{DIO,BUF} flags in the kstat structure + * from atomic write unit_min and unit_max values. 
*/ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, - unsigned int unit_max_opt) + unsigned int unit_max_opt, + bool is_dio) { - /* Confirm that the request type is known */ - stat->result_mask |= STATX_WRITE_ATOMIC_DIO; + if (is_dio) { + /* Confirm that the request type is known */ + stat->result_mask |= STATX_WRITE_ATOMIC_DIO; - /* Confirm that the file attribute type is known */ - stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC_DIO; + /* Confirm that the file attribute type is known */ + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC_DIO; + } else { + stat->result_mask |= STATX_WRITE_ATOMIC_BUF; + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC_BUF; + } if (unit_min) { stat->atomic_write_unit_min = unit_min; @@ -160,7 +167,10 @@ void generic_fill_statx_atomic_writes(struct kstat *stat, stat->atomic_write_segments_max = 1; /* Confirm atomic writes are actually supported */ - stat->attributes |= STATX_ATTR_WRITE_ATOMIC_DIO; + if (is_dio) + stat->attributes |= STATX_ATTR_WRITE_ATOMIC_DIO; + else + stat->attributes |= STATX_ATTR_WRITE_ATOMIC_BUF; } } EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); @@ -206,6 +216,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); + if (request_mask & STATX_WRITE_ATOMIC_BUF && + request_mask & STATX_WRITE_ATOMIC_DIO) { + /* Both are mutually exclusive, disable them */ + request_mask &= + ~(STATX_WRITE_ATOMIC_BUF | STATX_WRITE_ATOMIC_DIO); + } + idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) { int ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b9864c8582e..3efa575570ed 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1087,6 +1087,7 @@ xfs_file_write_iter( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); + bool is_dio = iocb->ki_flags & IOCB_DIRECT; XFS_STATS_INC(ip->i_mount, xs_write_calls); @@ -1097,10 +1098,10 @@ xfs_file_write_iter( return -EIO; if (iocb->ki_flags & IOCB_ATOMIC) { - if (ocount < xfs_get_atomic_write_min(ip)) + if (ocount < xfs_get_atomic_write_min(ip, is_dio)) return -EINVAL; - if (ocount > xfs_get_atomic_write_max(ip)) + if (ocount > xfs_get_atomic_write_max(ip, is_dio)) return -EINVAL; ret = generic_atomic_write_valid(iocb, from); @@ -1111,7 +1112,7 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); - if (iocb->ki_flags & IOCB_DIRECT) { + if (is_dio) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -1568,7 +1569,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; - if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) + if (xfs_get_atomic_write_min(XFS_I(inode), file->f_flags & O_DIRECT) > 0) file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f41fcdd3043b..f036c46b19c5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -601,81 +601,99 @@ xfs_report_dioalign( unsigned int xfs_get_atomic_write_min( - struct xfs_inode *ip) + struct xfs_inode *ip, + bool is_dio) { - struct xfs_mount *mp = ip->i_mount; + if (is_dio) { + struct xfs_mount *mp = ip->i_mount; - /* - * If we can complete an atomic write via atomic out of place writes, - * then advertise a minimum size of one fsblock. Without this - * mechanism, we can only guarantee atomic writes up to a single LBA. 
- * - * If out of place writes are not available, we can guarantee an atomic - * write of exactly one single fsblock if the bdev will make that - * guarantee for us. - */ - if (xfs_inode_can_hw_atomic_write(ip) || - xfs_inode_can_sw_atomic_write(ip)) - return mp->m_sb.sb_blocksize; + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || + xfs_inode_can_sw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + } + /* buffered IO not supported yet so return 0 right away */ return 0; } unsigned int xfs_get_atomic_write_max( - struct xfs_inode *ip) + struct xfs_inode *ip, + bool is_dio) { struct xfs_mount *mp = ip->i_mount; - /* - * If out of place writes are not available, we can guarantee an atomic - * write of exactly one single fsblock if the bdev will make that - * guarantee for us. - */ - if (!xfs_inode_can_sw_atomic_write(ip)) { - if (xfs_inode_can_hw_atomic_write(ip)) - return mp->m_sb.sb_blocksize; - return 0; + if (is_dio) { + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_inode_can_sw_atomic_write(ip)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); } - /* - * If we can complete an atomic write via atomic out of place writes, - * then advertise a maximum size of whatever we can complete through - * that means. Hardware support is reported via max_opt, not here. - */ - if (XFS_IS_REALTIME_INODE(ip)) - return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); - return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); + /* buffered IO not supported yet so return 0 right away */ + return 0; } unsigned int xfs_get_atomic_write_max_opt( - struct xfs_inode *ip) + struct xfs_inode *ip, + bool is_dio) { - unsigned int awu_max = xfs_get_atomic_write_max(ip); + if (is_dio) { + unsigned int awu_max = xfs_get_atomic_write_max(ip, is_dio); - /* if the max is 1x block, then just keep behaviour that opt is 0 */ - if (awu_max <= ip->i_mount->m_sb.sb_blocksize) - return 0; + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; - /* - * Advertise the maximum size of an atomic write that we can tell the - * block device to perform for us. In general the bdev limit will be - * less than our out of place write limit, but we don't want to exceed - * the awu_max. - */ - return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. 
+ */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); + } + + /* buffered IO not supported yet so return 0 right away */ + return 0; } static void xfs_report_atomic_write( struct xfs_inode *ip, - struct kstat *stat) + struct kstat *stat, + bool is_dio) { generic_fill_statx_atomic_writes(stat, - xfs_get_atomic_write_min(ip), - xfs_get_atomic_write_max(ip), - xfs_get_atomic_write_max_opt(ip)); + xfs_get_atomic_write_min(ip, is_dio), + xfs_get_atomic_write_max(ip, is_dio), + xfs_get_atomic_write_max_opt(ip, is_dio), + is_dio); } STATIC int @@ -741,8 +759,11 @@ xfs_vn_getattr( case S_IFREG: if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) xfs_report_dioalign(ip, stat); - if (request_mask & STATX_WRITE_ATOMIC_DIO) - xfs_report_atomic_write(ip, stat); + if (request_mask & + (STATX_WRITE_ATOMIC_DIO | STATX_WRITE_ATOMIC_BUF)) + xfs_report_atomic_write(ip, stat, + (request_mask & + STATX_WRITE_ATOMIC_DIO)); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 0896f6b8b3b8..09e79263add1 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -19,8 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir, extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); -unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); -unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); -unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip, bool is_dio); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip, bool is_dio); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip, bool is_dio); #endif /* __XFS_IOPS_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..2dec66913e97 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3563,7 +3563,8 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, - unsigned int unit_max_opt); + unsigned int unit_max_opt, + bool is_dio); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h index 19ea9339b9bd..3b69910a5998 100644 --- a/include/trace/misc/fs.h +++ b/include/trace/misc/fs.h @@ -162,4 +162,5 @@ { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \ { STATX_SUBVOL, "SUBVOL" }, \ { STATX_WRITE_ATOMIC_DIO, "WRITE_ATOMIC_DIO" }, \ + { STATX_WRITE_ATOMIC_BUF, "WRITE_ATOMIC_BUF" }, \ { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" }) diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 57f558be933e..2d77da04df23 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -221,6 +221,7 @@ struct statx { /* Old name kept for backward compatibility */ #define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ +#define STATX_WRITE_ATOMIC_BUF 0x00040000U /* Want/got buf-io atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -259,6 +260,7 @@ struct statx { #define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write 
operations */ /* Old name kept for backward compatibility */ #define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO +#define STATX_ATTR_WRITE_ATOMIC_BUF 0x00800000 /* File supports buf-io atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 57f558be933e..a7e0036669c2 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -221,6 +221,7 @@ struct statx { /* Old name kept for backward compatibility */ #define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ +#define STATX_WRITE_ATOMIC_BUF 0x00040000U /* Want/got buf-io atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -259,6 +260,7 @@ struct statx { #define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write operations */ /* Old name kept for backward compatibility */ #define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO +#define STATX_ATTR_WRITE_ATOMIC_BUF 0x00800000 /* File supports buf-io atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 57f558be933e..2d77da04df23 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -221,6 +221,7 @@ struct statx { /* Old name kept for backward compatibility */ #define STATX_WRITE_ATOMIC STATX_WRITE_ATOMIC_DIO #define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ +#define STATX_WRITE_ATOMIC_BUF 0x00040000U /* Want/got buf-io atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -259,6 +260,7 @@ struct statx { #define STATX_ATTR_WRITE_ATOMIC_DIO 0x00400000 /* File supports dio atomic write operations */ /* Old name kept for backward compatibility */ #define STATX_ATTR_WRITE_ATOMIC STATX_ATTR_WRITE_ATOMIC_DIO +#define STATX_ATTR_WRITE_ATOMIC_BUF 0x00800000 /* File supports buf-io atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ -- 2.51.0 Add special handling of PG_atomic flag to iomap buffered write path. To flag an iomap iter for an atomic write, set IOMAP_ATOMIC. For a folio associated with a write which has IOMAP_ATOMIC set, set PG_atomic. Otherwise, when IOMAP_ATOMIC is unset, clear PG_atomic. This means that for an "atomic" folio which has not been written back, it loses it "atomicity". So if userspace issues a write with RWF_ATOMIC set and another write with RWF_ATOMIC unset, that folio is not written back atomically. For such a scenario to occur, it would be considered a userspace usage error. To ensure that a buffered atomic write is written back atomically when the write syscall returns, RWF_SYNC or similar needs to be used (in conjunction with RWF_ATOMIC). Only a single BIO should ever be submitted for an atomic write. So modify iomap_add_to_ioend() to ensure that we don't try to write back an atomic folio as part of a larger mixed-atomicity BIO. In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for the allocated BIO. When a folio is written back, again clear PG_atomic, as it is no longer required. Currently, RWF_ATOMIC with buffered IO is limited to single block size writes, and has 2 main restrictions: 1. Only blocksize == pagesize is supported 2. 
Writes where the user buffer is not aligned to PAGE_SIZE are not supported For more details, refer to the comment in generic_atomic_write_valid() Co-developed-by: John Garry Signed-off-by: John Garry Signed-off-by: Ojaswin Mujoo --- fs/iomap/buffered-io.c | 48 ++++++++++++++++++++++++++++++++++++------ fs/iomap/ioend.c | 18 ++++++++++++---- fs/read_write.c | 34 ++++++++++++++++++++++++++++-- include/linux/iomap.h | 2 ++ 4 files changed, 89 insertions(+), 13 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f099c086cbe8..947c76c2688a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -850,11 +850,13 @@ static int iomap_write_begin(struct iomap_iter *iter, { const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos; - u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); + u64 orig_len = min_t(u64, SIZE_MAX, iomap_length(iter)); + u64 len; struct folio *folio; int status = 0; + bool is_atomic = iter->flags & IOMAP_ATOMIC; - len = min_not_zero(len, *plen); + len = min_not_zero(orig_len, *plen); *foliop = NULL; *plen = 0; @@ -922,6 +924,11 @@ static int iomap_write_begin(struct iomap_iter *iter, if (unlikely(status)) goto out_unlock; + if (is_atomic && (len != orig_len)) { + status = -EINVAL; + goto out_unlock; + } + *foliop = folio; *plen = len; return 0; @@ -931,7 +938,7 @@ static int iomap_write_begin(struct iomap_iter *iter, return status; } -static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, +static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { flush_dcache_folio(folio); @@ -951,7 +958,27 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return false; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); - filemap_dirty_folio(inode->i_mapping, folio); + filemap_dirty_folio(iter->inode->i_mapping, folio); + + /* + * Policy: non atomic write over a previously atomic range makes the + * range non-atomic. Handle this here. + */ + if (iter->flags & IOMAP_ATOMIC) { + if (copied < len) { + /* + * A short atomic write is only okay as long as nothing + * is written at all. If we have a partial write, there + * is a bug in our code. 
+ */ + WARN_ON_ONCE(copied != 0); + + return false; + } + folio_set_atomic(folio); + } else + folio_clear_atomic(folio); + return true; } @@ -997,7 +1024,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, return bh_written == copied; } - return __iomap_write_end(iter->inode, pos, len, copied, folio); + return __iomap_write_end(iter, pos, len, copied, folio); } static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, @@ -1124,6 +1151,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, iter.flags |= IOMAP_NOWAIT; if (iocb->ki_flags & IOCB_DONTCACHE) iter.flags |= IOMAP_DONTCACHE; + if (iocb->ki_flags & IOCB_ATOMIC) + iter.flags |= IOMAP_ATOMIC; while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = iomap_write_iter(&iter, i, write_ops); @@ -1588,6 +1617,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); + folio_clear_atomic(folio); } return iomap_iter_advance(iter, length); @@ -1642,8 +1672,10 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) { + folio_clear_atomic(folio); folio_end_writeback(folio); + } } EXPORT_SYMBOL_GPL(iomap_finish_folio_write); @@ -1807,8 +1839,10 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) if (atomic_dec_and_test(&ifs->write_bytes_pending)) folio_end_writeback(folio); } else { - if (!wb_pending) + if (!wb_pending) { + folio_clear_atomic(folio); folio_end_writeback(folio); + } } mapping_set_error(inode->i_mapping, error); return error; diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..c129a695ceca 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -98,13 +98,17 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error) EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit); static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - loff_t pos, u16 ioend_flags) + loff_t pos, u16 ioend_flags, + bool atomic) { struct bio *bio; + blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc); + + if (atomic) + opf |= REQ_ATOMIC; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, - REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc), - GFP_NOFS, &iomap_ioend_bioset); + opf, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_write_hint = wpc->inode->i_write_hint; wbc_init_bio(wpc->wbc, bio); @@ -122,6 +126,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; + if ((ioend_flags & IOMAP_IOEND_ATOMIC) || + (ioend->io_flags & IOMAP_IOEND_ATOMIC)) + return false; if (pos != ioend->io_offset + ioend->io_size) return false; if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && @@ -156,6 +163,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, unsigned int ioend_flags = 0; unsigned int map_len = min_t(u64, dirty_len, wpc->iomap.offset + wpc->iomap.length - pos); + bool is_atomic = folio_test_atomic(folio); int error; trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap); @@ -180,6 +188,8 @@ ssize_t iomap_add_to_ioend(struct 
iomap_writepage_ctx *wpc, struct folio *folio, ioend_flags |= IOMAP_IOEND_DONTCACHE; if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; + if (is_atomic) + ioend_flags |= IOMAP_IOEND_ATOMIC; if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: @@ -188,7 +198,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (error) return error; } - wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags); + wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, is_atomic); } if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) diff --git a/fs/read_write.c b/fs/read_write.c index 833bae068770..37546aa40f0d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1802,6 +1802,8 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { + struct super_block *sb = iocb->ki_filp->f_mapping->host->i_sb; + size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) @@ -1813,8 +1815,36 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; - if (!(iocb->ki_flags & IOCB_DIRECT)) - return -EOPNOTSUPP; + if (!(iocb->ki_flags & IOCB_DIRECT)) { + /* Some restrictions to buferred IO */ + + /* + * We only support block size == page size + * right now. This is to avoid the following: + * 1. 4kb block atomic write marks the complete 64kb folio as + * atomic. + * 2. Other writes, dirty the whole 64kb folio. + * 3. Writeback sees the whole folio dirty and atomic and tries + * to send a 64kb atomic write, which might exceed the + * allowed size and fail. + * + * Once we support sub-page atomic write tracking, we can remove + * this restriction. + */ + if (sb->s_blocksize != PAGE_SIZE) + return -EOPNOTSUPP; + + /* + * If the user buffer of atomic write crosses page boundary, + * there's a possibility of short write, example if 1 user page + * could not be faulted or got reclaimed before the copy + * operation. For now don't allow such a scenario by ensuring + * user buffer is page aligned. + */ + if (!PAGE_ALIGNED(iov_iter_alignment(iter))) + return -EOPNOTSUPP; + + } return 0; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..693f3e5ad03c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -390,6 +390,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_DIRECT (1U << 3) /* is DONTCACHE I/O */ #define IOMAP_IOEND_DONTCACHE (1U << 4) +/* is atomic I/O. These are never merged */ +#define IOMAP_IOEND_ATOMIC (1U << 5) /* * Flags that if set on either ioend prevent the merge of two ioends. -- 2.51.0 Currently, if the user buffer crosses a page boundary (even if it is a single block write), we can end up with the following scenario: 1. We prefault the 2 user pages in iomap_write_iter. 2. Due to memory pressure, 1 page is reclaimed. 3. copy_folio_from_iter_atomic() ends up doing a short copy This is unacceptable for RWF_ATOMIC writes since at this point our folio is already dirty and we will be unable to recover the old data to guarantee the atomic semantics. Get past this issue by taking inspiration from the direct IO code and performaing the following steps for RWF_ATOMIC: 1. Pin all the user pages. 
This pins the physical pages, but the user space mapping can still be unmapped by the reclaim code, which can then cause a short write in copy_folio_from_iter_atomic(). 2. To get past the user mapping getting unmapped, don't use the user iter anymore but rather create a bvec out of the pinned pages. This way we are safe from unmapping since we use the kernel's mapping directly. Having a bvec also allows us to directly reuse copy_folio_from_iter_atomic(). This ensures we should never see a short write for RWF_ATOMIC since we pin the pages up front. Signed-off-by: Ojaswin Mujoo --- fs/iomap/buffered-io.c | 154 +++++++++++++++++++++++++++++++++++++---- fs/read_write.c | 11 --- 2 files changed, 140 insertions(+), 25 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 947c76c2688a..e7dbe9bcb439 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1027,6 +1027,73 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, return __iomap_write_end(iter, pos, len, copied, folio); } +/* + * Prepare an atomic write by pinning its pages and creating an ITER_BVEC out of + * them. This function also advances the original iter. In case we encounter any + * error later, we revert the progress. + */ +static int iomap_atomic_write_prep(struct iov_iter *i, + struct iov_iter *atomic_iter, + struct bio_vec **atomic_bvecs, + struct page ***pages) +{ + size_t pg_off; + int bytes_pinned = 0; + int k = 0; + int len, total_len = 0, off; + int pinned_pgs = 0; + struct bio_vec *tmp_bvecs; + + bytes_pinned = iov_iter_extract_pages(i, pages, iov_iter_count(i), + UINT_MAX, 0, &pg_off); + /* + * iov_iter_extract_pages advances the iter but we didn't + * do any work yet, so revert. + */ + iov_iter_revert(i, bytes_pinned); + + pinned_pgs = DIV_ROUND_UP(pg_off + bytes_pinned, PAGE_SIZE); + + tmp_bvecs = kcalloc(pinned_pgs, sizeof(struct bio_vec), GFP_KERNEL); + + if (unlikely(!tmp_bvecs)) + return -ENOMEM; + + for (struct page *p; k < pinned_pgs && iov_iter_count(i); k++) { + p = (*pages)[k]; + off = (unsigned long)((char *)i->ubuf + i->iov_offset) % + PAGE_SIZE; + len = min(PAGE_SIZE - off, iov_iter_count(i)); + bvec_set_page(&tmp_bvecs[k], p, len, off); + iov_iter_advance(i, len); + total_len += len; + } + + iov_iter_bvec(atomic_iter, ITER_SOURCE, tmp_bvecs, k, total_len); + + *atomic_bvecs = tmp_bvecs; + return pinned_pgs; +} + +static void iomap_atomic_write_cleanup(struct page ***pages, int *pinned_pgs, + struct bio_vec **atomic_bvecs) +{ + if (*pinned_pgs) { + unpin_user_pages(*pages, *pinned_pgs); + *pinned_pgs = 0; + } + + if (*pages) { + kfree(*pages); + *pages = NULL; + } + + if (*atomic_bvecs) { + kfree(*atomic_bvecs); + *atomic_bvecs = NULL; + } +} + static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, const struct iomap_write_ops *write_ops) { @@ -1035,6 +1102,11 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; + bool is_atomic = iter->flags & IOMAP_ATOMIC; + struct page **pages = NULL; + int pinned_pgs; + struct iov_iter atomic_iter = {0}; + struct bio_vec *atomic_bvecs = NULL; do { struct folio *folio; @@ -1057,19 +1129,52 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, if (bytes > iomap_length(iter)) bytes = iomap_length(iter); - /* - * Bring in the user page that we'll copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * For async buffered writes the assumption is that the user - * page has already been faulted in. This can be optimized by - * faulting the user page. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; + if (is_atomic) { + /* + * If the user pages get reclaimed or unmapped, we could + * end up faulting and doing a short copy in + * copy_folio_from_iter_atomic(), which is undesirable + * for RWF_ATOMIC. Hence: + * + * 1. Pin the pages to protect against reclaim + * + * 2. Iter's user page can still get unmapped from user + * page table leading to short copy. Protect against + * this by instead using an ITER_BVEC created out of + * the pinned pages. + */ + + pinned_pgs = iomap_atomic_write_prep(i, &atomic_iter, &atomic_bvecs, + &pages); + if (unlikely(pinned_pgs <= 0)) { + status = pinned_pgs; + break; + } + + if (pinned_pgs << PAGE_SHIFT < bytes) { + WARN_RATELIMIT( + true, + "Couldn't pin bytes for atomic write: pinned: %d, needed: %lld", + pinned_pgs << PAGE_SHIFT, bytes); + status = -EFAULT; + break; + } + + } else { + /* + * Bring in the user page that we'll copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * For async buffered writes the assumption is that the user + * page has already been faulted in. This can be optimized by + * faulting the user page. + */ + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { + status = -EFAULT; + break; + } } status = iomap_write_begin(iter, write_ops, &folio, &offset, @@ -1086,9 +1191,27 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); + copied = copy_folio_from_iter_atomic( + folio, offset, bytes, is_atomic ? &atomic_iter : i); written = iomap_write_end(iter, bytes, copied, folio) ? copied : 0; + if (is_atomic) { + if (written != bytes) { + /* + * short copy so revert the iter accordingly. + * This should never happen ideally + */ + WARN_RATELIMIT( + 1, + "Short atomic write: bytes_pinned:%d bytes:%lld written:%lld\n", + pinned_pgs << PAGE_SHIFT, bytes, + written); + iov_iter_revert(i, + iov_iter_count(&atomic_iter)); + } + iomap_atomic_write_cleanup(&pages, &pinned_pgs, + &atomic_bvecs); + } /* * Update the in-memory inode size after copying the data into @@ -1130,6 +1253,9 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, } } while (iov_iter_count(i) && iomap_length(iter)); + if (is_atomic) + iomap_atomic_write_cleanup(&pages, &pinned_pgs, &atomic_bvecs); + return total_written ? 0 : status; } diff --git a/fs/read_write.c b/fs/read_write.c index 37546aa40f0d..7e064561cc4b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1833,17 +1833,6 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) */ if (sb->s_blocksize != PAGE_SIZE) return -EOPNOTSUPP; - - /* - * If the user buffer of atomic write crosses page boundary, - * there's a possibility of short write, example if 1 user page - * could not be faulted or got reclaimed before the copy - * operation. For now don't allow such a scenario by ensuring - * user buffer is page aligned. 
- */ - if (!PAGE_ALIGNED(iov_iter_alignment(iter))) - return -EOPNOTSUPP; - } return 0; -- 2.51.0 Now that we can reliably perform a HW-based single block buffered atomic write for page size == blocksize, start advertising it in XFS. Signed-off-by: Ojaswin Mujoo --- fs/xfs/xfs_iops.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f036c46b19c5..67d370947d95 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -604,9 +604,10 @@ xfs_get_atomic_write_min( struct xfs_inode *ip, bool is_dio) { - if (is_dio) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; + uint32_t bs = mp->m_sb.sb_blocksize; + if (is_dio) { /* * If we can complete an atomic write via atomic out of place writes, * then advertise a minimum size of one fsblock. Without this @@ -618,10 +619,15 @@ xfs_get_atomic_write_min( */ if (xfs_inode_can_hw_atomic_write(ip) || xfs_inode_can_sw_atomic_write(ip)) - return mp->m_sb.sb_blocksize; + return bs; } + /* + * Buffered IO only supports hw single block atomic writes and bs == ps + * configurations. + */ + if (xfs_inode_can_hw_atomic_write(ip) && bs == PAGE_SIZE) + return bs; - /* buffered IO not supported yet so return 0 right away */ return 0; } @@ -630,7 +636,8 @@ xfs_get_atomic_write_max( struct xfs_inode *ip, bool is_dio) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; + uint32_t bs = mp->m_sb.sb_blocksize; if (is_dio) { /* @@ -640,7 +647,7 @@ xfs_get_atomic_write_max( */ if (!xfs_inode_can_sw_atomic_write(ip)) { if (xfs_inode_can_hw_atomic_write(ip)) - return mp->m_sb.sb_blocksize; + return bs; return 0; } @@ -653,8 +660,13 @@ xfs_get_atomic_write_max( return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); } + /* + * Buffered IO only supports hw single block atomic writes and bs == ps + * configurations. + */ + if (xfs_inode_can_hw_atomic_write(ip) && bs == PAGE_SIZE) + return bs; - /* buffered IO not supported yet so return 0 right away */ return 0; } @@ -679,7 +691,7 @@ xfs_get_atomic_write_max_opt( return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); } - /* buffered IO not supported yet so return 0 right away */ + /* buffered IO for now only supports 1 filesystem block so max_opt is 0 */ return 0; } -- 2.51.0 Lift the bs == ps restriction for RWF_ATOMIC buffered writes by adding sub-page support. This is done by adding 2 more bitmaps to folio->ifs. One bitmap tracks which blocks mark the start of an atomic write and the other tracks which mark the end. For a single block atomic write, both start and end would be marked on the same block, but the design is kept as such so we can easily extend to multi-block later. With the help of the 2 bitmaps, we are able to determine which blocks need to go atomically together during writeback. This prevents the issue where write amplification could cause an RWF_ATOMIC write that is bigger than the underlying device supports. As with bs == ps support, if there is a non-atomic write that overlaps an atomic-marked block, we will clear the atomic state of that block in ifs. Similarly, if the folio is mmapped and written to, we will clear the atomic bit from all blocks in the folio.
To illustrate some examples: A = Dirty, atomic block D = Dirty, non-atomic block Let pagesize = 4k, blocksize = 1k 1) - Initial state of blocks in folio: A A D D - Non atomic write from block 0 to 3 - New state: D D D D 2) - Initial state of blocks in folio: A A A A - Non atomic write from block 1 to 2 - New state: A D D A 3) - Initial state of blocks in folio: A A _ _ - mmap write to anyblock in folio - New state: D D D D Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo --- fs/iomap/buffered-io.c | 207 ++++++++++++++++++++++++++++++++++--- fs/iomap/ioend.c | 9 +- fs/iomap/trace.h | 12 ++- fs/read_write.c | 22 ---- include/linux/iomap.h | 1 + include/linux/page-flags.h | 2 +- 6 files changed, 207 insertions(+), 46 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e7dbe9bcb439..d86859728e3b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -26,6 +26,10 @@ struct iomap_folio_state { * Each block has two bits in this bitmap: * Bits [0..blocks_per_folio) has the uptodate status. * Bits [b_p_f...(2*b_p_f)) has the dirty status. + * Bits [2*b_p_f..3*b_p_f) has whether block marks the + * start of an RWF_ATOMIC write + * Bits [3*b_p_f..4*b_p_f) has whether block marks the + * end of an RWF_ATOMIC write */ unsigned long state[]; }; @@ -76,6 +80,25 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } +static inline bool ifs_block_is_atomic_start(struct folio *folio, + struct iomap_folio_state *ifs, int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + (blks_per_folio * 2), ifs->state); +} + +static inline bool ifs_block_is_atomic_end(struct folio *folio, + struct iomap_folio_state *ifs, + int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + (blks_per_folio * 3), ifs->state); +} + static inline bool ifs_block_is_dirty(struct folio *folio, struct iomap_folio_state *ifs, int block) { @@ -85,17 +108,42 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +/* + * Returns false if the folio has atleast 1 atomic block, else true + */ +static inline bool ifs_is_fully_non_atomic(struct folio *folio, + struct iomap_folio_state *ifs) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + for (int i = 0; i < blks_per_folio; i++) { + if (ifs_block_is_atomic_start(folio, ifs, i)) + return false; + } + + return true; +} + static unsigned ifs_find_dirty_range(struct folio *folio, - struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) + struct iomap_folio_state *ifs, + u64 *range_start, u64 range_end, + bool *is_atomic_range) { struct inode *inode = folio->mapping->host; + unsigned folio_nblks = i_blocks_per_folio(inode, folio); unsigned start_blk = offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, - i_blocks_per_folio(inode, folio)); + folio_nblks); unsigned nblks = 1; + bool is_atomic_folio = folio_test_atomic(folio); + /* + * We need to be careful in not clubbing together atomic write ranges + * with other dirty blocks + */ while (!ifs_block_is_dirty(folio, ifs, start_blk)) if (++start_blk == end_blk) return 0; @@ -106,12 +154,62 @@ static unsigned 
ifs_find_dirty_range(struct folio *folio, nblks++; } + *is_atomic_range = false; + + if (is_atomic_folio) { + unsigned int first_atomic; + unsigned int last = start_blk + nblks; + /* + * We now have the dirty range, however if the range has any + * RWF_ATOMIC blocks, we need to make sure to not club them with + * other dirty blocks. + */ + first_atomic = start_blk; + while (!ifs_block_is_atomic_start(folio, ifs, first_atomic)) { + if (++first_atomic == start_blk + nblks) + break; + } + + if (first_atomic != start_blk + nblks) { + /* RWF_ATOMIC blocks found in dirty range */ + if (first_atomic == start_blk) { + /* + * range start is RWF_ATOMIC. Return only the + * atomic range. + */ + nblks = 0; + while (first_atomic + nblks < last) { + if (ifs_block_is_atomic_end( + folio, ifs, + first_atomic + nblks++)) + break; + } + + if (first_atomic + nblks > last) + /* + * RWF_ATOMIC range should + * always be contained in the + * dirty range + */ + WARN_ON(true); + + *is_atomic_range = true; + } else { + /* + * RWF_ATOMIC range is in middle of dirty range. Return only + * the starting non-RWF_ATOMIC range + */ + nblks = first_atomic - start_blk; + } + } + } + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; } static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, - u64 range_end) + u64 range_end, bool *is_atomic_range) { struct iomap_folio_state *ifs = folio->private; @@ -119,10 +217,33 @@ static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, return 0; if (ifs) - return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return ifs_find_dirty_range(folio, ifs, range_start, range_end, + is_atomic_range); + + if (folio_test_atomic(folio)) + *is_atomic_range = true; + return range_end - *range_start; } +static bool ifs_clear_range_atomic(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_clear(ifs->state, first_blk + (blks_per_folio * 2), nr_blks); + bitmap_clear(ifs->state, last_blk + (blks_per_folio * 3), nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); + + return ifs_is_fully_non_atomic(folio, ifs); +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -138,6 +259,18 @@ static void ifs_clear_range_dirty(struct folio *folio, spin_unlock_irqrestore(&ifs->state_lock, flags); } +static void iomap_clear_range_atomic(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + bool fully_non_atomic = true; + + if (ifs) + fully_non_atomic = ifs_clear_range_atomic(folio, ifs, off, len); + + if (fully_non_atomic) + folio_clear_atomic(folio); +} + static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; @@ -146,8 +279,34 @@ static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) ifs_clear_range_dirty(folio, ifs, off, len); } -static void ifs_set_range_dirty(struct folio *folio, +static void ifs_set_range_atomic(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode 
= folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk + (blks_per_folio * 2), 1); + bitmap_set(ifs->state, last_blk + (blks_per_folio * 3), 1); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_atomic(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_atomic(folio, ifs, off, len); + + folio_set_atomic(folio); +} + +static void ifs_set_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, + size_t len) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); @@ -190,8 +349,12 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode, * The first state tracks per-block uptodate and the * second tracks per-block dirty state. */ + + /* + * TODO: How can we only selectively allocate atomic bitmaps for ifs? + */ ifs = kzalloc(struct_size(ifs, state, - BITS_TO_LONGS(2 * nr_blocks)), gfp); + BITS_TO_LONGS(4 * nr_blocks)), gfp); if (!ifs) return ifs; @@ -941,6 +1104,8 @@ static int iomap_write_begin(struct iomap_iter *iter, static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { + struct inode *inode = iter->inode; + flush_dcache_folio(folio); /* @@ -975,9 +1140,12 @@ static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return false; } - folio_set_atomic(folio); - } else - folio_clear_atomic(folio); + iomap_set_range_atomic(folio, offset_in_folio(folio, pos), len); + } else { + if (folio_test_atomic(folio)) + iomap_clear_range_atomic( + folio, offset_in_folio(folio, pos), len); + } return true; } @@ -1208,7 +1376,11 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, written); iov_iter_revert(i, iov_iter_count(&atomic_iter)); - } + } else + iomap_set_range_atomic( + folio, offset_in_folio(folio, pos), + written); + iomap_atomic_write_cleanup(&pages, &pinned_pgs, &atomic_bvecs); } @@ -1743,7 +1915,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); - folio_clear_atomic(folio); + iomap_clear_range_atomic(folio, 0, folio_size(folio)); } return iomap_iter_advance(iter, length); @@ -1799,7 +1971,7 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) { - folio_clear_atomic(folio); + iomap_clear_range_atomic(folio, 0, folio_size(folio)); folio_end_writeback(folio); } } @@ -1914,6 +2086,8 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) if (!ifs) { ifs = ifs_alloc(inode, folio, 0); iomap_set_range_dirty(folio, 0, end_pos - pos); + if (folio_test_atomic(folio)) + iomap_set_range_atomic(folio, 0, end_pos - pos); } /* @@ -1936,7 +2110,8 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * Walk through the folio to find dirty areas to write back. 
*/ end_aligned = round_up(end_pos, i_blocksize(inode)); - while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned, + &wpc->is_atomic_range))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, &wb_pending); if (error) @@ -1962,11 +2137,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) + if (atomic_dec_and_test(&ifs->write_bytes_pending)) { + iomap_clear_range_atomic(folio, 0, folio_size(folio)); folio_end_writeback(folio); + } } else { if (!wb_pending) { - folio_clear_atomic(folio); + iomap_clear_range_atomic(folio, 0, folio_size(folio)); folio_end_writeback(folio); } } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index c129a695ceca..678c052c6443 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -163,10 +163,10 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, unsigned int ioend_flags = 0; unsigned int map_len = min_t(u64, dirty_len, wpc->iomap.offset + wpc->iomap.length - pos); - bool is_atomic = folio_test_atomic(folio); int error; - trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap); + trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap, + wpc->is_atomic_range); WARN_ON_ONCE(!folio->private && map_len < dirty_len); @@ -188,7 +188,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, ioend_flags |= IOMAP_IOEND_DONTCACHE; if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; - if (is_atomic) + if (wpc->is_atomic_range) ioend_flags |= IOMAP_IOEND_ATOMIC; if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { @@ -198,7 +198,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (error) return error; } - wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, is_atomic); + wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, + wpc->is_atomic_range); } if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..14ad280c03fe 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -172,8 +172,8 @@ DEFINE_IOMAP_EVENT(iomap_iter_srcmap); TRACE_EVENT(iomap_add_to_ioend, TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, - struct iomap *iomap), - TP_ARGS(inode, pos, dirty_len, iomap), + struct iomap *iomap, bool is_atomic), + TP_ARGS(inode, pos, dirty_len, iomap, is_atomic), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) @@ -185,6 +185,7 @@ TRACE_EVENT(iomap_add_to_ioend, __field(u16, type) __field(u16, flags) __field(dev_t, bdev) + __field(bool, is_atomic) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; @@ -197,9 +198,11 @@ TRACE_EVENT(iomap_add_to_ioend, __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? 
iomap->bdev->bd_dev : 0; + __entry->is_atomic = is_atomic; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " - "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)", + "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x) " + "is_atomic=%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -211,7 +214,8 @@ TRACE_EVENT(iomap_add_to_ioend, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __entry->type, __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), - __entry->flags) + __entry->flags, + __entry->is_atomic) ); TRACE_EVENT(iomap_iter, diff --git a/fs/read_write.c b/fs/read_write.c index 7e064561cc4b..ab5d8e17d86d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1802,8 +1802,6 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { - struct super_block *sb = iocb->ki_filp->f_mapping->host->i_sb; - size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) @@ -1815,26 +1813,6 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; - if (!(iocb->ki_flags & IOCB_DIRECT)) { - /* Some restrictions to buferred IO */ - - /* - * We only support block size == page size - * right now. This is to avoid the following: - * 1. 4kb block atomic write marks the complete 64kb folio as - * atomic. - * 2. Other writes, dirty the whole 64kb folio. - * 3. Writeback sees the whole folio dirty and atomic and tries - * to send a 64kb atomic write, which might exceed the - * allowed size and fail. - * - * Once we support sub-page atomic write tracking, we can remove - * this restriction. - */ - if (sb->s_blocksize != PAGE_SIZE) - return -EOPNOTSUPP; - } - return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 693f3e5ad03c..033e0ba49f85 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -460,6 +460,7 @@ struct iomap_writepage_ctx { const struct iomap_writeback_ops *ops; u32 nr_folios; /* folios added to the ioend */ void *wb_ctx; /* pending writeback context */ + bool is_atomic_range; }; struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index bdce0f58a77a..542e7db6b21b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -111,7 +111,7 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ - PG_atomic, /* Page is marked atomic for buffered atomic writes */ + PG_atomic, /* At least one block in page is marked atomic for buffered atomic writes */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -- 2.51.0 Now that we support bs < ps for HW atomic writes, lift this restriction from XFS statx reporting. Signed-off-by: Ojaswin Mujoo --- fs/xfs/xfs_iops.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 67d370947d95..5bd31aacf514 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -622,10 +622,9 @@ xfs_get_atomic_write_min( return bs; } /* - * Buffered IO only supports hw single block atomic writes and bs == ps - * configurations.
+ * Buffered IO only supports hw single-block atomic writes. */ - if (xfs_inode_can_hw_atomic_write(ip) && bs == PAGE_SIZE) + if (xfs_inode_can_hw_atomic_write(ip)) return bs; return 0; @@ -661,10 +660,9 @@ xfs_get_atomic_write_max( return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); } /* - * Buffered IO only supports hw single block atomic writes and bs == ps - * configurations. + * Buffered IO only supports hw single-block atomic writes. */ - if (xfs_inode_can_hw_atomic_write(ip) && bs == PAGE_SIZE) + if (xfs_inode_can_hw_atomic_write(ip)) return bs; return 0; -- 2.51.0
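For reference, below is a minimal userspace sketch of the buffered atomic write that generic_atomic_write_valid() now accepts with the bs == PAGE_SIZE restriction lifted: a single-iovec (ubuf) buffer of one filesystem block, written at an offset aligned to that length, on a file opened without O_DIRECT. The 4096-byte block size, the file name, and the RWF_ATOMIC fallback define are illustrative assumptions only and are not part of this series.

/*
 * Hedged example, not part of the patches: issue a buffered RWF_ATOMIC
 * write that satisfies generic_atomic_write_valid() -- a single iovec
 * (so the kernel sees a ubuf iterator), a length of one filesystem
 * block, a file position aligned to that length, and no O_DIRECT.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* mirrors include/uapi/linux/fs.h */
#endif

int main(void)
{
	size_t len = 4096;		/* assumed filesystem block size */
	struct iovec iov;
	void *buf;
	int fd;

	/* buffered IO path: no O_DIRECT */
	fd = open("atomic_test_file", O_WRONLY | O_CREAT, 0644);
	if (fd < 0)
		return 1;
	if (posix_memalign(&buf, len, len))
		return 1;
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;		/* single segment -> iter_is_ubuf() */

	/* offset 0 is aligned to len, as the IS_ALIGNED() check requires */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) != (ssize_t)len)
		return 1;

	free(buf);
	return close(fd);
}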