Cleancache can be thought of as a page-granularity victim cache for clean pages that the kernel's pageframe replacement algorithm would like to keep around, but can't since there isn't enough memory. When the page reclaim mechanism "evicts" a page, it first attempts to use cleancache to put the data contained in that page into memory that is not directly accessible or addressable by the kernel. Later, when the system needs to access a page in a file on disk, it first checks cleancache to see if it already contains that page; if it does, the data is copied into the kernel and a disk access is avoided. The patchset borrows the idea, some code and documentation from the previous cleancache implementation, but as opposed to being a thin pass-through layer, it now implements the housekeeping code to associate cleancache pages with their inodes and to handle the page pools donated by cleancache backends. It also avoids intrusive hooks into filesystem code, limiting itself to hooks in the mm reclaim and page-in paths and two hooks to detect filesystem mount/unmount events. Signed-off-by: Suren Baghdasaryan Signed-off-by: Minchan Kim --- MAINTAINERS | 8 + block/bdev.c | 6 + fs/super.c | 3 + include/linux/cleancache.h | 64 +++ include/linux/fs.h | 6 + include/linux/mm_types.h | 12 +- include/linux/pagemap.h | 1 + mm/Kconfig | 16 + mm/Makefile | 1 + mm/cleancache.c | 852 +++++++++++++++++++++++++++++++++++++ mm/filemap.c | 26 ++ mm/truncate.c | 4 + mm/vmscan.c | 1 + 13 files changed, 998 insertions(+), 2 deletions(-) create mode 100644 include/linux/cleancache.h create mode 100644 mm/cleancache.c diff --git a/MAINTAINERS b/MAINTAINERS index c1a1732df7b1..90a6fc0e742c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6053,6 +6053,14 @@ F: scripts/Makefile.clang F: scripts/clang-tools/ K: \b(?i:clang|llvm)\b +CLEANCACHE +M: Suren Baghdasaryan +M: Minchan Kim +L: linux-mm@kvack.org +S: Maintained +F: include/linux/cleancache.h +F: mm/cleancache.c + CLK API M: Russell King L: linux-clk@vger.kernel.org diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..e1b785515520 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "../fs/internal.h" #include "blk.h" @@ -101,6 +102,11 @@ void invalidate_bdev(struct block_device *bdev) lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } + /* + * 99% of the time, we don't need to flush the cleancache on the bdev.
+ * But, for the strange corners, lets be cautious + */ + cleancache_invalidate_inode(mapping->host); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..5639dc069528 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -374,6 +375,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; + cleancache_add_fs(s); s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); @@ -469,6 +471,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { + cleancache_remove_fs(s); shrinker_free(s->s_shrink); fs->kill_sb(s); diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h new file mode 100644 index 000000000000..419faa183aba --- /dev/null +++ b/include/linux/cleancache.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLEANCACHE_H +#define _LINUX_CLEANCACHE_H + +#include +#include +#include + +/* super_block->cleancache_id value for an invalid ID */ +#define CLEANCACHE_ID_INVALID -1 + +#define CLEANCACHE_KEY_MAX 6 + + +#ifdef CONFIG_CLEANCACHE + +/* Hooks into MM and FS */ +int cleancache_add_fs(struct super_block *sb); +void cleancache_remove_fs(struct super_block *sb); +bool cleancache_store_folio(struct inode *inode, struct folio *folio); +bool cleancache_restore_folio(struct inode *inode, struct folio *folio); +bool cleancache_invalidate_folio(struct inode *inode, struct folio *folio); +bool cleancache_invalidate_inode(struct inode *inode); + +/* + * Backend API + * + * Cleancache does not touch folio references. Folio refcount should be 1 when + * folio is placed or returned into cleancache and folios obtained from + * cleancache will also have their refcount at 1. 
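+ *
+ * A backend first registers a pool with cleancache_backend_register_pool()
+ * and donates folios to it with cleancache_backend_put_folio{,s}(). It can
+ * take any of its donated folios back at any time with
+ * cleancache_backend_get_folio().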
+ */ +int cleancache_backend_register_pool(const char *name); +int cleancache_backend_get_folio(int pool_id, struct folio *folio); +int cleancache_backend_put_folio(int pool_id, struct folio *folio); +int cleancache_backend_put_folios(int pool_id, struct list_head *folios); + +#else /* CONFIG_CLEANCACHE */ + +static inline int cleancache_add_fs(struct super_block *sb) + { return -EOPNOTSUPP; } +static inline void cleancache_remove_fs(struct super_block *sb) {} +static inline bool cleancache_store_folio(struct inode *inode, + struct folio *folio) + { return false; } +static inline bool cleancache_restore_folio(struct inode *inode, + struct folio *folio) + { return false; } +static inline bool cleancache_invalidate_folio(struct inode *inode, + struct folio *folio) + { return false; } +static inline bool cleancache_invalidate_inode(struct inode *inode) + { return false; } +static inline int cleancache_backend_register_pool(const char *name) + { return -EOPNOTSUPP; } +static inline int cleancache_backend_get_folio(int pool_id, struct folio *folio) + { return -EOPNOTSUPP; } +static inline int cleancache_backend_put_folio(int pool_id, struct folio *folio) + { return -EOPNOTSUPP; } +static inline int cleancache_backend_put_folios(int pool_id, struct list_head *folios) + { return -EOPNOTSUPP; } + +#endif /* CONFIG_CLEANCACHE */ + +#endif /* _LINUX_CLEANCACHE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 8cf9547a881c..a8ad021836ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,12 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ +#ifdef CONFIG_CLEANCACHE + /* + * Saved identifier for cleancache (CLEANCACHE_ID_INVALID means none) + */ + int cleancache_id; +#endif } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5021047485a9..720315e30c58 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,6 +33,7 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +struct cleancache_inode; typedef struct { unsigned long f; @@ -392,16 +393,23 @@ struct folio { /* public: */ struct dev_pagemap *pgmap; }; - struct address_space *mapping; + union { + struct address_space *mapping; + struct cleancache_inode *cc_inode; + }; union { pgoff_t index; + pgoff_t cc_index; unsigned long share; }; union { void *private; swp_entry_t swap; }; - atomic_t _mapcount; + union { + atomic_t _mapcount; + int cc_pool_id; + }; atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..7d9fa68ad0c9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1269,6 +1269,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); +void store_into_cleancache(struct address_space *mapping, struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, diff --git a/mm/Kconfig b/mm/Kconfig index a5a90b169435..1255b543030b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1033,6 +1033,22 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool +config 
CLEANCACHE + bool "Enable cleancache to cache clean pages" + help + Cleancache can be thought of as a page-granularity victim cache for + clean pages that the kernel's pageframe replacement algorithm would + like to keep around, but can't since there isn't enough memory. + When the page reclaim mechanism "evicts" a page, it first attempts to + to put the data contained in that page into cleancache, backed by + memory that is not directly accessible or addressable by the kernel + and is of unknown and possibly time-varying size. When system wishes + to access a page in a file on disk, it first checks cleancache to see + if it already contains required data; if it does, the page is copied + into the kernel and a disk access is avoided. + + If unsure, say N. + config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index 21abb3353550..b78073b87aea 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -146,3 +146,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100644 index 000000000000..26fb91b987b7 --- /dev/null +++ b/mm/cleancache.c @@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Lock nesting: + * ccinode->folios.xa_lock + * fs->hash_lock + * + * ccinode->folios.xa_lock + * pool->lock + */ + +#define INODE_HASH_BITS 6 + +/* represents each file system instance hosted by the cleancache */ +struct cleancache_fs { + refcount_t ref_count; + DECLARE_HASHTABLE(inode_hash, INODE_HASH_BITS); + spinlock_t hash_lock; /* protects inode_hash */ + struct rcu_head rcu; +}; + +/* + * @cleancache_inode represents each ccinode in @cleancache_fs + * + * The cleancache_inode will be freed by RCU when the last folio from xarray + * is freed, except for invalidate_inode() case. + */ +struct cleancache_inode { + struct inode *inode; + struct hlist_node hash; + refcount_t ref_count; + struct xarray folios; + struct cleancache_fs *fs; + struct rcu_head rcu; +}; + +/* Cleancache backend memory pool */ +struct cleancache_pool { + struct list_head folio_list; + spinlock_t lock; /* protects folio_list */ +}; + +#define CLEANCACHE_MAX_POOLS 64 + +static DEFINE_XARRAY_ALLOC(fs_xa); +static struct kmem_cache *slab_inode; /* cleancache_inode slab */ +static struct cleancache_pool pools[CLEANCACHE_MAX_POOLS]; +static atomic_t nr_pools = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(pools_lock); /* protects pools */ + +static inline void init_cleancache_folio(struct folio *folio, int pool_id) +{ + /* Folio is being donated and has no refs. No locking is needed. */ + VM_BUG_ON(folio_ref_count(folio) != 0); + + folio->cc_pool_id = pool_id; + folio->cc_inode = NULL; + folio->cc_index = 0; +} + +static inline void clear_cleancache_folio(struct folio *folio) +{ + /* Folio must be detached and not in the pool. No locking is needed. 
*/ + VM_BUG_ON(folio->cc_inode); + + folio->cc_pool_id = -1; +} + +static inline int folio_pool_id(struct folio *folio) +{ + return folio->cc_pool_id; +} + +static inline struct cleancache_pool *folio_pool(struct folio *folio) +{ + return &pools[folio_pool_id(folio)]; +} + +static void attach_folio(struct folio *folio, struct cleancache_inode *ccinode, + pgoff_t offset) +{ + lockdep_assert_held(&(folio_pool(folio)->lock)); + + folio->cc_inode = ccinode; + folio->cc_index = offset; +} + +static void detach_folio(struct folio *folio) +{ + lockdep_assert_held(&(folio_pool(folio)->lock)); + + folio->cc_inode = NULL; + folio->cc_index = 0; +} + +static void folio_attachment(struct folio *folio, + struct cleancache_inode **ccinode, pgoff_t *offset) +{ + lockdep_assert_held(&(folio_pool(folio)->lock)); + + *ccinode = folio->cc_inode; + *offset = folio->cc_index; +} + +static inline bool is_folio_attached(struct folio *folio) +{ + lockdep_assert_held(&(folio_pool(folio)->lock)); + + return folio->cc_inode != NULL; +} + +/* + * Folio pool helpers. + * Only detached folios are stored in the pool->folio_list. + * + * Locking: + * pool->folio_list is accessed under pool->lock. + */ +static void add_folio_to_pool(struct folio *folio, struct cleancache_pool *pool) +{ + lockdep_assert_held(&pool->lock); + VM_BUG_ON(folio_pool(folio) != pool); + VM_BUG_ON(!list_empty(&folio->lru)); + VM_BUG_ON(is_folio_attached(folio)); + + list_add(&folio->lru, &pool->folio_list); +} + +static struct folio *remove_folio_from_pool(struct folio *folio, struct cleancache_pool *pool) +{ + lockdep_assert_held(&pool->lock); + VM_BUG_ON(folio_pool(folio) != pool); + + if (is_folio_attached(folio)) + return NULL; + + list_del_init(&folio->lru); + + return folio; +} + +static struct folio *pick_folio_from_any_pool(void) +{ + struct cleancache_pool *pool; + struct folio *folio = NULL; + int count; + + /* nr_pools can only increase, so the following loop is safe */ + count = atomic_read_acquire(&nr_pools); + for (int i = 0; i < count; i++) { + pool = &pools[i]; + spin_lock(&pool->lock); + if (!list_empty(&pool->folio_list)) { + folio = list_last_entry(&pool->folio_list, + struct folio, lru); + WARN_ON(!remove_folio_from_pool(folio, pool)); + spin_unlock(&pool->lock); + break; + } + spin_unlock(&pool->lock); + } + + return folio; +} + +/* FS helpers */ +static struct cleancache_fs *get_fs(int fs_id) +{ + struct cleancache_fs *fs; + + rcu_read_lock(); + fs = xa_load(&fs_xa, fs_id); + if (fs && !refcount_inc_not_zero(&fs->ref_count)) + fs = NULL; + rcu_read_unlock(); + + return fs; +} + +static unsigned int invalidate_inode(struct cleancache_fs *fs, + struct inode *inode); + +static void put_fs(struct cleancache_fs *fs) +{ + if (refcount_dec_and_test(&fs->ref_count)) { + struct cleancache_inode *ccinode; + struct hlist_node *tmp; + int cursor; + + /* + * There are no concurrent RCU walkers because they + * would have taken fs reference. + * We don't need to hold fs->hash_lock because there + * are no other users and no way to reach fs. + */ + hash_for_each_safe(fs->inode_hash, cursor, tmp, ccinode, hash) + invalidate_inode(fs, ccinode->inode); + /* + * Don't need to synchronize_rcu() and wait for all inodes to be + * freed because RCU read walkers can't take fs refcount anymore + * to start their walk. + */ + kfree_rcu(fs, rcu); + } +} + +/* cleancache_inode helpers. 
*/ +static struct cleancache_inode *alloc_cleancache_inode(struct cleancache_fs *fs, + struct inode *inode) +{ + struct cleancache_inode *ccinode; + + ccinode = kmem_cache_alloc(slab_inode, GFP_ATOMIC|__GFP_NOWARN); + if (ccinode) { + ccinode->inode = inode; + xa_init_flags(&ccinode->folios, XA_FLAGS_LOCK_IRQ); + INIT_HLIST_NODE(&ccinode->hash); + ccinode->fs = fs; + refcount_set(&ccinode->ref_count, 1); + } + + return ccinode; +} + +static void inode_free_rcu(struct rcu_head *rcu) +{ + struct cleancache_inode *ccinode; + + ccinode = container_of(rcu, struct cleancache_inode, rcu); + VM_BUG_ON(!xa_empty(&ccinode->folios)); + kmem_cache_free(slab_inode, ccinode); +} + +static inline bool get_inode(struct cleancache_inode *ccinode) +{ + return refcount_inc_not_zero(&ccinode->ref_count); +} + +static void put_inode(struct cleancache_inode *ccinode) +{ + VM_BUG_ON(refcount_read(&ccinode->ref_count) == 0); + if (!refcount_dec_and_test(&ccinode->ref_count)) + return; + + lockdep_assert_not_held(&ccinode->folios.xa_lock); + VM_BUG_ON(!xa_empty(&ccinode->folios)); + call_rcu(&ccinode->rcu, inode_free_rcu); +} + +static void remove_inode_if_empty(struct cleancache_inode *ccinode) +{ + struct cleancache_fs *fs = ccinode->fs; + + lockdep_assert_held(&ccinode->folios.xa_lock); + + if (!xa_empty(&ccinode->folios)) + return; + + spin_lock(&fs->hash_lock); + hlist_del_init_rcu(&ccinode->hash); + spin_unlock(&fs->hash_lock); + /* + * Drop the refcount set in alloc_cleancache_inode(). Caller should + * have taken an extra refcount to keep ccinode valid, so ccinode + * will be freed once the caller releases it. + */ + put_inode(ccinode); +} + +static bool store_folio_in_inode(struct cleancache_inode *ccinode, + pgoff_t offset, struct folio *folio) +{ + struct cleancache_pool *pool = folio_pool(folio); + int err; + + lockdep_assert_held(&ccinode->folios.xa_lock); + VM_BUG_ON(!list_empty(&folio->lru)); + + spin_lock(&pool->lock); + err = xa_err(__xa_store(&ccinode->folios, offset, folio, + GFP_ATOMIC|__GFP_NOWARN)); + if (!err) + attach_folio(folio, ccinode, offset); + spin_unlock(&pool->lock); + + return err == 0; +} + +static void erase_folio_from_inode(struct cleancache_inode *ccinode, + unsigned long offset, struct folio *folio) +{ + bool removed; + + lockdep_assert_held(&ccinode->folios.xa_lock); + + removed = __xa_erase(&ccinode->folios, offset); + VM_BUG_ON(!removed); + remove_inode_if_empty(ccinode); +} + +static void move_folio_from_inode_to_pool(struct cleancache_inode *ccinode, + unsigned long offset, struct folio *folio) +{ + struct cleancache_pool *pool = folio_pool(folio); + + erase_folio_from_inode(ccinode, offset, folio); + spin_lock(&pool->lock); + detach_folio(folio); + add_folio_to_pool(folio, pool); + spin_unlock(&pool->lock); +} + +static bool isolate_folio_from_inode(struct cleancache_inode *ccinode, + unsigned long offset, struct folio *folio) +{ + bool isolated = false; + + xa_lock(&ccinode->folios); + if (xa_load(&ccinode->folios, offset) == folio) { + struct cleancache_pool *pool = folio_pool(folio); + + erase_folio_from_inode(ccinode, offset, folio); + spin_lock(&pool->lock); + detach_folio(folio); + spin_unlock(&pool->lock); + isolated = true; + } + xa_unlock(&ccinode->folios); + + return isolated; +} + +static unsigned int erase_folios_from_inode(struct cleancache_inode *ccinode, + struct xa_state *xas) +{ + unsigned int ret = 0; + struct folio *folio; + + lockdep_assert_held(&ccinode->folios.xa_lock); + + xas_for_each(xas, folio, ULONG_MAX) { + 
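+		/* Give each cached folio back to its backend's free pool. */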
move_folio_from_inode_to_pool(ccinode, xas->xa_index, folio); + ret++; + } + + return ret; +} + +static struct cleancache_inode *find_and_get_inode(struct cleancache_fs *fs, + struct inode *inode) +{ + struct cleancache_inode *ccinode = NULL; + struct cleancache_inode *tmp; + + rcu_read_lock(); + hash_for_each_possible_rcu(fs->inode_hash, tmp, hash, inode->i_ino) { + if (tmp->inode != inode) + continue; + + if (get_inode(tmp)) { + ccinode = tmp; + break; + } + } + rcu_read_unlock(); + + return ccinode; +} + +static struct cleancache_inode *add_and_get_inode(struct cleancache_fs *fs, + struct inode *inode) +{ + struct cleancache_inode *ccinode, *tmp; + + ccinode = alloc_cleancache_inode(fs, inode); + if (!ccinode) + return ERR_PTR(-ENOMEM); + + spin_lock(&fs->hash_lock); + tmp = find_and_get_inode(fs, inode); + if (tmp) { + spin_unlock(&fs->hash_lock); + /* someone already added it */ + put_inode(ccinode); + put_inode(tmp); + return ERR_PTR(-EEXIST); + } + hash_add_rcu(fs->inode_hash, &ccinode->hash, inode->i_ino); + get_inode(ccinode); + spin_unlock(&fs->hash_lock); + + return ccinode; +} + +static void copy_folio_content(struct folio *from, struct folio *to) +{ + void *src = kmap_local_folio(from, 0); + void *dst = kmap_local_folio(to, 0); + + memcpy(dst, src, PAGE_SIZE); + kunmap_local(dst); + kunmap_local(src); +} + +/* + * We want to store only workingset folios in the cleancache to increase hit + * ratio so there are four cases: + * + * @folio is workingset but cleancache doesn't have it: use new cleancache folio + * @folio is workingset and cleancache has it: overwrite the stale data + * @folio is !workingset and cleancache doesn't have it: just bail out + * @folio is !workingset and cleancache has it: remove the stale @folio + */ +static bool store_into_inode(struct cleancache_fs *fs, + struct inode *inode, + pgoff_t offset, struct folio *folio) +{ + bool workingset = folio_test_workingset(folio); + struct cleancache_inode *ccinode; + struct folio *stored_folio; + bool new_inode = false; + bool ret = false; + +find_inode: + ccinode = find_and_get_inode(fs, inode); + if (!ccinode) { + if (!workingset) + return false; + + ccinode = add_and_get_inode(fs, inode); + if (IS_ERR_OR_NULL(ccinode)) { + /* + * Retry if someone just added new ccinode from under us. + */ + if (PTR_ERR(ccinode) == -EEXIST) + goto find_inode; + + return false; + } + new_inode = true; + } + + xa_lock(&ccinode->folios); + stored_folio = xa_load(&ccinode->folios, offset); + if (stored_folio) { + if (!workingset) { + move_folio_from_inode_to_pool(ccinode, offset, stored_folio); + goto out_unlock; + } + } else { + if (!workingset) + goto out_unlock; + + stored_folio = pick_folio_from_any_pool(); + if (!stored_folio) { + /* No free folios, TODO: try reclaiming */ + goto out_unlock; + } + + if (!store_folio_in_inode(ccinode, offset, stored_folio)) { + struct cleancache_pool *pool = folio_pool(stored_folio); + + /* Return stored_folio back into pool */ + spin_lock(&pool->lock); + add_folio_to_pool(stored_folio, pool); + spin_unlock(&pool->lock); + goto out_unlock; + } + } + copy_folio_content(folio, stored_folio); + + ret = true; +out_unlock: + /* Free ccinode if it was created but no folio was stored in it. 
*/ + if (new_inode) + remove_inode_if_empty(ccinode); + xa_unlock(&ccinode->folios); + put_inode(ccinode); + + return ret; +} + +static bool load_from_inode(struct cleancache_fs *fs, + struct inode *inode, + pgoff_t offset, struct folio *folio) +{ + struct cleancache_inode *ccinode; + struct folio *stored_folio; + bool ret = false; + + ccinode = find_and_get_inode(fs, inode); + if (!ccinode) + return false; + + xa_lock(&ccinode->folios); + stored_folio = xa_load(&ccinode->folios, offset); + if (stored_folio) { + copy_folio_content(stored_folio, folio); + ret = true; + } + xa_unlock(&ccinode->folios); + put_inode(ccinode); + + return ret; +} + +static bool invalidate_folio(struct cleancache_fs *fs, + struct inode *inode, pgoff_t offset) +{ + struct cleancache_inode *ccinode; + struct folio *folio; + + ccinode = find_and_get_inode(fs, inode); + if (!ccinode) + return false; + + xa_lock(&ccinode->folios); + folio = xa_load(&ccinode->folios, offset); + if (folio) + move_folio_from_inode_to_pool(ccinode, offset, folio); + xa_unlock(&ccinode->folios); + put_inode(ccinode); + + return folio != NULL; +} + +static unsigned int invalidate_inode(struct cleancache_fs *fs, + struct inode *inode) +{ + struct cleancache_inode *ccinode; + unsigned int ret; + + ccinode = find_and_get_inode(fs, inode); + if (ccinode) { + XA_STATE(xas, &ccinode->folios, 0); + + xas_lock(&xas); + ret = erase_folios_from_inode(ccinode, &xas); + xas_unlock(&xas); + put_inode(ccinode); + + return ret; + } + + return 0; +} + +/* Hooks into MM and FS */ +int cleancache_add_fs(struct super_block *sb) +{ + struct cleancache_fs *fs; + int fs_id; + int ret; + + fs = kzalloc(sizeof(struct cleancache_fs), GFP_KERNEL); + if (!fs) { + sb->cleancache_id = CLEANCACHE_ID_INVALID; + return -ENOMEM; + } + + spin_lock_init(&fs->hash_lock); + hash_init(fs->inode_hash); + refcount_set(&fs->ref_count, 1); + ret = xa_alloc(&fs_xa, &fs_id, fs, xa_limit_32b, GFP_KERNEL); + if (ret) { + if (ret == -EBUSY) + pr_warn("too many file systems\n"); + + sb->cleancache_id = CLEANCACHE_ID_INVALID; + kfree(fs); + } else { + sb->cleancache_id = fs_id; + } + + return ret; +} + +void cleancache_remove_fs(struct super_block *sb) +{ + int fs_id = sb->cleancache_id; + struct cleancache_fs *fs; + + sb->cleancache_id = CLEANCACHE_ID_INVALID; + fs = get_fs(fs_id); + if (!fs) + return; + + xa_erase(&fs_xa, fs_id); + put_fs(fs); + + /* free the object */ + put_fs(fs); +} + +bool cleancache_store_folio(struct inode *inode, struct folio *folio) +{ + struct cleancache_fs *fs; + int fs_id; + bool ret; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (!inode) + return false; + + /* Do not support large folios yet */ + if (folio_test_large(folio)) + return false; + + fs_id = folio->mapping->host->i_sb->cleancache_id; + if (fs_id == CLEANCACHE_ID_INVALID) + return false; + + fs = get_fs(fs_id); + if (!fs) + return false; + + ret = store_into_inode(fs, inode, folio->index, folio); + put_fs(fs); + + return ret; +} + +bool cleancache_restore_folio(struct inode *inode, struct folio *folio) +{ + struct cleancache_fs *fs; + int fs_id; + bool ret; + + if (!inode) + return false; + + /* Do not support large folios yet */ + if (folio_test_large(folio)) + return false; + + fs_id = folio->mapping->host->i_sb->cleancache_id; + if (fs_id == CLEANCACHE_ID_INVALID) + return false; + + fs = get_fs(fs_id); + if (!fs) + return false; + + ret = load_from_inode(fs, inode, folio->index, folio); + put_fs(fs); + + return ret; +} + +bool cleancache_invalidate_folio(struct inode *inode, 
struct folio *folio) +{ + struct cleancache_fs *fs; + int fs_id; + bool ret; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (!inode) + return false; + + /* Do not support large folios yet */ + if (folio_test_large(folio)) + return false; + + /* Careful, folio->mapping can be NULL */ + fs_id = inode->i_sb->cleancache_id; + if (fs_id == CLEANCACHE_ID_INVALID) + return false; + + fs = get_fs(fs_id); + if (!fs) + return false; + + ret = invalidate_folio(fs, inode, folio->index); + put_fs(fs); + + return ret; +} + +bool cleancache_invalidate_inode(struct inode *inode) +{ + struct cleancache_fs *fs; + unsigned int count; + int fs_id; + + if (!inode) + return false; + + fs_id = inode->i_sb->cleancache_id; + if (fs_id == CLEANCACHE_ID_INVALID) + return false; + + fs = get_fs(fs_id); + if (!fs) + return false; + + count = invalidate_inode(fs, inode); + put_fs(fs); + + return count > 0; +} + +/* Backend API */ +/* + * Register a new backend and add its folios for cleancache to use. + * Returns pool id on success or a negative error code on failure. + */ +int cleancache_backend_register_pool(const char *name) +{ + struct cleancache_pool *pool; + int pool_id; + + /* pools_lock prevents concurrent registrations */ + spin_lock(&pools_lock); + pool_id = atomic_read(&nr_pools); + if (pool_id >= CLEANCACHE_MAX_POOLS) { + spin_unlock(&pools_lock); + return -ENOMEM; + } + + pool = &pools[pool_id]; + INIT_LIST_HEAD(&pool->folio_list); + spin_lock_init(&pool->lock); + /* Ensure above stores complete before we increase the count */ + atomic_set_release(&nr_pools, pool_id + 1); + spin_unlock(&pools_lock); + + pr_info("Registered \'%s\' cleancache backend, pool id %d\n", + name ? : "none", pool_id); + + return pool_id; +} +EXPORT_SYMBOL(cleancache_backend_register_pool); + +int cleancache_backend_get_folio(int pool_id, struct folio *folio) +{ + struct cleancache_inode *ccinode; + struct cleancache_pool *pool; + pgoff_t offset; + + /* Do not support large folios yet */ + if (folio_test_large(folio)) + return -EOPNOTSUPP; + + /* Does the folio belong to the requesting backend */ + if (folio_pool_id(folio) != pool_id) + return -EINVAL; + + pool = &pools[pool_id]; +again: + spin_lock(&pool->lock); + + /* If folio is free in the pool, return it */ + if (remove_folio_from_pool(folio, pool)) { + spin_unlock(&pool->lock); + goto out; + } + /* + * The folio is not free, therefore it has to belong + * to a valid ccinode. 
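+	 * Take a reference on that ccinode before dropping pool->lock so it
+	 * cannot be freed while the folio is being isolated from it below.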
+ */ + folio_attachment(folio, &ccinode, &offset); + if (WARN_ON(!ccinode || !get_inode(ccinode))) { + spin_unlock(&pool->lock); + return -EINVAL; + } + + spin_unlock(&pool->lock); + + /* Retry if the folio got erased from the ccinode */ + if (!isolate_folio_from_inode(ccinode, offset, folio)) { + put_inode(ccinode); + goto again; + } + + put_inode(ccinode); +out: + VM_BUG_ON_FOLIO(folio_ref_count(folio) != 0, (folio)); + clear_cleancache_folio(folio); + + return 0; +} +EXPORT_SYMBOL(cleancache_backend_get_folio); + +int cleancache_backend_put_folio(int pool_id, struct folio *folio) +{ + struct cleancache_pool *pool = &pools[pool_id]; + + /* Do not support large folios yet */ + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + + /* Can't put a still used folio into cleancache */ + if (folio_ref_count(folio) != 0) + return -EINVAL; + + /* Reset struct folio fields */ + init_cleancache_folio(folio, pool_id); + INIT_LIST_HEAD(&folio->lru); + spin_lock(&pool->lock); + add_folio_to_pool(folio, pool); + spin_unlock(&pool->lock); + + return 0; +} +EXPORT_SYMBOL(cleancache_backend_put_folio); + +int cleancache_backend_put_folios(int pool_id, struct list_head *folios) +{ + struct cleancache_pool *pool = &pools[pool_id]; + LIST_HEAD(unused_folios); + struct folio *folio; + struct folio *tmp; + + list_for_each_entry_safe(folio, tmp, folios, lru) { + /* Do not support large folios yet */ + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + if (folio_ref_count(folio) != 0) + continue; + + init_cleancache_folio(folio, pool_id); + list_move(&folio->lru, &unused_folios); + } + + spin_lock(&pool->lock); + list_splice_init(&unused_folios, &pool->folio_list); + spin_unlock(&pool->lock); + + return list_empty(folios) ? 0 : -EINVAL; +} +EXPORT_SYMBOL(cleancache_backend_put_folios); + +static int __init init_cleancache(void) +{ + slab_inode = KMEM_CACHE(cleancache_inode, 0); + if (!slab_inode) + return -ENOMEM; + + return 0; +} +core_initcall(init_cleancache); diff --git a/mm/filemap.c b/mm/filemap.c index d78112183e79..d4a64179ec2d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -213,6 +214,19 @@ static void filemap_unaccount_folio(struct address_space *mapping, folio_account_cleaned(folio, inode_to_wb(mapping->host)); } +void store_into_cleancache(struct address_space *mapping, struct folio *folio) +{ + /* + * If we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone. + */ + if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) + cleancache_store_folio(mapping->host, folio); + else + cleancache_invalidate_folio(mapping->host, folio); +} + /* * Delete a page from the page cache and free it. 
Caller has to make * sure the page is locked and that nobody else uses it - or that usage @@ -251,6 +265,9 @@ void filemap_remove_folio(struct folio *folio) struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); + + store_into_cleancache(mapping, folio); + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); @@ -324,6 +341,9 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!folio_batch_count(fbatch)) return; + for (i = 0; i < folio_batch_count(fbatch); i++) + store_into_cleancache(mapping, fbatch->folios[i]); + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { @@ -2438,6 +2458,12 @@ static int filemap_read_folio(struct file *file, filler_t filler, unsigned long pflags; int error; + if (cleancache_restore_folio(folio->mapping->host, folio)) { + folio_mark_uptodate(folio); + folio_unlock(folio); + return 0; + } + /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); diff --git a/mm/truncate.c b/mm/truncate.c index 9210cf808f5c..31f8ebd32245 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "internal.h" @@ -136,6 +137,7 @@ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; + cleancache_invalidate_folio(folio->mapping->host, folio); if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); } @@ -615,6 +617,8 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, if (!filemap_release_folio(folio, gfp)) return -EBUSY; + cleancache_invalidate_folio(mapping->host, folio); + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) diff --git a/mm/vmscan.c b/mm/vmscan.c index c922bad2b8fd..3f2f18715c3a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -716,6 +716,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (folio_test_swapcache(folio)) { ci = swap_cluster_get_and_lock_irq(folio); } else { + store_into_cleancache(mapping, folio); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); } -- 2.51.1.851.g4ebd6896fd-goog Once all folios in the cleancache are used to store data from previously evicted folios, no more data can be stored there. To avoid that situation we can drop older data and make space for new one. Add an LRU for cleancache folios to reclaim the oldest folio when cleancache is full and we need to store a new folio. 
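For illustration, the fallback added to store_into_inode() below amounts to this simplified sketch (helper names are the ones introduced in this patch; the real code also drops ccinode->folios.xa_lock around the reclaim and retries internally if the chosen folio gets erased concurrently):

	stored_folio = pick_folio_from_any_pool();
	if (!stored_folio) {
		/* No free folios in any backend pool, evict the LRU tail */
		stored_folio = reclaim_folio_from_lru();
		if (!stored_folio)
			goto out_unlock;
	}

Folios are added at the head of cleancache_lru when stored and rotated back to the head on every hit, so reclaim always takes the least recently used entry from the tail.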
Signed-off-by: Suren Baghdasaryan Signed-off-by: Minchan Kim --- mm/cleancache.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/mm/cleancache.c b/mm/cleancache.c index 26fb91b987b7..3acf46c0cdd1 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -18,6 +18,13 @@ * * ccinode->folios.xa_lock * pool->lock + * + * ccinode->folios.xa_lock + * lru_lock + * + * ccinode->folios.xa_lock + * lru_lock + * pool->lock */ #define INODE_HASH_BITS 6 @@ -58,6 +65,8 @@ static struct kmem_cache *slab_inode; /* cleancache_inode slab */ static struct cleancache_pool pools[CLEANCACHE_MAX_POOLS]; static atomic_t nr_pools = ATOMIC_INIT(0); static DEFINE_SPINLOCK(pools_lock); /* protects pools */ +static LIST_HEAD(cleancache_lru); +static DEFINE_SPINLOCK(lru_lock); /* protects cleancache_lru */ static inline void init_cleancache_folio(struct folio *folio, int pool_id) { @@ -73,6 +82,7 @@ static inline void clear_cleancache_folio(struct folio *folio) { /* Folio must be detached and not in the pool. No locking is needed. */ VM_BUG_ON(folio->cc_inode); + VM_BUG_ON(!list_empty(&folio->lru)); folio->cc_pool_id = -1; } @@ -123,6 +133,7 @@ static inline bool is_folio_attached(struct folio *folio) /* * Folio pool helpers. * Only detached folios are stored in the pool->folio_list. + * Once a folio gets attached, it's placed on the cleancache LRU list. * * Locking: * pool->folio_list is accessed under pool->lock. @@ -174,6 +185,32 @@ static struct folio *pick_folio_from_any_pool(void) return folio; } +/* Folio LRU helpers. Only attached folios are stored in the cleancache_lru. */ +static void add_folio_to_lru(struct folio *folio) +{ + VM_BUG_ON(!list_empty(&folio->lru)); + + spin_lock(&lru_lock); + list_add(&folio->lru, &cleancache_lru); + spin_unlock(&lru_lock); +} + +static void rotate_lru_folio(struct folio *folio) +{ + spin_lock(&lru_lock); + if (!list_empty(&folio->lru)) + list_move(&folio->lru, &cleancache_lru); + spin_unlock(&lru_lock); +} + +static void delete_folio_from_lru(struct folio *folio) +{ + spin_lock(&lru_lock); + if (!list_empty(&folio->lru)) + list_del_init(&folio->lru); + spin_unlock(&lru_lock); +} + /* FS helpers */ static struct cleancache_fs *get_fs(int fs_id) { @@ -306,6 +343,7 @@ static void erase_folio_from_inode(struct cleancache_inode *ccinode, removed = __xa_erase(&ccinode->folios, offset); VM_BUG_ON(!removed); + delete_folio_from_lru(folio); remove_inode_if_empty(ccinode); } @@ -403,6 +441,48 @@ static struct cleancache_inode *add_and_get_inode(struct cleancache_fs *fs, return ccinode; } +static struct folio *reclaim_folio_from_lru(void) +{ + struct cleancache_inode *ccinode; + struct folio *folio; + pgoff_t offset; + +again: + spin_lock(&lru_lock); + if (list_empty(&cleancache_lru)) { + spin_unlock(&lru_lock); + return NULL; + } + ccinode = NULL; + /* Get the ccinode of the folio at the LRU tail */ + list_for_each_entry_reverse(folio, &cleancache_lru, lru) { + struct cleancache_pool *pool = folio_pool(folio); + + /* Find and get ccinode */ + spin_lock(&pool->lock); + folio_attachment(folio, &ccinode, &offset); + if (ccinode && !get_inode(ccinode)) + ccinode = NULL; + spin_unlock(&pool->lock); + if (ccinode) + break; + } + spin_unlock(&lru_lock); + + if (!ccinode) + return NULL; /* No ccinode to reclaim */ + + if (!isolate_folio_from_inode(ccinode, offset, folio)) { + /* Retry if the folio got erased from the ccinode */ + put_inode(ccinode); + goto again; + } + + put_inode(ccinode); + + return folio; +} + static void 
copy_folio_content(struct folio *from, struct folio *to) { void *src = kmap_local_folio(from, 0); @@ -458,14 +538,19 @@ static bool store_into_inode(struct cleancache_fs *fs, move_folio_from_inode_to_pool(ccinode, offset, stored_folio); goto out_unlock; } + rotate_lru_folio(stored_folio); } else { if (!workingset) goto out_unlock; stored_folio = pick_folio_from_any_pool(); if (!stored_folio) { - /* No free folios, TODO: try reclaiming */ - goto out_unlock; + /* No free folios, try reclaiming */ + xa_unlock(&ccinode->folios); + stored_folio = reclaim_folio_from_lru(); + xa_lock(&ccinode->folios); + if (!stored_folio) + goto out_unlock; } if (!store_folio_in_inode(ccinode, offset, stored_folio)) { @@ -477,6 +562,7 @@ static bool store_into_inode(struct cleancache_fs *fs, spin_unlock(&pool->lock); goto out_unlock; } + add_folio_to_lru(stored_folio); } copy_folio_content(folio, stored_folio); @@ -506,6 +592,7 @@ static bool load_from_inode(struct cleancache_fs *fs, xa_lock(&ccinode->folios); stored_folio = xa_load(&ccinode->folios, offset); if (stored_folio) { + rotate_lru_folio(stored_folio); copy_folio_content(stored_folio, folio); ret = true; } -- 2.51.1.851.g4ebd6896fd-goog Restore pages from the cleancache during readahead operation. Signed-off-by: Suren Baghdasaryan --- include/linux/cleancache.h | 13 +++++++++ mm/cleancache.c | 58 ++++++++++++++++++++++++++++++++++++++ mm/readahead.c | 54 +++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 419faa183aba..75361d1cfe3f 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -11,6 +11,7 @@ #define CLEANCACHE_KEY_MAX 6 +struct cleancache_inode; #ifdef CONFIG_CLEANCACHE @@ -21,6 +22,11 @@ bool cleancache_store_folio(struct inode *inode, struct folio *folio); bool cleancache_restore_folio(struct inode *inode, struct folio *folio); bool cleancache_invalidate_folio(struct inode *inode, struct folio *folio); bool cleancache_invalidate_inode(struct inode *inode); +struct cleancache_inode * +cleancache_start_inode_walk(struct inode *inode, unsigned long count); +void cleancache_end_inode_walk(struct cleancache_inode *ccinode); +bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, + struct folio *folio); /* * Backend API @@ -50,6 +56,13 @@ static inline bool cleancache_invalidate_folio(struct inode *inode, { return false; } static inline bool cleancache_invalidate_inode(struct inode *inode) { return false; } +static inline struct cleancache_inode * +cleancache_start_inode_walk(struct inode *inode, unsigned long count) + { return NULL; } +static inline void cleancache_end_inode_walk(struct cleancache_inode *ccinode) {} +static inline bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, + struct folio *folio) + { return false; } static inline int cleancache_backend_register_pool(const char *name) { return -EOPNOTSUPP; } static inline int cleancache_backend_get_folio(int pool_id, struct folio *folio) diff --git a/mm/cleancache.c b/mm/cleancache.c index 3acf46c0cdd1..6be86938c8fe 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -799,6 +799,64 @@ bool cleancache_invalidate_inode(struct inode *inode) return count > 0; } +struct cleancache_inode * +cleancache_start_inode_walk(struct inode *inode, unsigned long count) +{ + struct cleancache_inode *ccinode; + struct cleancache_fs *fs; + int fs_id; + + if (!inode) + return ERR_PTR(-EINVAL); + + fs_id = inode->i_sb->cleancache_id; + if (fs_id == 
CLEANCACHE_ID_INVALID) + return ERR_PTR(-EINVAL); + + fs = get_fs(fs_id); + if (!fs) + return NULL; + + ccinode = find_and_get_inode(fs, inode); + if (!ccinode) { + put_fs(fs); + return NULL; + } + + return ccinode; +} + +void cleancache_end_inode_walk(struct cleancache_inode *ccinode) +{ + struct cleancache_fs *fs = ccinode->fs; + + put_inode(ccinode); + put_fs(fs); +} + +bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, + struct folio *folio) +{ + struct folio *stored_folio; + void *src, *dst; + bool ret = false; + + xa_lock(&ccinode->folios); + stored_folio = xa_load(&ccinode->folios, folio->index); + if (stored_folio) { + rotate_lru_folio(stored_folio); + src = kmap_local_folio(stored_folio, 0); + dst = kmap_local_folio(folio, 0); + memcpy(dst, src, PAGE_SIZE); + kunmap_local(dst); + kunmap_local(src); + ret = true; + } + xa_unlock(&ccinode->folios); + + return ret; +} + /* Backend API */ /* * Register a new backend and add its folios for cleancache to use. diff --git a/mm/readahead.c b/mm/readahead.c index 3a4b5d58eeb6..878cc8dfa48e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -128,6 +128,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -146,12 +147,65 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); +static inline bool restore_from_cleancache(struct readahead_control *rac) +{ + XA_STATE(xas, &rac->mapping->i_pages, rac->_index); + struct cleancache_inode *ccinode; + struct folio *folio; + unsigned long end; + bool ret = true; + + int count = readahead_count(rac); + + /* Readahead should not have started yet. */ + VM_BUG_ON(rac->_batch_count != 0); + + if (!count) + return true; + + ccinode = cleancache_start_inode_walk(rac->mapping->host, count); + if (!ccinode) + return false; + + end = rac->_index + rac->_nr_pages - 1; + xas_for_each(&xas, folio, end) { + unsigned long nr; + + if (xas_retry(&xas, folio)) { + ret = false; + break; + } + + if (!cleancache_restore_from_inode(ccinode, folio)) { + ret = false; + break; + } + + nr = folio_nr_pages(folio); + folio_mark_uptodate(folio); + folio_unlock(folio); + rac->_index += nr; + rac->_nr_pages -= nr; + rac->ra->size -= nr; + if (rac->ra->async_size >= nr) + rac->ra->async_size -= nr; + } + + cleancache_end_inode_walk(ccinode); + + return ret; +} + static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; + /* Try to read all pages from the cleancache */ + if (restore_from_cleancache(rac)) + return; + if (!readahead_count(rac)) return; -- 2.51.1.851.g4ebd6896fd-goog Create sysfs API under /sys/kernel/mm/cleancache/ to report the following metrics: stored - number of successful cleancache folio stores skipped - number of folios skipped during cleancache store operation restored - number of successful cleancache folio restore operations missed - number of failed cleancache folio restore operations reclaimed - number of folios dropped due to their age recalled - number of folios dropped because cleancache backend took them back invalidated - number of folios dropped due to invalidation cached - number of folios currently cached in the cleancache In addition each pool creates a /sys/kernel/mm/cleancache/ directory containing the following metrics: size - number of folios in the pool cached - number of folios currently cached in the pool recalled - number of folios dropped from the pool because cleancache 
backend took them back Signed-off-by: Suren Baghdasaryan --- MAINTAINERS | 2 + mm/Kconfig | 8 ++ mm/Makefile | 1 + mm/cleancache.c | 113 +++++++++++++++++++++-- mm/cleancache_sysfs.c | 209 ++++++++++++++++++++++++++++++++++++++++++ mm/cleancache_sysfs.h | 58 ++++++++++++ 6 files changed, 383 insertions(+), 8 deletions(-) create mode 100644 mm/cleancache_sysfs.c create mode 100644 mm/cleancache_sysfs.h diff --git a/MAINTAINERS b/MAINTAINERS index 90a6fc0e742c..84c65441925c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6060,6 +6060,8 @@ L: linux-mm@kvack.org S: Maintained F: include/linux/cleancache.h F: mm/cleancache.c +F: mm/cleancache_sysfs.c +F: mm/cleancache_sysfs.h CLK API M: Russell King diff --git a/mm/Kconfig b/mm/Kconfig index 1255b543030b..e1a169d5e5de 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1049,6 +1049,14 @@ config CLEANCACHE If unsure, say N. +config CLEANCACHE_SYSFS + bool "Cleancache information through sysfs interface" + depends on CLEANCACHE && SYSFS + help + This option exposes sysfs attributes to get information from + cleancache. The user space can use this interface for querying + cleancache and individual cleancache pool metrics. + config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index b78073b87aea..a7a635f762ee 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o obj-$(CONFIG_CLEANCACHE) += cleancache.o +obj-$(CONFIG_CLEANCACHE_SYSFS) += cleancache_sysfs.o diff --git a/mm/cleancache.c b/mm/cleancache.c index 6be86938c8fe..e05393fb6cbc 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -11,6 +11,8 @@ #include #include +#include "cleancache_sysfs.h" + /* * Lock nesting: * ccinode->folios.xa_lock @@ -56,6 +58,8 @@ struct cleancache_inode { struct cleancache_pool { struct list_head folio_list; spinlock_t lock; /* protects folio_list */ + char *name; + struct cleancache_pool_stats *stats; }; #define CLEANCACHE_MAX_POOLS 64 @@ -104,6 +108,7 @@ static void attach_folio(struct folio *folio, struct cleancache_inode *ccinode, folio->cc_inode = ccinode; folio->cc_index = offset; + cleancache_pool_stat_inc(folio_pool(folio)->stats, POOL_CACHED); } static void detach_folio(struct folio *folio) @@ -112,6 +117,7 @@ static void detach_folio(struct folio *folio) folio->cc_inode = NULL; folio->cc_index = 0; + cleancache_pool_stat_dec(folio_pool(folio)->stats, POOL_CACHED); } static void folio_attachment(struct folio *folio, @@ -516,7 +522,7 @@ static bool store_into_inode(struct cleancache_fs *fs, ccinode = find_and_get_inode(fs, inode); if (!ccinode) { if (!workingset) - return false; + goto out; ccinode = add_and_get_inode(fs, inode); if (IS_ERR_OR_NULL(ccinode)) { @@ -536,6 +542,7 @@ static bool store_into_inode(struct cleancache_fs *fs, if (stored_folio) { if (!workingset) { move_folio_from_inode_to_pool(ccinode, offset, stored_folio); + cleancache_stat_inc(RECLAIMED); goto out_unlock; } rotate_lru_folio(stored_folio); @@ -551,6 +558,8 @@ static bool store_into_inode(struct cleancache_fs *fs, xa_lock(&ccinode->folios); if (!stored_folio) goto out_unlock; + + cleancache_stat_inc(RECLAIMED); } if (!store_folio_in_inode(ccinode, offset, stored_folio)) { @@ -562,6 +571,7 @@ static bool store_into_inode(struct cleancache_fs *fs, spin_unlock(&pool->lock); goto out_unlock; } + cleancache_stat_inc(STORED); add_folio_to_lru(stored_folio); } copy_folio_content(folio, stored_folio); @@ -573,6 +583,8 @@ 
static bool store_into_inode(struct cleancache_fs *fs, remove_inode_if_empty(ccinode); xa_unlock(&ccinode->folios); put_inode(ccinode); +out: + cleancache_stat_inc(SKIPPED); return ret; } @@ -583,23 +595,26 @@ static bool load_from_inode(struct cleancache_fs *fs, { struct cleancache_inode *ccinode; struct folio *stored_folio; - bool ret = false; ccinode = find_and_get_inode(fs, inode); - if (!ccinode) + if (!ccinode) { + cleancache_stat_inc(MISSED); return false; + } xa_lock(&ccinode->folios); stored_folio = xa_load(&ccinode->folios, offset); if (stored_folio) { rotate_lru_folio(stored_folio); copy_folio_content(stored_folio, folio); - ret = true; + cleancache_stat_inc(RESTORED); + } else { + cleancache_stat_inc(MISSED); } xa_unlock(&ccinode->folios); put_inode(ccinode); - return ret; + return !!stored_folio; } static bool invalidate_folio(struct cleancache_fs *fs, @@ -614,8 +629,10 @@ static bool invalidate_folio(struct cleancache_fs *fs, xa_lock(&ccinode->folios); folio = xa_load(&ccinode->folios, offset); - if (folio) + if (folio) { move_folio_from_inode_to_pool(ccinode, offset, folio); + cleancache_stat_inc(INVALIDATED); + } xa_unlock(&ccinode->folios); put_inode(ccinode); @@ -636,6 +653,7 @@ static unsigned int invalidate_inode(struct cleancache_fs *fs, ret = erase_folios_from_inode(ccinode, &xas); xas_unlock(&xas); put_inode(ccinode); + cleancache_stat_add(INVALIDATED, ret); return ret; } @@ -643,6 +661,53 @@ static unsigned int invalidate_inode(struct cleancache_fs *fs, return 0; } +/* Sysfs helpers */ +#ifdef CONFIG_CLEANCACHE_SYSFS + +static struct kobject *kobj_sysfs_root; + +static void __init cleancache_sysfs_init(void) +{ + struct cleancache_pool *pool; + int pool_id, pool_count; + struct kobject *kobj; + + kobj = cleancache_sysfs_create_root(); + if (IS_ERR(kobj)) { + pr_warn("Failed to create cleancache sysfs root\n"); + return; + } + + kobj_sysfs_root = kobj; + if (!kobj_sysfs_root) + return; + + pool_count = atomic_read(&nr_pools); + pool = &pools[0]; + for (pool_id = 0; pool_id < pool_count; pool_id++, pool++) + if (cleancache_sysfs_create_pool(kobj_sysfs_root, pool->stats, pool->name)) + pr_warn("Failed to create sysfs nodes for \'%s\' cleancache backend\n", + pool->name); +} + +static void cleancache_sysfs_pool_init(struct cleancache_pool_stats *pool_stats, + const char *name) +{ + /* Skip if sysfs was not initialized yet. 
*/ + if (!kobj_sysfs_root) + return; + + if (cleancache_sysfs_create_pool(kobj_sysfs_root, pool_stats, name)) + pr_warn("Failed to create sysfs nodes for \'%s\' cleancache backend\n", + name); +} + +#else /* CONFIG_CLEANCACHE_SYSFS */ +static inline void cleancache_sysfs_init(void) {} +static inline void cleancache_sysfs_pool_init(struct cleancache_pool_stats *pool_stats, + const char *name) {} +#endif /* CONFIG_CLEANCACHE_SYSFS */ + /* Hooks into MM and FS */ int cleancache_add_fs(struct super_block *sb) { @@ -820,6 +885,7 @@ cleancache_start_inode_walk(struct inode *inode, unsigned long count) ccinode = find_and_get_inode(fs, inode); if (!ccinode) { put_fs(fs); + cleancache_stat_add(MISSED, count); return NULL; } @@ -850,7 +916,10 @@ bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, memcpy(dst, src, PAGE_SIZE); kunmap_local(dst); kunmap_local(src); + cleancache_stat_inc(RESTORED); ret = true; + } else { + cleancache_stat_inc(MISSED); } xa_unlock(&ccinode->folios); @@ -864,9 +933,18 @@ bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, */ int cleancache_backend_register_pool(const char *name) { + struct cleancache_pool_stats *pool_stats; struct cleancache_pool *pool; + char *pool_name; int pool_id; + if (!name) + return -EINVAL; + + pool_name = kstrdup(name, GFP_KERNEL); + if (!pool_name) + return -ENOMEM; + /* pools_lock prevents concurrent registrations */ spin_lock(&pools_lock); pool_id = atomic_read(&nr_pools); @@ -878,12 +956,22 @@ int cleancache_backend_register_pool(const char *name) pool = &pools[pool_id]; INIT_LIST_HEAD(&pool->folio_list); spin_lock_init(&pool->lock); + pool->name = pool_name; /* Ensure above stores complete before we increase the count */ atomic_set_release(&nr_pools, pool_id + 1); spin_unlock(&pools_lock); + pool_stats = cleancache_create_pool_stats(pool_id); + if (!IS_ERR(pool_stats)) { + pool->stats = pool_stats; + cleancache_sysfs_pool_init(pool_stats, pool->name); + } else { + pr_warn("Failed to create pool stats for \'%s\' cleancache backend\n", + pool->name); + } + pr_info("Registered \'%s\' cleancache backend, pool id %d\n", - name ? : "none", pool_id); + name, pool_id); return pool_id; } @@ -930,10 +1018,13 @@ int cleancache_backend_get_folio(int pool_id, struct folio *folio) goto again; } + cleancache_stat_inc(RECALLED); + cleancache_pool_stat_inc(folio_pool(folio)->stats, POOL_RECALLED); put_inode(ccinode); out: VM_BUG_ON_FOLIO(folio_ref_count(folio) != 0, (folio)); clear_cleancache_folio(folio); + cleancache_pool_stat_dec(pool->stats, POOL_SIZE); return 0; } @@ -955,6 +1046,7 @@ int cleancache_backend_put_folio(int pool_id, struct folio *folio) INIT_LIST_HEAD(&folio->lru); spin_lock(&pool->lock); add_folio_to_pool(folio, pool); + cleancache_pool_stat_inc(pool->stats, POOL_SIZE); spin_unlock(&pool->lock); return 0; @@ -967,6 +1059,7 @@ int cleancache_backend_put_folios(int pool_id, struct list_head *folios) LIST_HEAD(unused_folios); struct folio *folio; struct folio *tmp; + int count = 0; list_for_each_entry_safe(folio, tmp, folios, lru) { /* Do not support large folios yet */ @@ -976,10 +1069,12 @@ int cleancache_backend_put_folios(int pool_id, struct list_head *folios) init_cleancache_folio(folio, pool_id); list_move(&folio->lru, &unused_folios); + count++; } spin_lock(&pool->lock); list_splice_init(&unused_folios, &pool->folio_list); + cleancache_pool_stat_add(pool->stats, POOL_SIZE, count); spin_unlock(&pool->lock); return list_empty(folios) ? 
0 : -EINVAL; @@ -992,6 +1087,8 @@ static int __init init_cleancache(void) if (!slab_inode) return -ENOMEM; + cleancache_sysfs_init(); + return 0; } -core_initcall(init_cleancache); +subsys_initcall(init_cleancache); diff --git a/mm/cleancache_sysfs.c b/mm/cleancache_sysfs.c new file mode 100644 index 000000000000..5ad7ae84ca1d --- /dev/null +++ b/mm/cleancache_sysfs.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include "cleancache_sysfs.h" + +static atomic64_t stats[CLEANCACHE_STAT_NR]; + +void cleancache_stat_inc(enum cleancache_stat type) +{ + atomic64_inc(&stats[type]); +} + +void cleancache_stat_add(enum cleancache_stat type, unsigned long delta) +{ + atomic64_add(delta, &stats[type]); +} + +void cleancache_pool_stat_inc(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type) +{ + atomic64_inc(&pool_stats->stats[type]); +} + +void cleancache_pool_stat_dec(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type) +{ + atomic64_dec(&pool_stats->stats[type]); +} + +void cleancache_pool_stat_add(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type, long delta) +{ + atomic64_add(delta, &pool_stats->stats[type]); +} + +#define CLEANCACHE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static inline struct cleancache_pool_stats *kobj_to_stats(struct kobject *kobj) +{ + return container_of(kobj, struct cleancache_pool_stats, kobj); +} + +static ssize_t stored_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[STORED])); +} +CLEANCACHE_ATTR_RO(stored); + +static ssize_t skipped_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[SKIPPED])); +} +CLEANCACHE_ATTR_RO(skipped); + +static ssize_t restored_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[RESTORED])); +} +CLEANCACHE_ATTR_RO(restored); + +static ssize_t missed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[MISSED])); +} +CLEANCACHE_ATTR_RO(missed); + +static ssize_t reclaimed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[RECLAIMED])); +} +CLEANCACHE_ATTR_RO(reclaimed); + +static ssize_t recalled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[RECALLED])); +} +CLEANCACHE_ATTR_RO(recalled); + +static ssize_t invalidated_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&stats[INVALIDATED])); +} +CLEANCACHE_ATTR_RO(invalidated); + +static ssize_t cached_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + s64 dropped = atomic64_read(&stats[INVALIDATED]) + + atomic64_read(&stats[RECLAIMED]) + + atomic64_read(&stats[RECALLED]); + + return sysfs_emit(buf, "%llu\n", (u64)(atomic64_read(&stats[STORED]) - dropped)); +} +CLEANCACHE_ATTR_RO(cached); + +static struct attribute *cleancache_attrs[] = { + &stored_attr.attr, + &skipped_attr.attr, + &restored_attr.attr, + &missed_attr.attr, + &reclaimed_attr.attr, + &recalled_attr.attr, + &invalidated_attr.attr, + &cached_attr.attr, + NULL, +}; 
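+/* ATTRIBUTE_GROUPS() defines cleancache_groups[], used below when populating the sysfs root directory. */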
+ATTRIBUTE_GROUPS(cleancache); + +#define CLEANCACHE_POOL_ATTR_RO(_name) \ + static struct kobj_attribute _name##_pool_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _name##_pool_show, \ +} + +static ssize_t size_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (u64)atomic64_read(&kobj_to_stats(kobj)->stats[POOL_SIZE])); +} +CLEANCACHE_POOL_ATTR_RO(size); + +static ssize_t cached_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (u64)atomic64_read(&kobj_to_stats(kobj)->stats[POOL_CACHED])); +} +CLEANCACHE_POOL_ATTR_RO(cached); + +static ssize_t recalled_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (u64)atomic64_read(&kobj_to_stats(kobj)->stats[POOL_RECALLED])); +} +CLEANCACHE_POOL_ATTR_RO(recalled); + + +static struct attribute *cleancache_pool_attrs[] = { + &size_pool_attr.attr, + &cached_pool_attr.attr, + &recalled_pool_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cleancache_pool); + +static void cleancache_pool_release(struct kobject *kobj) +{ + kfree(kobj_to_stats(kobj)); +} + +static const struct kobj_type cleancache_pool_ktype = { + .release = &cleancache_pool_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cleancache_pool_groups, +}; + +struct cleancache_pool_stats *cleancache_create_pool_stats(int pool_id) +{ + struct cleancache_pool_stats *pool_stats; + + pool_stats = kzalloc(sizeof(*pool_stats), GFP_KERNEL); + if (!pool_stats) + return ERR_PTR(-ENOMEM); + + pool_stats->pool_id = pool_id; + + return pool_stats; +} + +struct kobject * __init cleancache_sysfs_create_root(void) +{ + struct kobject *kobj; + int err; + + kobj = kobject_create_and_add("cleancache", mm_kobj); + if (unlikely(!kobj)) { + pr_err("Failed to create cleancache kobject\n"); + return ERR_PTR(-ENOMEM); + } + + err = sysfs_create_group(kobj, cleancache_groups[0]); + if (err) { + kobject_put(kobj); + pr_err("Failed to create cleancache group kobject\n"); + return ERR_PTR(err); + } + + return kobj; +} + +int cleancache_sysfs_create_pool(struct kobject *root_kobj, + struct cleancache_pool_stats *pool_stats, + const char *name) +{ + return kobject_init_and_add(&pool_stats->kobj, &cleancache_pool_ktype, + root_kobj, name); +} diff --git a/mm/cleancache_sysfs.h b/mm/cleancache_sysfs.h new file mode 100644 index 000000000000..fb8d2a72be63 --- /dev/null +++ b/mm/cleancache_sysfs.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __CLEANCACHE_SYSFS_H__ +#define __CLEANCACHE_SYSFS_H__ + +enum cleancache_stat { + STORED, + SKIPPED, + RESTORED, + MISSED, + RECLAIMED, + RECALLED, + INVALIDATED, + CLEANCACHE_STAT_NR +}; + +enum cleancache_pool_stat { + POOL_SIZE, + POOL_CACHED, + POOL_RECALLED, + CLEANCACHE_POOL_STAT_NR +}; + +struct cleancache_pool_stats { + struct kobject kobj; + int pool_id; + atomic64_t stats[CLEANCACHE_POOL_STAT_NR]; +}; + +#ifdef CONFIG_CLEANCACHE_SYSFS +void cleancache_stat_inc(enum cleancache_stat type); +void cleancache_stat_add(enum cleancache_stat type, unsigned long delta); +void cleancache_pool_stat_inc(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type); +void cleancache_pool_stat_dec(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type); +void cleancache_pool_stat_add(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type, long delta); +struct cleancache_pool_stats 
*cleancache_create_pool_stats(int pool_id); +struct kobject * __init cleancache_sysfs_create_root(void); +int cleancache_sysfs_create_pool(struct kobject *root_kobj, + struct cleancache_pool_stats *pool_stats, + const char *name); + +#else /* CONFIG_CLEANCACHE_SYSFS */ +static inline void cleancache_stat_inc(enum cleancache_stat type) {} +static inline void cleancache_stat_add(enum cleancache_stat type, unsigned long delta) {} +static inline void cleancache_pool_stat_inc(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type) {} +static inline void cleancache_pool_stat_dec(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type) {} +static inline void cleancache_pool_stat_add(struct cleancache_pool_stats *pool_stats, + enum cleancache_pool_stat type, long delta) {} +static inline +struct cleancache_pool_stats *cleancache_create_pool_stats(int pool_id) { return NULL; } + +#endif /* CONFIG_CLEANCACHE_SYSFS */ + +#endif /* __CLEANCACHE_SYSFS_H__ */ -- 2.51.1.851.g4ebd6896fd-goog Introduce a kunit test that creates fake inodes, fills them with folios containing predefined content, registers a cleancache pool, allocates and donates folios to the new pool. After this initialization it runs several scenarios: 1. cleancache_restore_test - stores fake inode pages into cleancache, then restores them into auxiliary folios and checks restored content; 2. cleancache_walk_and_restore_test - stores fake inode pages, then restores and verifies them using the cleancache_start_inode_walk()/cleancache_restore_from_inode() walk API; 3. cleancache_invalidate_test - stores a folio, successfully restores it, invalidates it and tries to restore again expecting a failure; 4. cleancache_reclaim_test - fills up the cleancache, stores one more folio and verifies that the oldest folio got reclaimed; 5. cleancache_backend_api_test - takes all donated folios and puts them back verifying the results. Signed-off-by: Suren Baghdasaryan --- MAINTAINERS | 1 + mm/Kconfig.debug | 13 ++ mm/Makefile | 1 + mm/cleancache.c | 35 ++- mm/tests/Makefile | 6 + mm/tests/cleancache_kunit.c | 420 ++++++++++++++++++++++++++++++++++++ 6 files changed, 475 insertions(+), 1 deletion(-) create mode 100644 mm/tests/Makefile create mode 100644 mm/tests/cleancache_kunit.c diff --git a/MAINTAINERS b/MAINTAINERS index 84c65441925c..eb35973e10c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6062,6 +6062,7 @@ F: include/linux/cleancache.h F: mm/cleancache.c F: mm/cleancache_sysfs.c F: mm/cleancache_sysfs.h +F: mm/tests/cleancache_kunit.c CLK API M: Russell King diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 32b65073d0cc..c3482f7bc977 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -309,3 +309,16 @@ config PER_VMA_LOCK_STATS overhead in the page fault path. If in doubt, say N. + +config CLEANCACHE_KUNIT + tristate "KUnit test for cleancache" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on CLEANCACHE + default KUNIT_ALL_TESTS + help + This builds the cleancache unit test. + Tests the cleancache functionality. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N.
diff --git a/mm/Makefile b/mm/Makefile index a7a635f762ee..845841a140e3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -70,6 +70,7 @@ obj-y += init-mm.o obj-y += memblock.o obj-y += $(memory-hotplug-y) obj-y += slub.o +obj-y += tests/ ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o diff --git a/mm/cleancache.c b/mm/cleancache.c index e05393fb6cbc..0ed67afd23ec 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "cleancache_sysfs.h" @@ -72,6 +74,28 @@ static DEFINE_SPINLOCK(pools_lock); /* protects pools */ static LIST_HEAD(cleancache_lru); static DEFINE_SPINLOCK(lru_lock); /* protects cleancache_lru */ +#if IS_ENABLED(CONFIG_CLEANCACHE_KUNIT) + +static bool is_pool_allowed(int pool_id) +{ + struct kunit *test = kunit_get_current_test(); + + /* Restrict kunit tests to using only the test pool */ + return test && *((int *)test->priv) == pool_id; +} + +#else /* CONFIG_CLEANCACHE_KUNIT */ + +static bool is_pool_allowed(int pool_id) { return true; } + +#endif /* CONFIG_CLEANCACHE_KUNIT */ + +#if IS_MODULE(CONFIG_CLEANCACHE_KUNIT) +#define EXPORT_SYMBOL_FOR_KUNIT(x) EXPORT_SYMBOL(x) +#else +#define EXPORT_SYMBOL_FOR_KUNIT(x) +#endif + static inline void init_cleancache_folio(struct folio *folio, int pool_id) { /* Folio is being donated and has no refs. No locking is needed. */ @@ -178,7 +202,7 @@ static struct folio *pick_folio_from_any_pool(void) for (int i = 0; i < count; i++) { pool = &pools[i]; spin_lock(&pool->lock); - if (!list_empty(&pool->folio_list)) { + if (!list_empty(&pool->folio_list) && is_pool_allowed(i)) { folio = list_last_entry(&pool->folio_list, struct folio, lru); WARN_ON(!remove_folio_from_pool(folio, pool)); @@ -737,6 +761,7 @@ int cleancache_add_fs(struct super_block *sb) return ret; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_add_fs); void cleancache_remove_fs(struct super_block *sb) { @@ -754,6 +779,7 @@ void cleancache_remove_fs(struct super_block *sb) /* free the object */ put_fs(fs); } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_remove_fs); bool cleancache_store_folio(struct inode *inode, struct folio *folio) { @@ -783,6 +809,7 @@ bool cleancache_store_folio(struct inode *inode, struct folio *folio) return ret; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_store_folio); bool cleancache_restore_folio(struct inode *inode, struct folio *folio) { @@ -810,6 +837,7 @@ bool cleancache_restore_folio(struct inode *inode, struct folio *folio) return ret; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_restore_folio); bool cleancache_invalidate_folio(struct inode *inode, struct folio *folio) { @@ -840,6 +868,7 @@ bool cleancache_invalidate_folio(struct inode *inode, struct folio *folio) return ret; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_invalidate_folio); bool cleancache_invalidate_inode(struct inode *inode) { @@ -863,6 +892,7 @@ bool cleancache_invalidate_inode(struct inode *inode) return count > 0; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_invalidate_inode); struct cleancache_inode * cleancache_start_inode_walk(struct inode *inode, unsigned long count) @@ -891,6 +921,7 @@ cleancache_start_inode_walk(struct inode *inode, unsigned long count) return ccinode; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_start_inode_walk); void cleancache_end_inode_walk(struct cleancache_inode *ccinode) { @@ -899,6 +930,7 @@ void cleancache_end_inode_walk(struct cleancache_inode *ccinode) put_inode(ccinode); put_fs(fs); } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_end_inode_walk); bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, struct folio 
*folio) @@ -925,6 +957,7 @@ bool cleancache_restore_from_inode(struct cleancache_inode *ccinode, return ret; } +EXPORT_SYMBOL_FOR_KUNIT(cleancache_restore_from_inode); /* Backend API */ /* diff --git a/mm/tests/Makefile b/mm/tests/Makefile new file mode 100644 index 000000000000..fac2e964b4d5 --- /dev/null +++ b/mm/tests/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for tests of kernel mm subsystem. + +# KUnit tests +obj-$(CONFIG_CLEANCACHE_KUNIT) += cleancache_kunit.o diff --git a/mm/tests/cleancache_kunit.c b/mm/tests/cleancache_kunit.c new file mode 100644 index 000000000000..bb431f8021a6 --- /dev/null +++ b/mm/tests/cleancache_kunit.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit test for the Cleancache. + * + * Copyright (C) 2025, Google LLC. + * Author: Suren Baghdasaryan + */ +#include + +#include +#include +#include + +#include "../internal.h" + +#define INODE_COUNT 5 +#define FOLIOS_PER_INODE 4 +#define FOLIO_COUNT (INODE_COUNT * FOLIOS_PER_INODE) + +static const u32 TEST_CONTENT = 0xBADCAB32; + +struct inode_data { + struct address_space mapping; + struct inode inode; + struct folio *folios[FOLIOS_PER_INODE]; +}; + +static struct test_data { + /* Mock a fs */ + struct super_block sb; + struct inode_data inodes[INODE_COUNT]; + /* Folios donated to the cleancache pools */ + struct folio *pool_folios[FOLIO_COUNT]; + /* Auxiliary folio */ + struct folio *aux_folio; + int pool_id; +} test_data; + +static void set_folio_content(struct folio *folio, u32 value) +{ + u32 *data; + + data = kmap_local_folio(folio, 0); + *data = value; + kunmap_local(data); +} + +static u32 get_folio_content(struct folio *folio) +{ + unsigned long value; + u32 *data; + + data = kmap_local_folio(folio, 0); + value = *data; + kunmap_local(data); + + return value; +} + +static void fill_cleancache(struct kunit *test) +{ + struct inode_data *inode_data; + struct folio *folio; + + /* Store inode folios into cleancache */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + inode_data = &test_data.inodes[inode]; + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio = inode_data->folios[fidx]; + KUNIT_EXPECT_NOT_NULL(test, folio); + folio_lock(folio); /* Folio has to be locked */ + folio_set_workingset(folio); + KUNIT_EXPECT_TRUE(test, cleancache_store_folio(&inode_data->inode, folio)); + folio_unlock(folio); + } + } +} + +static int cleancache_suite_init(struct kunit_suite *suite) +{ + LIST_HEAD(pool_folios); + + /* Add a fake fs superblock */ + cleancache_add_fs(&test_data.sb); + + /* Initialize fake inodes */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + struct inode_data *inode_data = &test_data.inodes[inode]; + + inode_data->inode.i_sb = &test_data.sb; + inode_data->inode.i_ino = inode; + inode_data->inode.i_mapping = &inode_data->mapping; + inode_data->mapping.host = &inode_data->inode; + + /* Allocate folios for the inode */ + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + struct folio *folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + + if (!folio) + return -ENOMEM; + + set_folio_content(folio, (u32)fidx); + folio->mapping = &inode_data->mapping; + folio->index = PAGE_SIZE * fidx; + inode_data->folios[fidx] = folio; + } + } + + /* Register new cleancache pool and donate test folios */ + test_data.pool_id = cleancache_backend_register_pool("kunit_pool"); + if (test_data.pool_id < 0) + return -EINVAL; + + /* Allocate folios and put them to cleancache */ + for (int fidx = 0; fidx < FOLIO_COUNT; fidx++) { + struct folio 
*folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + + if (!folio) + return -ENOMEM; + + folio_ref_freeze(folio, 1); + test_data.pool_folios[fidx] = folio; + list_add(&folio->lru, &pool_folios); + } + + cleancache_backend_put_folios(test_data.pool_id, &pool_folios); + + /* Allocate auxiliary folio for testing */ + test_data.aux_folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!test_data.aux_folio) + return -ENOMEM; + + return 0; +} + +static void cleancache_suite_exit(struct kunit_suite *suite) +{ + /* Take back donated folios and free them */ + for (int fidx = 0; fidx < FOLIO_COUNT; fidx++) { + struct folio *folio = test_data.pool_folios[fidx]; + + if (folio) { + if (!cleancache_backend_get_folio(test_data.pool_id, + folio)) + set_page_refcounted(&folio->page); + folio_put(folio); + } + } + + /* Free the auxiliary folio */ + if (test_data.aux_folio) { + test_data.aux_folio->mapping = NULL; + folio_put(test_data.aux_folio); + } + + /* Free inode folios */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + struct folio *folio = test_data.inodes[inode].folios[fidx]; + + if (folio) { + folio->mapping = NULL; + folio_put(folio); + } + } + } + + cleancache_remove_fs(&test_data.sb); +} + +static int cleancache_test_init(struct kunit *test) +{ + /* Pass pool_id to cleancache to restrict pools that can be used for tests */ + test->priv = &test_data.pool_id; + + return 0; +} + +static void cleancache_restore_test(struct kunit *test) +{ + struct inode_data *inode_data; + struct folio *folio; + + /* Store inode folios into cleancache */ + fill_cleancache(test); + + /* Restore and validate folios stored in cleancache */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + inode_data = &test_data.inodes[inode]; + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio = inode_data->folios[fidx]; + test_data.aux_folio->mapping = folio->mapping; + test_data.aux_folio->index = folio->index; + KUNIT_EXPECT_TRUE(test, cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + KUNIT_EXPECT_EQ(test, get_folio_content(test_data.aux_folio), + get_folio_content(folio)); + } + } +} + +static void cleancache_walk_and_restore_test(struct kunit *test) +{ + struct cleancache_inode *ccinode; + struct inode_data *inode_data; + struct folio *folio; + + /* Store inode folios into cleancache */ + fill_cleancache(test); + + /* Restore and validate folios stored in the first inode */ + inode_data = &test_data.inodes[0]; + ccinode = cleancache_start_inode_walk(&inode_data->inode, FOLIOS_PER_INODE); + KUNIT_EXPECT_NOT_NULL(test, ccinode); + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio = inode_data->folios[fidx]; + test_data.aux_folio->mapping = folio->mapping; + test_data.aux_folio->index = folio->index; + KUNIT_EXPECT_TRUE(test, cleancache_restore_from_inode(ccinode, + test_data.aux_folio)); + KUNIT_EXPECT_EQ(test, get_folio_content(test_data.aux_folio), + get_folio_content(folio)); + } + cleancache_end_inode_walk(ccinode); +} + +static void cleancache_invalidate_test(struct kunit *test) +{ + struct inode_data *inode_data; + struct folio *folio; + + /* Store inode folios into cleancache */ + fill_cleancache(test); + + /* Invalidate one folio */ + inode_data = &test_data.inodes[0]; + folio = inode_data->folios[0]; + test_data.aux_folio->mapping = folio->mapping; + test_data.aux_folio->index = folio->index; + KUNIT_EXPECT_TRUE(test, cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + folio_lock(folio); 
/* Folio has to be locked */ + KUNIT_EXPECT_TRUE(test, cleancache_invalidate_folio(&inode_data->inode, + inode_data->folios[0])); + folio_unlock(folio); + KUNIT_EXPECT_FALSE(test, cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + + /* Invalidate one node */ + inode_data = &test_data.inodes[1]; + KUNIT_EXPECT_TRUE(test, cleancache_invalidate_inode(&inode_data->inode)); + + /* Verify results */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + inode_data = &test_data.inodes[inode]; + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio = inode_data->folios[fidx]; + test_data.aux_folio->mapping = folio->mapping; + test_data.aux_folio->index = folio->index; + if (inode == 0 && fidx == 0) { + /* Folio should be missing */ + KUNIT_EXPECT_FALSE(test, + cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + continue; + } + if (inode == 1) { + /* Folios in the node should be missing */ + KUNIT_EXPECT_FALSE(test, + cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + continue; + } + KUNIT_EXPECT_TRUE(test, + cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio)); + KUNIT_EXPECT_EQ(test, get_folio_content(test_data.aux_folio), + get_folio_content(folio)); + } + } +} + +static void cleancache_reclaim_test(struct kunit *test) +{ + struct inode_data *inode_data; + struct inode_data *inode_new; + unsigned long new_index; + struct folio *folio; + + /* Store inode folios into cleancache */ + fill_cleancache(test); + + /* + * Store one extra new folio. There should be no free folios, so the + * oldest folio will be reclaimed to store new folio. Add it into the + * last node at the next unoccupied offset. + */ + inode_new = &test_data.inodes[INODE_COUNT - 1]; + new_index = inode_new->folios[FOLIOS_PER_INODE - 1]->index + PAGE_SIZE; + + test_data.aux_folio->mapping = &inode_new->mapping; + test_data.aux_folio->index = new_index; + set_folio_content(test_data.aux_folio, TEST_CONTENT); + folio_lock(test_data.aux_folio); /* Folio has to be locked */ + folio_set_workingset(test_data.aux_folio); + KUNIT_EXPECT_TRUE(test, cleancache_store_folio(&inode_new->inode, test_data.aux_folio)); + folio_unlock(test_data.aux_folio); + + /* Verify results */ + for (int inode = 0; inode < INODE_COUNT; inode++) { + inode_data = &test_data.inodes[inode]; + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio = inode_data->folios[fidx]; + test_data.aux_folio->mapping = folio->mapping; + test_data.aux_folio->index = folio->index; + /* + * The first folio of the first node was added first, + * so it's the oldest and must have been reclaimed. 
+ */ + if (inode == 0 && fidx == 0) { + /* Reclaimed folio should be missing */ + KUNIT_EXPECT_FALSE_MSG(test, + cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio), + "inode %d, folio %d is invalid\n", inode, fidx); + continue; + } + KUNIT_EXPECT_TRUE_MSG(test, + cleancache_restore_folio(&inode_data->inode, + test_data.aux_folio), + "inode %d, folio %d is invalid\n", + inode, fidx); + KUNIT_EXPECT_EQ_MSG(test, get_folio_content(test_data.aux_folio), + get_folio_content(folio), + "inode %d, folio %d content is invalid\n", + inode, fidx); + } + } + + /* Auxiliary folio should be stored */ + test_data.aux_folio->mapping = &inode_new->mapping; + test_data.aux_folio->index = new_index; + KUNIT_EXPECT_TRUE_MSG(test, + cleancache_restore_folio(&inode_new->inode, test_data.aux_folio), + "inode %lu, folio %ld is invalid\n", + inode_new->inode.i_ino, new_index); + KUNIT_EXPECT_EQ_MSG(test, get_folio_content(test_data.aux_folio), TEST_CONTENT, + "inode %lu, folio %ld content is invalid\n", + inode_new->inode.i_ino, new_index); +} + +static void cleancache_backend_api_test(struct kunit *test) +{ + struct folio *folio; + LIST_HEAD(folios); + int used = 0; + + /* Store inode folios into cleancache */ + fill_cleancache(test); + + /* Get all donated folios back */ + for (int fidx = 0; fidx < FOLIO_COUNT; fidx++) { + KUNIT_EXPECT_EQ(test, cleancache_backend_get_folio(test_data.pool_id, + test_data.pool_folios[fidx]), 0); + set_page_refcounted(&test_data.pool_folios[fidx]->page); + } + + /* Try putting a refcounted folio */ + KUNIT_EXPECT_NE(test, cleancache_backend_put_folio(test_data.pool_id, + test_data.pool_folios[0]), 0); + + /* Put some of the folios back into cleancache */ + for (int fidx = 0; fidx < FOLIOS_PER_INODE; fidx++) { + folio_ref_freeze(test_data.pool_folios[fidx], 1); + KUNIT_EXPECT_EQ(test, cleancache_backend_put_folio(test_data.pool_id, + test_data.pool_folios[fidx]), 0); + } + + /* Put the rest back into cleancache but keep half of folios still refcounted */ + for (int fidx = FOLIOS_PER_INODE; fidx < FOLIO_COUNT; fidx++) { + if (fidx % 2) + folio_ref_freeze(test_data.pool_folios[fidx], 1); + else + used++; + list_add(&test_data.pool_folios[fidx]->lru, &folios); + } + KUNIT_EXPECT_NE(test, cleancache_backend_put_folios(test_data.pool_id, + &folios), 0); + /* Used folios should be still in the list */ + KUNIT_EXPECT_EQ(test, list_count_nodes(&folios), used); + + /* Release refcounts and put the remaining folios into cleancache */ + list_for_each_entry(folio, &folios, lru) + folio_ref_freeze(folio, 1); + KUNIT_EXPECT_EQ(test, cleancache_backend_put_folios(test_data.pool_id, + &folios), 0); + KUNIT_EXPECT_TRUE(test, list_empty(&folios)); +} + +static struct kunit_case cleancache_test_cases[] = { + KUNIT_CASE(cleancache_restore_test), + KUNIT_CASE(cleancache_walk_and_restore_test), + KUNIT_CASE(cleancache_invalidate_test), + KUNIT_CASE(cleancache_reclaim_test), + KUNIT_CASE(cleancache_backend_api_test), + {}, +}; + +static struct kunit_suite hashtable_test_module = { + .name = "cleancache", + .init = cleancache_test_init, + .suite_init = cleancache_suite_init, + .suite_exit = cleancache_suite_exit, + .test_cases = cleancache_test_cases, +}; + +kunit_test_suites(&hashtable_test_module); + +MODULE_DESCRIPTION("KUnit test for the Kernel Cleancache"); +MODULE_LICENSE("GPL"); -- 2.51.1.851.g4ebd6896fd-goog Document cleancache, its APIs and sysfs interface. 
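For illustration, the backend side of the documented API boils down to registering a pool and donating folios to it. The sketch below is not part of the patch and omits error handling; it only mirrors what the KUnit suite above already does with cleancache_backend_register_pool(), folio_ref_freeze() and cleancache_backend_put_folios(), and the example_* names are made up:

static int example_backend_init(void)
{
	LIST_HEAD(folios);
	int pool_id, i;

	pool_id = cleancache_backend_register_pool("example_pool");
	if (pool_id < 0)
		return pool_id;

	for (i = 0; i < 128; i++) {
		struct folio *folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);

		if (!folio)
			break;
		/* Donated folios are handed over with a frozen refcount */
		folio_ref_freeze(folio, 1);
		list_add(&folio->lru, &folios);
	}
	/* Donate the whole batch to the new pool */
	cleancache_backend_put_folios(pool_id, &folios);

	return 0;
}

Whenever the backend needs the memory back it calls cleancache_backend_get_folio() on a donated folio, which succeeds without delay since cleancache only keeps clean, re-readable data there.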
Signed-off-by: Suren Baghdasaryan --- .../admin-guide/mm/cleancache_sysfs.rst | 51 ++++++++++++++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/mm/cleancache.rst | 68 +++++++++++++++++++ Documentation/mm/index.rst | 1 + MAINTAINERS | 2 + 5 files changed, 123 insertions(+) create mode 100644 Documentation/admin-guide/mm/cleancache_sysfs.rst create mode 100644 Documentation/mm/cleancache.rst diff --git a/Documentation/admin-guide/mm/cleancache_sysfs.rst b/Documentation/admin-guide/mm/cleancache_sysfs.rst new file mode 100644 index 000000000000..503f17008046 --- /dev/null +++ b/Documentation/admin-guide/mm/cleancache_sysfs.rst @@ -0,0 +1,51 @@ +========================== +Cleancache Sysfs Interface +========================== + +If CONFIG_CLEANCACHE_SYSFS is enabled, monitoring of cleancache performance +can be done via sysfs in the ``/sys/kernel/mm/cleancache`` directory. +The effectiveness of cleancache can be measured (across all filesystems) +with provided stats. +Global stats are published directly under ``/sys/kernel/mm/cleancache`` and +include: + +``stored`` + number of successful cleancache folio stores. + +``skipped`` + number of folios skipped during cleancache store operation. + +``restored`` + number of successful cleancache folio restore operations. + +``missed`` + number of failed cleancache folio restore operations. + +``reclaimed`` + number of folios reclaimed from the cleancache due to insufficient + memory. + +``recalled`` + number of times cleancache folio content was discarded as a result + of the cleancache backend taking the folio back. + +``invalidated`` + number of times cleancache folio content was discarded as a result + of invalidation. + +``cached`` + number of folios currently cached in the cleancache. + +Per-pool stats are published under ``/sys/kernel/mm/cleancache/`` +where "pool name" is the name pool was registered under. These stats +include: + +``size`` + number of folios donated to this pool. + +``cached`` + number of folios currently cached in the pool. + +``recalled`` + number of times cleancache folio content was discarded as a result + of the cleancache backend taking the folio back from the pool. diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ebc83ca20fdc..e22336e5c9d2 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -25,6 +25,7 @@ the Linux memory management. :maxdepth: 1 concepts + cleancache_sysfs cma_debugfs damon/index hugetlbpage diff --git a/Documentation/mm/cleancache.rst b/Documentation/mm/cleancache.rst new file mode 100644 index 000000000000..bd4ee7df2125 --- /dev/null +++ b/Documentation/mm/cleancache.rst @@ -0,0 +1,68 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Cleancache +========== + +Motivation +========== + +Cleancache is a feature to utilize unused reserved memory for extending +page cache. + +Cleancache can be thought of as a folio-granularity victim cache for clean +file-backed pages that the kernel's pageframe replacement algorithm would +like to keep around, but can't since there isn't enough memory. When the +memory reclaim mechanism "evicts" a folio, it stores the data contained +in the folio into cleancache memory which is not directly accessible or +addressable by the kernel and is of unknown and possibly time-varying +size. 
+ +Later, when a filesystem wishes to access a folio in a file on disk, it +first checks cleancache to see if it already contains required data; if +it does, the folio data is copied into the kernel and a disk access is +avoided. + +The memory cleancache uses is donated by other system components, which +reserve memory not directly addressable by the kernel. By donating this +memory to cleancache, the memory owner enables its utilization while it +is not used. Memory donation is done using cleancache backend API and any +donated memory can be taken back at any time by its donor with no delay +and with guaranteed success. Since cleancache uses this memory only to +store clean file-backed data, it can be dropped at any time and therefore +the donor's request to take back the memory can always be satisfied. + +Implementation Overview +======================= + +Cleancache "backend" registers itself with cleancache "frontend" and gets +a unique pool_id, which it can use in all later API calls to identify the +pool of folios it donates. +Once registered, backend can call cleancache_backend_put_folio() or +cleancache_backend_put_folios() to donate memory to cleancache. Note that +cleancache currently supports only 0-order folios and will not accept +larger-order ones. Once the backend needs that memory back, it can get it +by calling cleancache_backend_get_folio(). Only the original backend can +take the folio it donated from the cleancache. + +Kernel uses cleancache by first calling cleancache_add_fs() to register +each file system and then using a combination of cleancache_store_folio(), +cleancache_restore_folio(), cleancache_invalidate_{folio|inode} to store, +restore and invalidate folio content. +cleancache_{start|end}_inode_walk() are used to walk over folios inside +an inode and cleancache_restore_from_inode() is used to restore folios +during such walks. + +From kernel's point of view folios which are copied into cleancache have +an indefinite lifetime which is completely unknowable by the kernel and so +may or may not still be in cleancache at any later time. Thus, as its name +implies, cleancache is not suitable for dirty folios. Cleancache has +complete discretion over what folios to preserve and what folios to discard +and when. + +Cleancache Performance Metrics +============================== + +Cleancache performance can be measured and monitored using metrics provided +via sysfs interface under ``/sys/kernel/mm/cleancache`` directory. The +interface is described in Documentation/admin-guide/mm/cleancache_sysfs.rst. diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst index ba6a8872849b..7997879e0695 100644 --- a/Documentation/mm/index.rst +++ b/Documentation/mm/index.rst @@ -41,6 +41,7 @@ documentation, or deleted if it has served its purpose. allocation-profiling arch_pgtable_helpers balance + cleancache damon/index free_page_reporting hmm diff --git a/MAINTAINERS b/MAINTAINERS index eb35973e10c8..3aabed281b71 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6058,6 +6058,8 @@ M: Suren Baghdasaryan M: Minchan Kim L: linux-mm@kvack.org S: Maintained +F: Documentation/admin-guide/mm/cleancache_sysfs.rst +F: Documentation/mm/cleancache.rst F: include/linux/cleancache.h F: mm/cleancache.c F: mm/cleancache_sysfs.c -- 2.51.1.851.g4ebd6896fd-goog From: Minchan Kim This patch introduces GCMA (Guaranteed Contiguous Memory Allocator) cleacache backend which reserves some amount of memory at the boot and then donates it to store clean file-backed pages in the cleancache. 
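As a rough, hypothetical illustration of the backend wiring (the real consumer added later in this series is CMA, which registers its reserved ranges from cma_activate_area()), a platform could reserve a range early via memblock and donate it once the kernel is far enough along. Only gcma_register_area() below is part of this patch; the example_* helpers, the 64MB size and the two-stage split are assumptions:

static unsigned long example_pfn;
static unsigned long example_nr_pages;

static void __init example_gcma_reserve(void)
{
	/* Early boot: carve the range out of memblock */
	phys_addr_t base = memblock_phys_alloc(SZ_64M, PAGE_SIZE);

	if (!base)
		return;
	example_pfn = PHYS_PFN(base);
	example_nr_pages = SZ_64M >> PAGE_SHIFT;
}

static int __init example_gcma_register(void)
{
	if (!example_nr_pages)
		return -ENODEV;

	/* Donate the reserved range to cleancache via GCMA */
	return gcma_register_area("example_gcma", example_pfn, example_nr_pages);
}
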
GCMA aims to guarantee contiguous memory allocation success as well as low and deterministic allocation latency. Notes: Originally, the idea was posted by SeongJae Park and Minchan Kim [1]. Later Minchan reworked it to be used in Android as a reference for Android vendors to use [2]. [1] https://lwn.net/Articles/619865/ [2] https://android-review.googlesource.com/q/topic:%22gcma_6.12%22 Signed-off-by: Minchan Kim Signed-off-by: Suren Baghdasaryan --- MAINTAINERS | 2 + include/linux/gcma.h | 36 +++++++ mm/Kconfig | 15 +++ mm/Makefile | 1 + mm/gcma.c | 244 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 298 insertions(+) create mode 100644 include/linux/gcma.h create mode 100644 mm/gcma.c diff --git a/MAINTAINERS b/MAINTAINERS index 3aabed281b71..40de200d1124 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16384,6 +16384,7 @@ F: Documentation/admin-guide/mm/ F: Documentation/mm/ F: include/linux/cma.h F: include/linux/dmapool.h +F: include/linux/gcma.h F: include/linux/ioremap.h F: include/linux/memory-tiers.h F: include/linux/page_idle.h @@ -16395,6 +16396,7 @@ F: mm/dmapool.c F: mm/dmapool_test.c F: mm/early_ioremap.c F: mm/fadvise.c +F: mm/gcma.c F: mm/ioremap.c F: mm/mapping_dirty_helpers.c F: mm/memory-tiers.c diff --git a/include/linux/gcma.h b/include/linux/gcma.h new file mode 100644 index 000000000000..20b2c85de87b --- /dev/null +++ b/include/linux/gcma.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __GCMA_H__ +#define __GCMA_H__ + +#include + +#ifdef CONFIG_GCMA + +int gcma_register_area(const char *name, + unsigned long start_pfn, unsigned long count); + +/* + * NOTE: allocated pages are still marked reserved and when freeing them + * the caller should ensure they are isolated and not referenced by anyone + * other than the caller. + */ +int gcma_alloc_range(unsigned long start_pfn, unsigned long count, gfp_t gfp); +int gcma_free_range(unsigned long start_pfn, unsigned long count); + +#else /* CONFIG_GCMA */ + +static inline int gcma_register_area(const char *name, + unsigned long start_pfn, + unsigned long count) + { return -EOPNOTSUPP; } +static inline int gcma_alloc_range(unsigned long start_pfn, + unsigned long count, gfp_t gfp) + { return -EOPNOTSUPP; } + +static inline int gcma_free_range(unsigned long start_pfn, + unsigned long count) + { return -EOPNOTSUPP; } + +#endif /* CONFIG_GCMA */ + +#endif /* __GCMA_H__ */ diff --git a/mm/Kconfig b/mm/Kconfig index e1a169d5e5de..3166fde83340 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1097,6 +1097,21 @@ config CMA_AREAS If unsure, leave the default value "8" in UMA and "20" in NUMA. +config GCMA + bool "GCMA (Guaranteed Contiguous Memory Allocator)" + depends on CLEANCACHE + help + This enables the Guaranteed Contiguous Memory Allocator to allow + low latency guaranteed contiguous memory allocations. Memory + reserved by GCMA is donated to cleancache to be used as pagecache + extension. Once GCMA allocation is requested, necessary pages are + taken back from the cleancache and used to satisfy the request. + Cleancache guarantees low latency successful allocation as long + as the total size of GCMA allocations does not exceed the size of + the memory donated to the cleancache. + + If unsure, say "N". + # # Select this config option from the architecture Kconfig, if available, to set # the max page order for physically contiguous allocations. 
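As a sketch of how the allocation side of the API above is intended to be used (assuming pfn and count lie entirely within one registered area; the example_* wrappers are hypothetical):

static struct page *example_gcma_get_pages(unsigned long pfn, unsigned long count)
{
	/* On success every page in the range is returned with refcount 1 */
	if (gcma_alloc_range(pfn, count, GFP_KERNEL))
		return NULL;

	return pfn_to_page(pfn);
}

static void example_gcma_put_pages(unsigned long pfn, unsigned long count)
{
	/* Pages must have refcount 1 again when handed back */
	gcma_free_range(pfn, count);
}

Any clean pagecache data cached in that range is simply discarded (counted as "recalled" in the cleancache sysfs stats) before the pages are handed to the caller.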
diff --git a/mm/Makefile b/mm/Makefile index 845841a140e3..05aee66a8b07 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -149,3 +149,4 @@ obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_CLEANCACHE_SYSFS) += cleancache_sysfs.o +obj-$(CONFIG_GCMA) += gcma.o diff --git a/mm/gcma.c b/mm/gcma.c new file mode 100644 index 000000000000..b86f82b8fe9d --- /dev/null +++ b/mm/gcma.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GCMA (Guaranteed Contiguous Memory Allocator) + * + */ + +#define pr_fmt(fmt) "gcma: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define MAX_GCMA_AREAS 64 +#define GCMA_AREA_NAME_MAX_LEN 32 + +struct gcma_area { + int pool_id; + unsigned long start_pfn; + unsigned long end_pfn; + char name[GCMA_AREA_NAME_MAX_LEN]; +}; + +static struct gcma_area areas[MAX_GCMA_AREAS]; +static atomic_t nr_gcma_area = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(gcma_area_lock); + +static int free_folio_range(struct gcma_area *area, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long scanned = 0; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + int err; + + if (!(++scanned % XA_CHECK_SCHED)) + cond_resched(); + + err = cleancache_backend_put_folio(area->pool_id, pfn_folio(pfn)); + if (err) { + pr_warn("PFN %lu: folio is still in use\n", pfn); + return err; + } + } + + return 0; +} + +static int alloc_folio_range(struct gcma_area *area, + unsigned long start_pfn, unsigned long end_pfn, + gfp_t gfp) +{ + unsigned long scanned = 0; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + int err; + + if (!(++scanned % XA_CHECK_SCHED)) + cond_resched(); + + err = cleancache_backend_get_folio(area->pool_id, pfn_folio(pfn)); + if (err) { + free_folio_range(area, start_pfn, pfn); + return err; + } + } + + return 0; +} + +static struct gcma_area *find_area(unsigned long start_pfn, unsigned long end_pfn) +{ + int nr_area = atomic_read_acquire(&nr_gcma_area); + int i; + + for (i = 0; i < nr_area; i++) { + struct gcma_area *area = &areas[i]; + + if (area->end_pfn <= start_pfn) + continue; + + if (area->start_pfn > end_pfn) + continue; + + /* The entire range should belong to a single area */ + if (start_pfn < area->start_pfn || end_pfn > area->end_pfn) + break; + + /* Found the area containing the entire range */ + return area; + } + + return NULL; +} + +int gcma_register_area(const char *name, + unsigned long start_pfn, unsigned long count) +{ + LIST_HEAD(folios); + int i, pool_id; + int nr_area; + int ret = 0; + + pool_id = cleancache_backend_register_pool(name); + if (pool_id < 0) + return pool_id; + + for (i = 0; i < count; i++) { + struct folio *folio; + + folio = pfn_folio(start_pfn + i); + folio_clear_reserved(folio); + folio_set_count(folio, 0); + list_add(&folio->lru, &folios); + } + + cleancache_backend_put_folios(pool_id, &folios); + + spin_lock(&gcma_area_lock); + + nr_area = atomic_read(&nr_gcma_area); + if (nr_area < MAX_GCMA_AREAS) { + struct gcma_area *area = &areas[nr_area]; + + area->pool_id = pool_id; + area->start_pfn = start_pfn; + area->end_pfn = start_pfn + count; + strscpy(area->name, name); + /* Ensure above stores complete before we increase the count */ + atomic_set_release(&nr_gcma_area, nr_area + 1); + } else { + ret = -ENOMEM; + } + + spin_unlock(&gcma_area_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(gcma_register_area); + +int gcma_alloc_range(unsigned long 
start_pfn, unsigned long count, gfp_t gfp) +{ + unsigned long end_pfn = start_pfn + count; + struct gcma_area *area; + struct folio *folio; + int err, order = 0; + + gfp = current_gfp_context(gfp); + if (gfp & __GFP_COMP) { + if (!is_power_of_2(count)) + return -EINVAL; + + order = ilog2(count); + if (order >= MAX_PAGE_ORDER) + return -EINVAL; + } + + area = find_area(start_pfn, end_pfn); + if (!area) + return -EINVAL; + + err = alloc_folio_range(area, start_pfn, end_pfn, gfp); + if (err) + return err; + + /* + * GCMA returns pages with refcount 1 and expects them to have + * the same refcount 1 when they are freed. + */ + if (order) { + folio = pfn_folio(start_pfn); + post_alloc_hook(&folio->page, order, gfp); + set_page_refcounted(&folio->page); + prep_compound_page(&folio->page, order); + } else { + for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn++) { + folio = pfn_folio(pfn); + post_alloc_hook(&folio->page, order, gfp); + set_page_refcounted(&folio->page); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(gcma_alloc_range); + +int gcma_free_range(unsigned long start_pfn, unsigned long count) +{ + unsigned long end_pfn = start_pfn + count; + struct gcma_area *area; + unsigned long pfn; + int err = -EINVAL; + + area = find_area(start_pfn, end_pfn); + if (!area) + return -EINVAL; + + /* First pass checks and drops folio refcounts */ + for (pfn = start_pfn; pfn < end_pfn;) { + struct folio *folio = pfn_folio(pfn); + unsigned long nr_pages = folio_nr_pages(folio); + + if (pfn + nr_pages > end_pfn) { + end_pfn = pfn; + goto error; + + } + if (!folio_ref_dec_and_test(folio)) { + end_pfn = pfn + nr_pages; + goto error; + } + pfn += nr_pages; + } + + /* Second pass prepares the folios */ + for (pfn = start_pfn; pfn < end_pfn;) { + struct folio *folio = pfn_folio(pfn); + + free_pages_prepare(&folio->page, folio_order(folio)); + pfn += folio_nr_pages(folio); + } + + err = free_folio_range(area, start_pfn, end_pfn); + if (!err) + return 0; + +error: + /* Restore folio refcounts */ + for (pfn = start_pfn; pfn < end_pfn;) { + struct folio *folio = pfn_folio(pfn); + + folio_ref_inc(folio); + pfn += folio_nr_pages(folio); + } + + return err; +} +EXPORT_SYMBOL_GPL(gcma_free_range); -- 2.51.1.851.g4ebd6896fd-goog Introduce a new "guarantee" property for shared-dma-pool to enable GCMA-backed memory pools. Memory allocations from such pools will have low latency and will be guaranteed to succeed as long as there is contiguous space inside the reservation. dt-schema for shared-dma-pool [1] will need to be updated once this patch is accepted.
[1] https://github.com/devicetree-org/dt-schema/blob/main/dtschema/schemas/reserved-memory/shared-dma-pool.yaml Signed-off-by: Minchan Kim Signed-off-by: Suren Baghdasaryan --- include/linux/cma.h | 11 +++++++++-- kernel/dma/contiguous.c | 11 ++++++++++- mm/Kconfig | 2 +- mm/cma.c | 37 +++++++++++++++++++++++++++---------- mm/cma.h | 1 + mm/cma_sysfs.c | 10 ++++++++++ mm/gcma.c | 2 +- 7 files changed, 59 insertions(+), 15 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 62d9c1cf6326..3ec2e76a8666 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -43,10 +43,17 @@ static inline int __init cma_declare_contiguous(phys_addr_t base, extern int __init cma_declare_contiguous_multi(phys_addr_t size, phys_addr_t align, unsigned int order_per_bit, const char *name, struct cma **res_cma, int nid); -extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, +extern int __cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, - struct cma **res_cma); + struct cma **res_cma, bool gcma); +static inline int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, + const char *name, + struct cma **res_cma) +{ + return __cma_init_reserved_mem(base, size, order_per_bit, name, res_cma, false); +} extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d9b9dcba6ff7..73a699ef0377 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -461,6 +461,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; + bool gcma; int err; if (size_cmdline != -1 && default_cma) { @@ -478,7 +479,15 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) return -EINVAL; } - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + gcma = !!of_get_flat_dt_prop(node, "guarantee", NULL); +#ifndef CONFIG_GCMA + if (gcma) { + pr_err("Reserved memory: unable to setup GCMA region, GCMA is not enabled\n"); + return -EINVAL; + } +#endif + err = __cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, + &cma, gcma); if (err) { pr_err("Reserved memory: unable to setup CMA region\n"); return err; diff --git a/mm/Kconfig b/mm/Kconfig index 3166fde83340..1c8b20d90790 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1099,7 +1099,7 @@ config CMA_AREAS config GCMA bool "GCMA (Guaranteed Contiguous Memory Allocator)" - depends on CLEANCACHE + depends on CLEANCACHE && CMA help This enables the Guaranteed Contiguous Memory Allocator to allow low latency guaranteed contiguous memory allocations. 
Memory diff --git a/mm/cma.c b/mm/cma.c index 813e6dc7b095..71fb494ef2a4 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "internal.h" @@ -161,11 +162,18 @@ static void __init cma_activate_area(struct cma *cma) count = early_pfn[r] - cmr->base_pfn; bitmap_count = cma_bitmap_pages_to_bits(cma, count); bitmap_set(cmr->bitmap, 0, bitmap_count); + } else { + count = 0; } - for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; - pfn += pageblock_nr_pages) - init_cma_reserved_pageblock(pfn_to_page(pfn)); + if (cma->gcma) { + gcma_register_area(cma->name, early_pfn[r], + cma->count - count); + } else { + for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } } spin_lock_init(&cma->lock); @@ -252,7 +260,7 @@ static void __init cma_drop_area(struct cma *cma) } /** - * cma_init_reserved_mem() - create custom contiguous area from reserved memory + * __cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. @@ -260,13 +268,14 @@ static void __init cma_drop_area(struct cma *cma) * the area will be set to "cmaN", where N is a running counter of * used areas. * @res_cma: Pointer to store the created cma region. + * @gcma: Flag to reserve guaranteed reserved memory area. * * This function creates custom contiguous area from already reserved memory. */ -int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, - unsigned int order_per_bit, - const char *name, - struct cma **res_cma) +int __init __cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, + const char *name, + struct cma **res_cma, bool gcma) { struct cma *cma; int ret; @@ -297,6 +306,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, cma->ranges[0].count = cma->count; cma->nranges = 1; cma->nid = NUMA_NO_NODE; + cma->gcma = gcma; *res_cma = cma; @@ -836,7 +846,11 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); + if (cma->gcma) + ret = gcma_alloc_range(pfn, count, gfp); + else + ret = alloc_contig_range(pfn, pfn + count, + ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (!ret) break; @@ -1009,7 +1023,10 @@ bool cma_release(struct cma *cma, const struct page *pages, if (r == cma->nranges) return false; - free_contig_range(pfn, count); + if (cma->gcma) + gcma_free_range(pfn, count); + else + free_contig_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); diff --git a/mm/cma.h b/mm/cma.h index c70180c36559..3b09e8619082 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -49,6 +49,7 @@ struct cma { char name[CMA_MAX_NAME]; int nranges; struct cma_memrange ranges[CMA_MAX_RANGES]; + bool gcma; #ifdef CONFIG_CMA_SYSFS /* the number of CMA page successful allocations */ atomic64_t nr_pages_succeeded; diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c index 97acd3e5a6a5..4ecc36270a4d 100644 --- a/mm/cma_sysfs.c +++ b/mm/cma_sysfs.c @@ -80,6 +80,15 @@ static ssize_t available_pages_show(struct kobject *kobj, } CMA_ATTR_RO(available_pages); +static ssize_t gcma_show(struct kobject *kobj, + struct kobj_attribute *attr, char 
*buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%d\n", cma->gcma); +} +CMA_ATTR_RO(gcma); + static void cma_kobj_release(struct kobject *kobj) { struct cma *cma = cma_from_kobj(kobj); @@ -95,6 +104,7 @@ static struct attribute *cma_attrs[] = { &release_pages_success_attr.attr, &total_pages_attr.attr, &available_pages_attr.attr, + &gcma_attr.attr, NULL, }; ATTRIBUTE_GROUPS(cma); diff --git a/mm/gcma.c b/mm/gcma.c index b86f82b8fe9d..fcf1d3c0283f 100644 --- a/mm/gcma.c +++ b/mm/gcma.c @@ -119,7 +119,7 @@ int gcma_register_area(const char *name, folio_set_count(folio, 0); list_add(&folio->lru, &folios); } - + folio_zone(pfn_folio(start_pfn))->cma_pages += count; cleancache_backend_put_folios(pool_id, &folios); spin_lock(&gcma_area_lock); -- 2.51.1.851.g4ebd6896fd-goog
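Taken together: a reserved-memory node with the usual shared-dma-pool properties plus the new bare "guarantee" property ends up in __cma_init_reserved_mem(..., gcma = true), and cma_alloc()/cma_release() on that area are then served by gcma_alloc_range()/gcma_free_range() instead of alloc_contig_range()/free_contig_range(). A driver therefore keeps using the existing CMA API unchanged. The sketch below uses the cma_alloc()/cma_release() signatures from include/linux/cma.h; the example_* wrappers are hypothetical and only meant to show that nothing GCMA-specific leaks into callers:

static struct page *example_alloc_buffer(struct cma *cma, unsigned long nr_pages)
{
	/* Low, deterministic latency when the area is GCMA-backed */
	return cma_alloc(cma, nr_pages, 0, false);
}

static void example_free_buffer(struct cma *cma, struct page *pages,
				unsigned long nr_pages)
{
	cma_release(cma, pages, nr_pages);
}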