From: Michael Guralnik Add a generic Fast Registration Memory Region pools mechanism to allow drivers to optimize memory registration performance. Drivers that have the ability to reuse MRs or their underlying HW objects can take advantage of the mechanism to keep a 'handle' for those objects and use them upon user request. We assume that to achieve this goal a driver and its HW should implement a modify operation for the MRs that is able to at least clear and set the MRs and in more advanced implementations also support changing a subset of the MRs properties. The mechanism is built using an RB-tree consisting of pools, each pool represents a set of MR properties that are shared by all of the MRs residing in the pool and are unmodifiable by the vendor driver or HW. The exposed API from ib_core to the driver has 4 operations: Init and cleanup - handles data structs and locks for the pools. Push and pop - store and retrieve 'handle' for a memory registration or deregistrations request. The FRMR pools mechanism implements the logic to search the RB-tree for a pool with matching properties and create a new one when needed and requires the driver to implement creation and destruction of a 'handle' when pool is empty or a handle is requested or is being destroyed. Later patch will introduce Netlink API to interact with the FRMR pools mechanism to allow users to both configure and track its usage. A vendor wishing to configure FRMR pool without exposing it or without exposing internal MR properties to users, should use the kernel_vendor_key field in the pools key. This can be useful in a few cases, e.g, when the FRMR handle has a vendor-specific un-modifiable property that the user registering the memory might not be aware of. Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/frmr_pools.c | 328 +++++++++++++++++++++++++++++++++++ drivers/infiniband/core/frmr_pools.h | 48 +++++ include/rdma/frmr_pools.h | 37 ++++ include/rdma/ib_verbs.h | 8 + 5 files changed, 422 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f483e0c124445c1e9796dc7d766517b12f6dfc2f..7089a982b876f1f5088e922f296725954697a1a4 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ - trace.o lag.o + trace.o lag.o frmr_pools.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c new file mode 100644 index 0000000000000000000000000000000000000000..073b2fcfb2cc7d466fedfba14ad04f1e2d7edf65 --- /dev/null +++ b/drivers/infiniband/core/frmr_pools.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "frmr_pools.h" + +static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle) +{ + u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE; + struct frmr_handles_page *page; + + if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) { + page = kzalloc(sizeof(*page), GFP_ATOMIC); + if (!page) + return -ENOMEM; + queue->num_pages++; + list_add_tail(&page->list, &queue->pages_list); + } else { + page = list_last_entry(&queue->pages_list, + struct frmr_handles_page, list); + } + + page->handles[tmp] = handle; + queue->ci++; + return 0; +} + +static u32 pop_handle_from_queue_locked(struct frmr_queue *queue) +{ + u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE; + struct frmr_handles_page *page; + u32 handle; + + page = list_last_entry(&queue->pages_list, struct frmr_handles_page, + list); + handle = page->handles[tmp]; + queue->ci--; + + if (!tmp) { + list_del(&page->list); + queue->num_pages--; + kfree(page); + } + + return handle; +} + +static bool pop_frmr_handles_page(struct ib_frmr_pool *pool, + struct frmr_queue *queue, + struct frmr_handles_page **page, u32 *count) +{ + spin_lock(&pool->lock); + if (list_empty(&queue->pages_list)) { + spin_unlock(&pool->lock); + return false; + } + + *page = list_first_entry(&queue->pages_list, struct frmr_handles_page, + list); + list_del(&(*page)->list); + queue->num_pages--; + + /* If this is the last page, count may be less than + * NUM_HANDLES_PER_PAGE. + */ + if (queue->ci >= NUM_HANDLES_PER_PAGE) + *count = NUM_HANDLES_PER_PAGE; + else + *count = queue->ci; + + queue->ci -= *count; + spin_unlock(&pool->lock); + return true; +} + +static void destroy_frmr_pool(struct ib_device *device, + struct ib_frmr_pool *pool) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct frmr_handles_page *page; + u32 count; + + while (pop_frmr_handles_page(pool, &pool->queue, &page, &count)) { + pools->pool_ops->destroy_frmrs(device, page->handles, count); + kfree(page); + } + + rb_erase(&pool->node, &pools->rb_root); + kfree(pool); +} + +/* + * Initialize the FRMR pools for a device. + * + * @device: The device to initialize the FRMR pools for. + * @pool_ops: The pool operations to use. + * + * Returns 0 on success, negative error code on failure. + */ +int ib_frmr_pools_init(struct ib_device *device, + const struct ib_frmr_pool_ops *pool_ops) +{ + struct ib_frmr_pools *pools; + + pools = kzalloc(sizeof(*pools), GFP_KERNEL); + if (!pools) + return -ENOMEM; + + pools->rb_root = RB_ROOT; + rwlock_init(&pools->rb_lock); + pools->pool_ops = pool_ops; + + device->frmr_pools = pools; + return 0; +} +EXPORT_SYMBOL(ib_frmr_pools_init); + +/* + * Clean up the FRMR pools for a device. + * + * @device: The device to clean up the FRMR pools for. + * + * Call cleanup only after all FRMR handles have been pushed back to the pool + * and no other FRMR operations are allowed to run in parallel. + * Ensuring this allows us to save synchronization overhead in pop and push + * operations. 
+ */ +void ib_frmr_pools_cleanup(struct ib_device *device) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct rb_node *node = rb_first(&pools->rb_root); + struct ib_frmr_pool *pool; + + while (node) { + struct rb_node *next = rb_next(node); + + pool = rb_entry(node, struct ib_frmr_pool, node); + destroy_frmr_pool(device, pool); + node = next; + } + + kfree(pools); + device->frmr_pools = NULL; +} +EXPORT_SYMBOL(ib_frmr_pools_cleanup); + +static int compare_keys(struct ib_frmr_key *key1, struct ib_frmr_key *key2) +{ + int res; + + res = key1->ats - key2->ats; + if (res) + return res; + + res = key1->access_flags - key2->access_flags; + if (res) + return res; + + res = key1->vendor_key - key2->vendor_key; + if (res) + return res; + + res = key1->kernel_vendor_key - key2->kernel_vendor_key; + if (res) + return res; + + /* + * allow using handles that support more DMA blocks, up to twice the + * requested number + */ + res = key1->num_dma_blocks - key2->num_dma_blocks; + if (res > 0 && res < key2->num_dma_blocks) + return 0; + + return res; +} + +static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools, + struct ib_frmr_key *key) +{ + struct rb_node *node = pools->rb_root.rb_node; + struct ib_frmr_pool *pool; + int cmp; + + /* find operation is done under read lock for performance reasons. + * The case of threads failing to find the same pool and creating it + * is handled by the create_frmr_pool function. + */ + read_lock(&pools->rb_lock); + while (node) { + pool = rb_entry(node, struct ib_frmr_pool, node); + cmp = compare_keys(&pool->key, key); + if (cmp < 0) { + node = node->rb_right; + } else if (cmp > 0) { + node = node->rb_left; + } else { + read_unlock(&pools->rb_lock); + return pool; + } + } + + read_unlock(&pools->rb_lock); + + return NULL; +} + +static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, + struct ib_frmr_key *key) +{ + struct rb_node **new = &device->frmr_pools->rb_root.rb_node, + *parent = NULL; + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + int cmp; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + memcpy(&pool->key, key, sizeof(*key)); + INIT_LIST_HEAD(&pool->queue.pages_list); + spin_lock_init(&pool->lock); + + write_lock(&pools->rb_lock); + while (*new) { + parent = *new; + cmp = compare_keys( + &rb_entry(parent, struct ib_frmr_pool, node)->key, key); + if (cmp < 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + /* If a different thread has already created the pool, return + * it. The insert operation is done under the write lock so we + * are sure that the pool is not inserted twice. 
+ */ + if (cmp == 0) { + write_unlock(&pools->rb_lock); + kfree(pool); + return rb_entry(parent, struct ib_frmr_pool, node); + } + } + + rb_link_node(&pool->node, parent, new); + rb_insert_color(&pool->node, &pools->rb_root); + + write_unlock(&pools->rb_lock); + + return pool; +} + +static int get_frmr_from_pool(struct ib_device *device, + struct ib_frmr_pool *pool, struct ib_mr *mr) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + u32 handle; + int err; + + spin_lock(&pool->lock); + if (pool->queue.ci == 0) { + spin_unlock(&pool->lock); + err = pools->pool_ops->create_frmrs(device, &pool->key, &handle, + 1); + if (err) + return err; + } else { + handle = pop_handle_from_queue_locked(&pool->queue); + spin_unlock(&pool->lock); + } + + mr->frmr.pool = pool; + mr->frmr.handle = handle; + + return 0; +} + +/* + * Pop an FRMR handle from the pool. + * + * @device: The device to pop the FRMR handle from. + * @mr: The MR to pop the FRMR handle from. + * + * Returns 0 on success, negative error code on failure. + */ +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + + WARN_ON_ONCE(!device->frmr_pools); + pool = ib_frmr_pool_find(pools, &mr->frmr.key); + if (!pool) { + pool = create_frmr_pool(device, &mr->frmr.key); + if (IS_ERR(pool)) + return PTR_ERR(pool); + } + + return get_frmr_from_pool(device, pool, mr); +} +EXPORT_SYMBOL(ib_frmr_pool_pop); + +/* + * Push an FRMR handle back to the pool. + * + * @device: The device to push the FRMR handle to. + * @mr: The MR containing the FRMR handle to push back to the pool. + * + * Returns 0 on success, negative error code on failure. + */ +int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) +{ + struct ib_frmr_pool *pool = mr->frmr.pool; + int ret; + + spin_lock(&pool->lock); + ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); + spin_unlock(&pool->lock); + + return ret; +} +EXPORT_SYMBOL(ib_frmr_pool_push); diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h new file mode 100644 index 0000000000000000000000000000000000000000..5a4d03b3d86f431c3f2091dd5ab27292547c2030 --- /dev/null +++ b/drivers/infiniband/core/frmr_pools.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#ifndef RDMA_CORE_FRMR_POOLS_H +#define RDMA_CORE_FRMR_POOLS_H + +#include +#include +#include +#include +#include + +#define NUM_HANDLES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32)) + +struct frmr_handles_page { + struct list_head list; + u32 handles[NUM_HANDLES_PER_PAGE]; +}; + +/* FRMR queue holds a list of frmr_handles_page. + * num_pages: number of pages in the queue. + * ci: current index in the handles array across all pages. 
+ */ +struct frmr_queue { + struct list_head pages_list; + u32 num_pages; + unsigned long ci; +}; + +struct ib_frmr_pool { + struct rb_node node; + struct ib_frmr_key key; /* Pool key */ + + /* Protect access to the queue */ + spinlock_t lock; + struct frmr_queue queue; +}; + +struct ib_frmr_pools { + struct rb_root rb_root; + rwlock_t rb_lock; + const struct ib_frmr_pool_ops *pool_ops; +}; + +#endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h new file mode 100644 index 0000000000000000000000000000000000000000..da92ef4d7310c0fe0cebf937a0049f81580ad386 --- /dev/null +++ b/include/rdma/frmr_pools.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#ifndef FRMR_POOLS_H +#define FRMR_POOLS_H + +#include +#include + +struct ib_device; +struct ib_mr; + +struct ib_frmr_key { + u64 vendor_key; + /* A pool with non-zero kernel_vendor_key is a kernel-only pool. */ + u64 kernel_vendor_key; + size_t num_dma_blocks; + int access_flags; + u8 ats:1; +}; + +struct ib_frmr_pool_ops { + int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key, + u32 *handles, u32 count); + void (*destroy_frmrs)(struct ib_device *device, u32 *handles, + u32 count); +}; + +int ib_frmr_pools_init(struct ib_device *device, + const struct ib_frmr_pool_ops *pool_ops); +void ib_frmr_pools_cleanup(struct ib_device *device); +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr); +int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr); + +#endif /* FRMR_POOLS_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0a85af610b6b72db33ddd90b30163e18f7038e7d..6cc557424e2323161a3d50181190ad36d9d0a149 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -43,6 +43,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -1886,6 +1887,11 @@ struct ib_mr { struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ struct ib_dmah *dmah; + struct { + struct ib_frmr_pool *pool; + struct ib_frmr_key key; + u32 handle; + } frmr; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2879,6 +2885,8 @@ struct ib_device { struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; + + struct ib_frmr_pools *frmr_pools; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, -- 2.47.1 From: Michael Guralnik Add aging mechanism to handles of FRMR pools. Keep the handles stored in FRMR pools for at least 1 minute for application to reuse, destroy all handles which were not reused. Add a new queue to each pool to accomplish that. Upon aging trigger, destroy all FRMR handles from the new 'inactive' queue and move all handles from the 'active' pool to the 'inactive' pool. This ensures all destroyed handles were not reused for at least one aging time period and were not held longer than 2 aging time periods. Handles from the inactive queue will be popped only if the active queue is empty. 
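For clarity, the rotation done on each aging tick boils down to the
following (a minimal sketch of what pool_aging_work() below does
inline; the helper name is illustrative only):

static void frmr_age_tick(struct ib_device *device,
                          struct ib_frmr_pool *pool)
{
        /* Handles still sitting in the inactive queue were not reused
         * for at least one full aging period, destroy them.
         */
        destroy_all_handles_in_queue(device, pool, &pool->inactive_queue);

        /* Demote the active queue. Its handles get one more period to
         * be reused, so a handle stays in the pool for at least one
         * and at most two aging periods.
         */
        spin_lock(&pool->lock);
        list_splice_tail_init(&pool->queue.pages_list,
                              &pool->inactive_queue.pages_list);
        pool->inactive_queue.num_pages = pool->queue.num_pages;
        pool->inactive_queue.ci = pool->queue.ci;
        pool->queue.num_pages = 0;
        pool->queue.ci = 0;
        spin_unlock(&pool->lock);
}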
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/infiniband/core/frmr_pools.c | 84 ++++++++++++++++++++++++++++++++---- drivers/infiniband/core/frmr_pools.h | 7 +++ 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 073b2fcfb2cc7d466fedfba14ad04f1e2d7edf65..406664a6e2099b2a7827e12a40820ecab75cb59c 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -7,9 +7,12 @@ #include #include #include +#include #include "frmr_pools.h" +#define FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS 60 + static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle) { u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE; @@ -79,19 +82,58 @@ static bool pop_frmr_handles_page(struct ib_frmr_pool *pool, return true; } -static void destroy_frmr_pool(struct ib_device *device, - struct ib_frmr_pool *pool) +static void destroy_all_handles_in_queue(struct ib_device *device, + struct ib_frmr_pool *pool, + struct frmr_queue *queue) { struct ib_frmr_pools *pools = device->frmr_pools; struct frmr_handles_page *page; u32 count; - while (pop_frmr_handles_page(pool, &pool->queue, &page, &count)) { + while (pop_frmr_handles_page(pool, queue, &page, &count)) { pools->pool_ops->destroy_frmrs(device, page->handles, count); kfree(page); } +} + +static void pool_aging_work(struct work_struct *work) +{ + struct ib_frmr_pool *pool = container_of( + to_delayed_work(work), struct ib_frmr_pool, aging_work); + struct ib_frmr_pools *pools = pool->device->frmr_pools; + bool has_work = false; + + destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue); + + /* Move all pages from regular queue to inactive queue */ + spin_lock(&pool->lock); + if (pool->queue.ci > 0) { + list_splice_tail_init(&pool->queue.pages_list, + &pool->inactive_queue.pages_list); + pool->inactive_queue.num_pages = pool->queue.num_pages; + pool->inactive_queue.ci = pool->queue.ci; + + pool->queue.num_pages = 0; + pool->queue.ci = 0; + has_work = true; + } + spin_unlock(&pool->lock); + + /* Reschedule if there are handles to age in next aging period */ + if (has_work) + queue_delayed_work( + pools->aging_wq, &pool->aging_work, + secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); +} + +static void destroy_frmr_pool(struct ib_device *device, + struct ib_frmr_pool *pool) +{ + cancel_delayed_work_sync(&pool->aging_work); + destroy_all_handles_in_queue(device, pool, &pool->queue); + destroy_all_handles_in_queue(device, pool, &pool->inactive_queue); - rb_erase(&pool->node, &pools->rb_root); + rb_erase(&pool->node, &device->frmr_pools->rb_root); kfree(pool); } @@ -115,6 +157,11 @@ int ib_frmr_pools_init(struct ib_device *device, pools->rb_root = RB_ROOT; rwlock_init(&pools->rb_lock); pools->pool_ops = pool_ops; + pools->aging_wq = create_singlethread_workqueue("frmr_aging_wq"); + if (!pools->aging_wq) { + kfree(pools); + return -ENOMEM; + } device->frmr_pools = pools; return 0; @@ -145,6 +192,7 @@ void ib_frmr_pools_cleanup(struct ib_device *device) node = next; } + destroy_workqueue(pools->aging_wq); kfree(pools); device->frmr_pools = NULL; } @@ -226,7 +274,10 @@ static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, memcpy(&pool->key, key, sizeof(*key)); INIT_LIST_HEAD(&pool->queue.pages_list); + INIT_LIST_HEAD(&pool->inactive_queue.pages_list); spin_lock_init(&pool->lock); + INIT_DELAYED_WORK(&pool->aging_work, pool_aging_work); + pool->device = device; 
write_lock(&pools->rb_lock); while (*new) { @@ -265,11 +316,17 @@ static int get_frmr_from_pool(struct ib_device *device, spin_lock(&pool->lock); if (pool->queue.ci == 0) { - spin_unlock(&pool->lock); - err = pools->pool_ops->create_frmrs(device, &pool->key, &handle, - 1); - if (err) - return err; + if (pool->inactive_queue.ci > 0) { + handle = pop_handle_from_queue_locked( + &pool->inactive_queue); + spin_unlock(&pool->lock); + } else { + spin_unlock(&pool->lock); + err = pools->pool_ops->create_frmrs(device, &pool->key, + &handle, 1); + if (err) + return err; + } } else { handle = pop_handle_from_queue_locked(&pool->queue); spin_unlock(&pool->lock); @@ -317,12 +374,21 @@ EXPORT_SYMBOL(ib_frmr_pool_pop); int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) { struct ib_frmr_pool *pool = mr->frmr.pool; + struct ib_frmr_pools *pools = device->frmr_pools; + bool schedule_aging = false; int ret; spin_lock(&pool->lock); + /* Schedule aging every time an empty pool becomes non-empty */ + if (pool->queue.ci == 0) + schedule_aging = true; ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); spin_unlock(&pool->lock); + if (ret == 0 && schedule_aging) + queue_delayed_work(pools->aging_wq, &pool->aging_work, + secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); + return ret; } EXPORT_SYMBOL(ib_frmr_pool_push); diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index 5a4d03b3d86f431c3f2091dd5ab27292547c2030..a20323e03e3f446856dda921811e2359232e0b82 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -11,6 +11,7 @@ #include #include #include +#include #define NUM_HANDLES_PER_PAGE \ ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32)) @@ -37,12 +38,18 @@ struct ib_frmr_pool { /* Protect access to the queue */ spinlock_t lock; struct frmr_queue queue; + struct frmr_queue inactive_queue; + + struct delayed_work aging_work; + struct ib_device *device; }; struct ib_frmr_pools { struct rb_root rb_root; rwlock_t rb_lock; const struct ib_frmr_pool_ops *pool_ops; + + struct workqueue_struct *aging_wq; }; #endif /* RDMA_CORE_FRMR_POOLS_H */ -- 2.47.1 From: Michael Guralnik Count for each pool the number of FRMR handles popped and held by user MRs. Also keep track of the max value of this counter. Next patches will expose the statistics through netlink. 
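The accounting itself is a couple of lines on the existing pop/push
paths, conceptually (helper names are illustrative; the diff below
updates the fields inline under pool->lock):

/* in_use counts handles currently held by user MRs, max_in_use is its
 * high-water mark. Both are protected by pool->lock.
 */
static void frmr_stats_account_pop(struct ib_frmr_pool *pool)
{
        pool->in_use++;
        if (pool->in_use > pool->max_in_use)
                pool->max_in_use = pool->in_use;
}

static void frmr_stats_account_push(struct ib_frmr_pool *pool)
{
        pool->in_use--;
}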
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/infiniband/core/frmr_pools.c | 12 ++++++++++-- drivers/infiniband/core/frmr_pools.h | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 406664a6e2099b2a7827e12a40820ecab75cb59c..9af2f6aa6c06cee8a1157aac05aa64f361451083 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -319,19 +319,24 @@ static int get_frmr_from_pool(struct ib_device *device, if (pool->inactive_queue.ci > 0) { handle = pop_handle_from_queue_locked( &pool->inactive_queue); - spin_unlock(&pool->lock); } else { spin_unlock(&pool->lock); err = pools->pool_ops->create_frmrs(device, &pool->key, &handle, 1); if (err) return err; + spin_lock(&pool->lock); } } else { handle = pop_handle_from_queue_locked(&pool->queue); - spin_unlock(&pool->lock); } + pool->in_use++; + if (pool->in_use > pool->max_in_use) + pool->max_in_use = pool->in_use; + + spin_unlock(&pool->lock); + mr->frmr.pool = pool; mr->frmr.handle = handle; @@ -383,6 +388,9 @@ int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) if (pool->queue.ci == 0) schedule_aging = true; ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); + if (ret == 0) + pool->in_use--; + spin_unlock(&pool->lock); if (ret == 0 && schedule_aging) diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index a20323e03e3f446856dda921811e2359232e0b82..814d8a2106c2978a1a1feca3ba50420025fca994 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -42,6 +42,9 @@ struct ib_frmr_pool { struct delayed_work aging_work; struct ib_device *device; + + u32 max_in_use; + u32 in_use; }; struct ib_frmr_pools { -- 2.47.1 From: Michael Guralnik Add a configuration of pinned handles on a specific FRMR pool. The configured amount of pinned handles will not be aged and will stay available for users to claim. Upon setting the amount of pinned handles to an FRMR pool, we will make sure we have at least the pinned amount of handles associated with the pool and create more, if necessary. The count for pinned handles take into account handles that are used by user MRs and handles in the queue. Introduce a new FRMR operation of build_key that allows drivers to manipulate FRMR keys supplied by the user, allowing failing for unsupported properties and masking of properties that are modifiable. 
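As an example, a driver's build_key callback could look roughly like
the sketch below (mydrv_supports_ats() and MYDRV_FIXED_ACCESS_MASK are
hypothetical placeholders; the real mlx5 implementation follows later
in the series):

static int mydrv_build_key(struct ib_device *device,
                           const struct ib_frmr_key *in,
                           struct ib_frmr_key *out)
{
        /* Fail properties the HW cannot provide at all. */
        if (in->ats && !mydrv_supports_ats(device))
                return -EOPNOTSUPP;

        /* Keep only the properties that are un-modifiable in the HW
         * object; modifiable ones are masked out so requests that
         * differ only in those bits share a pool.
         */
        out->ats = in->ats;
        out->access_flags = in->access_flags & MYDRV_FIXED_ACCESS_MASK;
        out->num_dma_blocks = in->num_dma_blocks;
        out->vendor_key = in->vendor_key;
        out->kernel_vendor_key = in->kernel_vendor_key;
        return 0;
}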
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/infiniband/core/frmr_pools.c | 123 +++++++++++++++++++++++++++++++++++ drivers/infiniband/core/frmr_pools.h | 3 + include/rdma/frmr_pools.h | 2 + 3 files changed, 128 insertions(+) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 9af2f6aa6c06cee8a1157aac05aa64f361451083..254113d2442d5d6956587a1c444dc74cd48204fb 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -96,6 +96,51 @@ static void destroy_all_handles_in_queue(struct ib_device *device, } } +static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + u32 total, to_destroy, destroyed = 0; + bool has_work = false; + u32 *handles; + u32 handle; + + spin_lock(&pool->lock); + total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use; + if (total <= pool->pinned_handles) { + spin_unlock(&pool->lock); + return false; + } + + to_destroy = total - pool->pinned_handles; + + handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC); + if (!handles) { + spin_unlock(&pool->lock); + return true; + } + + /* Destroy all excess handles in the inactive queue */ + while (pool->inactive_queue.ci && destroyed < to_destroy) { + handles[destroyed++] = pop_handle_from_queue_locked( + &pool->inactive_queue); + } + + /* Move all handles from regular queue to inactive queue */ + while (pool->queue.ci) { + handle = pop_handle_from_queue_locked(&pool->queue); + push_handle_to_queue_locked(&pool->inactive_queue, + handle); + has_work = true; + } + + spin_unlock(&pool->lock); + + if (destroyed) + pools->pool_ops->destroy_frmrs(device, handles, destroyed); + kfree(handles); + return has_work; +} + static void pool_aging_work(struct work_struct *work) { struct ib_frmr_pool *pool = container_of( @@ -103,6 +148,11 @@ static void pool_aging_work(struct work_struct *work) struct ib_frmr_pools *pools = pool->device->frmr_pools; bool has_work = false; + if (pool->pinned_handles) { + has_work = age_pinned_pool(pool->device, pool); + goto out; + } + destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue); /* Move all pages from regular queue to inactive queue */ @@ -119,6 +169,7 @@ static void pool_aging_work(struct work_struct *work) } spin_unlock(&pool->lock); +out: /* Reschedule if there are handles to age in next aging period */ if (has_work) queue_delayed_work( @@ -307,6 +358,78 @@ static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, return pool; } +int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, + u32 pinned_handles) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_key driver_key = {}; + struct ib_frmr_pool *pool; + u32 needed_handles; + u32 current_total; + int i, ret = 0; + u32 *handles; + + if (!pools) + return -EINVAL; + + if (pools->pool_ops->build_key) { + ret = pools->pool_ops->build_key(device, key, &driver_key); + if (ret) + return ret; + } else { + memcpy(&driver_key, key, sizeof(*key)); + } + + pool = ib_frmr_pool_find(pools, &driver_key); + if (!pool) { + pool = create_frmr_pool(device, &driver_key); + if (IS_ERR(pool)) + return PTR_ERR(pool); + } + + spin_lock(&pool->lock); + current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci; + + if (current_total < pinned_handles) + needed_handles = pinned_handles - current_total; + else + needed_handles = 0; + + pool->pinned_handles = 
pinned_handles; + spin_unlock(&pool->lock); + + if (!needed_handles) + goto schedule_aging; + + handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL); + if (!handles) + return -ENOMEM; + + ret = pools->pool_ops->create_frmrs(device, key, handles, + needed_handles); + if (ret) { + kfree(handles); + return ret; + } + + spin_lock(&pool->lock); + for (i = 0; i < needed_handles; i++) { + ret = push_handle_to_queue_locked(&pool->queue, + handles[i]); + if (ret) + goto end; + } + +end: + spin_unlock(&pool->lock); + kfree(handles); + +schedule_aging: + mod_delayed_work(pools->aging_wq, &pool->aging_work, 0); + + return ret; +} + static int get_frmr_from_pool(struct ib_device *device, struct ib_frmr_pool *pool, struct ib_mr *mr) { diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index 814d8a2106c2978a1a1feca3ba50420025fca994..b144273ee34785623d2254d19f5af40869e00e83 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -45,6 +45,7 @@ struct ib_frmr_pool { u32 max_in_use; u32 in_use; + u32 pinned_handles; }; struct ib_frmr_pools { @@ -55,4 +56,6 @@ struct ib_frmr_pools { struct workqueue_struct *aging_wq; }; +int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, + u32 pinned_handles); #endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h index da92ef4d7310c0fe0cebf937a0049f81580ad386..333ce31fc762efb786cd458711617e7ffbd971d0 100644 --- a/include/rdma/frmr_pools.h +++ b/include/rdma/frmr_pools.h @@ -26,6 +26,8 @@ struct ib_frmr_pool_ops { u32 *handles, u32 count); void (*destroy_frmrs)(struct ib_device *device, u32 *handles, u32 count); + int (*build_key)(struct ib_device *device, const struct ib_frmr_key *in, + struct ib_frmr_key *out); }; int ib_frmr_pools_init(struct ib_device *device, -- 2.47.1 From: Michael Guralnik Use the new generic FRMR pools mechanism to optimize the performance of memory registrations. The move to the new generic FRMR pools will allow users configuring MR cache through debugfs of MR cache to use the netlink API for FRMR pools which will be added later in this series. Thus being able to have more flexibility configuring the kernel and also being able to configure on machines where debugfs is not available. Mlx5_ib will save the mkey index as the handle in FRMR pools, same as the MR cache implementation. Upon each memory registration mlx5_ib will try to pull a handle from FRMR pools and upon each deregistration it will push the handle back to it's appropriate pool. Use the vendor key field in umr pool key to save the access mode of the mkey. Use the option for kernel-only FRMR pool to manage the mkeys used for registration with DMAH as the translation between UAPI of DMAH and the mkey property of st_index is non-trivial and changing dynamically. Since the value for no PH is 0xff and not zero, switch between them in the frmr_key to have a zero'ed kernel_vendor_key when not using DMAH. Remove the limitation we had with MR cache for mkeys up to 2^20 dma blocks and support mkeys up to HW limitations according to caps. Remove all MR cache related code. 
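The DMAH-related encoding amounts to the following (a sketch of the
packing used in the diff below; the helper name is illustrative, the
macros and the 0/0xff swap are taken from the patch):

static u64 mlx5r_dmah_to_kernel_vendor_key(u16 st_index, u8 ph)
{
        /* Swap 0 and MLX5_IB_NO_PH (0xff) so that registrations done
         * without a DMAH, and therefore without a PH, keep a zeroed
         * kernel_vendor_key.
         */
        if (ph == MLX5_IB_NO_PH || ph == 0)
                ph ^= MLX5_IB_NO_PH;

        return st_index | ((u64)ph << MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT);
}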
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/infiniband/hw/mlx5/main.c | 7 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 86 +-- drivers/infiniband/hw/mlx5/mr.c | 1141 ++++++---------------------------- drivers/infiniband/hw/mlx5/odp.c | 19 - drivers/infiniband/hw/mlx5/umr.h | 1 + 5 files changed, 186 insertions(+), 1068 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 90daa58126f471d78949fe42581ec3364b34c4ff..ad7b2f280be6914cbe3779d41330602116e948c3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4602,7 +4602,7 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { - mlx5_mkey_cache_cleanup(dev); + mlx5r_frmr_pools_cleanup(&dev->ib_dev); mlx5r_umr_resource_cleanup(dev); mlx5r_umr_cleanup(dev); } @@ -4620,9 +4620,10 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) if (ret) return ret; - ret = mlx5_mkey_cache_init(dev); + ret = mlx5r_frmr_pools_init(&dev->ib_dev); if (ret) - mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + mlx5_ib_warn(dev, "frmr pools init failed %d\n", ret); + return ret; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 09d82d5f95e35414c84fe6258a36b15b25bb6574..da7bd4d6df3c2be3e30e49ac204489129561531c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -641,25 +641,12 @@ enum mlx5_mkey_type { /* Used for non-existent ph value */ #define MLX5_IB_NO_PH 0xff -struct mlx5r_cache_rb_key { - u8 ats:1; - u8 ph; - u16 st_index; - unsigned int access_mode; - unsigned int access_flags; - unsigned int ndescs; -}; - struct mlx5_ib_mkey { u32 key; enum mlx5_mkey_type type; unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; - /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */ - struct mlx5r_cache_rb_key rb_key; - struct mlx5_cache_ent *cache_ent; - u8 cacheable : 1; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) @@ -784,68 +771,6 @@ struct umr_common { struct mutex init_lock; }; -#define NUM_MKEYS_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32)) - -struct mlx5_mkeys_page { - u32 mkeys[NUM_MKEYS_PER_PAGE]; - struct list_head list; -}; -static_assert(sizeof(struct mlx5_mkeys_page) == PAGE_SIZE); - -struct mlx5_mkeys_queue { - struct list_head pages_list; - u32 num_pages; - unsigned long ci; - spinlock_t lock; /* sync list ops */ -}; - -struct mlx5_cache_ent { - struct mlx5_mkeys_queue mkeys_queue; - u32 pending; - - char name[4]; - - struct rb_node node; - struct mlx5r_cache_rb_key rb_key; - - u8 is_tmp:1; - u8 disabled:1; - u8 fill_to_high_water:1; - u8 tmp_cleanup_scheduled:1; - - /* - * - limit is the low water mark for stored mkeys, 2* limit is the - * upper water mark. 
- */ - u32 in_use; - u32 limit; - - /* Statistics */ - u32 miss; - - struct mlx5_ib_dev *dev; - struct delayed_work dwork; -}; - -struct mlx5r_async_create_mkey { - union { - u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; - u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; - }; - struct mlx5_async_work cb_work; - struct mlx5_cache_ent *ent; - u32 mkey; -}; - -struct mlx5_mkey_cache { - struct workqueue_struct *wq; - struct rb_root rb_root; - struct mutex rb_lock; - struct dentry *fs_root; - unsigned long last_add; -}; - struct mlx5_ib_port_resources { struct mlx5_ib_gsi_qp *gsi; struct work_struct pkey_change_work; @@ -1180,8 +1105,6 @@ struct mlx5_ib_dev { struct mlx5_ib_resources devr; atomic_t mkey_var; - struct mlx5_mkey_cache cache; - struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ struct mutex slow_path_mutex; struct ib_odp_caps odp_caps; @@ -1438,13 +1361,8 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); -int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); -void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); -struct mlx5_cache_ent * -mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, - struct mlx5r_cache_rb_key rb_key, - bool persistent_entry); - +int mlx5r_frmr_pools_init(struct ib_device *device); +void mlx5r_frmr_pools_cleanup(struct ib_device *device); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int access_flags, int access_mode, int ndescs); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 325fa04cbe8ae36aeec0f5c9b91bf6530807ebdd..cc5c3daadd5110b905b347e4ec81d80bdff14d93 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -31,7 +31,6 @@ * SOFTWARE. 
*/ - #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include "dm.h" #include "mlx5_ib.h" @@ -46,15 +46,15 @@ #include "data_direct.h" #include "dmah.h" -enum { - MAX_PENDING_REG_MR = 8, -}; - -#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 -static void -create_mkey_callback(int status, struct mlx5_async_work *context); +static int mkey_max_umr_order(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return MLX5_MAX_UMR_EXTENDED_SHIFT; + return MLX5_MAX_UMR_SHIFT; +} + static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, unsigned long page_size, bool populate, @@ -111,23 +111,6 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, return ret; } -static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) -{ - struct mlx5_ib_dev *dev = async_create->ent->dev; - size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); - - MLX5_SET(create_mkey_in, async_create->in, opcode, - MLX5_CMD_OP_CREATE_MKEY); - assign_mkey_variant(dev, &async_create->mkey, async_create->in); - return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, - async_create->out, outlen, create_mkey_callback, - &async_create->cb_work); -} - -static int mkey_cache_max_order(struct mlx5_ib_dev *dev); -static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); - static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); @@ -135,94 +118,6 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); } -static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) -{ - if (status == -ENXIO) /* core driver is not available */ - return; - - mlx5_ib_warn(dev, "async reg mr failed. 
status %d\n", status); - if (status != -EREMOTEIO) /* driver specific failure */ - return; - - /* Failed in FW, print cmd out failure details */ - mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); -} - -static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey) -{ - unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE; - struct mlx5_mkeys_page *page; - - lockdep_assert_held(&ent->mkeys_queue.lock); - if (ent->mkeys_queue.ci >= - ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) { - page = kzalloc(sizeof(*page), GFP_ATOMIC); - if (!page) - return -ENOMEM; - ent->mkeys_queue.num_pages++; - list_add_tail(&page->list, &ent->mkeys_queue.pages_list); - } else { - page = list_last_entry(&ent->mkeys_queue.pages_list, - struct mlx5_mkeys_page, list); - } - - page->mkeys[tmp] = mkey; - ent->mkeys_queue.ci++; - return 0; -} - -static int pop_mkey_locked(struct mlx5_cache_ent *ent) -{ - unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE; - struct mlx5_mkeys_page *last_page; - u32 mkey; - - lockdep_assert_held(&ent->mkeys_queue.lock); - last_page = list_last_entry(&ent->mkeys_queue.pages_list, - struct mlx5_mkeys_page, list); - mkey = last_page->mkeys[tmp]; - last_page->mkeys[tmp] = 0; - ent->mkeys_queue.ci--; - if (ent->mkeys_queue.num_pages > 1 && !tmp) { - list_del(&last_page->list); - ent->mkeys_queue.num_pages--; - kfree(last_page); - } - return mkey; -} - -static void create_mkey_callback(int status, struct mlx5_async_work *context) -{ - struct mlx5r_async_create_mkey *mkey_out = - container_of(context, struct mlx5r_async_create_mkey, cb_work); - struct mlx5_cache_ent *ent = mkey_out->ent; - struct mlx5_ib_dev *dev = ent->dev; - unsigned long flags; - - if (status) { - create_mkey_warn(dev, status, mkey_out->out); - kfree(mkey_out); - spin_lock_irqsave(&ent->mkeys_queue.lock, flags); - ent->pending--; - WRITE_ONCE(dev->fill_delay, 1); - spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); - mod_timer(&dev->delay_timer, jiffies + HZ); - return; - } - - mkey_out->mkey |= mlx5_idx_to_mkey( - MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); - WRITE_ONCE(dev->cache.last_add, jiffies); - - spin_lock_irqsave(&ent->mkeys_queue.lock, flags); - push_mkey_locked(ent, mkey_out->mkey); - ent->pending--; - /* If we are doing fill_to_high_water then keep going. 
*/ - queue_adjust_cache_locked(ent); - spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); - kfree(mkey_out); -} - static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) { int ret = 0; @@ -242,538 +137,6 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) return ret; } -static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) -{ - set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0, - ent->dev->umrc.pd); - MLX5_SET(mkc, mkc, free, 1); - MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); - MLX5_SET(mkc, mkc, access_mode_4_2, - (ent->rb_key.access_mode >> 2) & 0x7); - MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); - - MLX5_SET(mkc, mkc, translations_octword_size, - get_mkc_octo_size(ent->rb_key.access_mode, - ent->rb_key.ndescs)); - MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - - if (ent->rb_key.ph != MLX5_IB_NO_PH) { - MLX5_SET(mkc, mkc, pcie_tph_en, 1); - MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph); - if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) - MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, - ent->rb_key.st_index); - } -} - -/* Asynchronously schedule new MRs to be populated in the cache. */ -static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) -{ - struct mlx5r_async_create_mkey *async_create; - void *mkc; - int err = 0; - int i; - - for (i = 0; i < num; i++) { - async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), - GFP_KERNEL); - if (!async_create) - return -ENOMEM; - mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, - memory_key_mkey_entry); - set_cache_mkc(ent, mkc); - async_create->ent = ent; - - spin_lock_irq(&ent->mkeys_queue.lock); - if (ent->pending >= MAX_PENDING_REG_MR) { - err = -EAGAIN; - goto free_async_create; - } - ent->pending++; - spin_unlock_irq(&ent->mkeys_queue.lock); - - err = mlx5_ib_create_mkey_cb(async_create); - if (err) { - mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); - goto err_create_mkey; - } - } - - return 0; - -err_create_mkey: - spin_lock_irq(&ent->mkeys_queue.lock); - ent->pending--; -free_async_create: - spin_unlock_irq(&ent->mkeys_queue.lock); - kfree(async_create); - return err; -} - -/* Synchronously create a MR in the cache */ -static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) -{ - size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - void *mkc; - u32 *in; - int err; - - in = kzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - set_cache_mkc(ent, mkc); - - err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); - if (err) - goto free_in; - - WRITE_ONCE(ent->dev->cache.last_add, jiffies); -free_in: - kfree(in); - return err; -} - -static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) -{ - u32 mkey; - - lockdep_assert_held(&ent->mkeys_queue.lock); - if (!ent->mkeys_queue.ci) - return; - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(ent->dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); -} - -static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, - bool limit_fill) - __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock) -{ - int err; - - lockdep_assert_held(&ent->mkeys_queue.lock); - - while (true) { - if (limit_fill) - target = ent->limit * 2; - if (target == ent->pending + ent->mkeys_queue.ci) - return 0; - if (target > ent->pending + 
ent->mkeys_queue.ci) { - u32 todo = target - (ent->pending + ent->mkeys_queue.ci); - - spin_unlock_irq(&ent->mkeys_queue.lock); - err = add_keys(ent, todo); - if (err == -EAGAIN) - usleep_range(3000, 5000); - spin_lock_irq(&ent->mkeys_queue.lock); - if (err) { - if (err != -EAGAIN) - return err; - } else - return 0; - } else { - remove_cache_mr_locked(ent); - } - } -} - -static ssize_t size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *pos) -{ - struct mlx5_cache_ent *ent = filp->private_data; - u32 target; - int err; - - err = kstrtou32_from_user(buf, count, 0, &target); - if (err) - return err; - - /* - * Target is the new value of total_mrs the user requests, however we - * cannot free MRs that are in use. Compute the target value for stored - * mkeys. - */ - spin_lock_irq(&ent->mkeys_queue.lock); - if (target < ent->in_use) { - err = -EINVAL; - goto err_unlock; - } - target = target - ent->in_use; - if (target < ent->limit || target > ent->limit*2) { - err = -EINVAL; - goto err_unlock; - } - err = resize_available_mrs(ent, target, false); - if (err) - goto err_unlock; - spin_unlock_irq(&ent->mkeys_queue.lock); - - return count; - -err_unlock: - spin_unlock_irq(&ent->mkeys_queue.lock); - return err; -} - -static ssize_t size_read(struct file *filp, char __user *buf, size_t count, - loff_t *pos) -{ - struct mlx5_cache_ent *ent = filp->private_data; - char lbuf[20]; - int err; - - err = snprintf(lbuf, sizeof(lbuf), "%ld\n", - ent->mkeys_queue.ci + ent->in_use); - if (err < 0) - return err; - - return simple_read_from_buffer(buf, count, pos, lbuf, err); -} - -static const struct file_operations size_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .write = size_write, - .read = size_read, -}; - -static ssize_t limit_write(struct file *filp, const char __user *buf, - size_t count, loff_t *pos) -{ - struct mlx5_cache_ent *ent = filp->private_data; - u32 var; - int err; - - err = kstrtou32_from_user(buf, count, 0, &var); - if (err) - return err; - - /* - * Upon set we immediately fill the cache to high water mark implied by - * the limit. - */ - spin_lock_irq(&ent->mkeys_queue.lock); - ent->limit = var; - err = resize_available_mrs(ent, 0, true); - spin_unlock_irq(&ent->mkeys_queue.lock); - if (err) - return err; - return count; -} - -static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, - loff_t *pos) -{ - struct mlx5_cache_ent *ent = filp->private_data; - char lbuf[20]; - int err; - - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); - if (err < 0) - return err; - - return simple_read_from_buffer(buf, count, pos, lbuf, err); -} - -static const struct file_operations limit_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .write = limit_write, - .read = limit_read, -}; - -static bool someone_adding(struct mlx5_mkey_cache *cache) -{ - struct mlx5_cache_ent *ent; - struct rb_node *node; - bool ret; - - mutex_lock(&cache->rb_lock); - for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - spin_lock_irq(&ent->mkeys_queue.lock); - ret = ent->mkeys_queue.ci < ent->limit; - spin_unlock_irq(&ent->mkeys_queue.lock); - if (ret) { - mutex_unlock(&cache->rb_lock); - return true; - } - } - mutex_unlock(&cache->rb_lock); - return false; -} - -/* - * Check if the bucket is outside the high/low water mark and schedule an async - * update. The cache refill has hysteresis, once the low water mark is hit it is - * refilled up to the high mark. 
- */ -static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) -{ - lockdep_assert_held(&ent->mkeys_queue.lock); - - if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) - return; - if (ent->mkeys_queue.ci < ent->limit) { - ent->fill_to_high_water = true; - mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); - } else if (ent->fill_to_high_water && - ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) { - /* - * Once we start populating due to hitting a low water mark - * continue until we pass the high water mark. - */ - mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); - } else if (ent->mkeys_queue.ci == 2 * ent->limit) { - ent->fill_to_high_water = false; - } else if (ent->mkeys_queue.ci > 2 * ent->limit) { - /* Queue deletion of excess entries */ - ent->fill_to_high_water = false; - if (ent->pending) - queue_delayed_work(ent->dev->cache.wq, &ent->dwork, - secs_to_jiffies(1)); - else - mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); - } -} - -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - ent->tmp_cleanup_scheduled = false; - spin_unlock_irq(&ent->mkeys_queue.lock); -} - -static void __cache_work_func(struct mlx5_cache_ent *ent) -{ - struct mlx5_ib_dev *dev = ent->dev; - struct mlx5_mkey_cache *cache = &dev->cache; - int err; - - spin_lock_irq(&ent->mkeys_queue.lock); - if (ent->disabled) - goto out; - - if (ent->fill_to_high_water && - ent->mkeys_queue.ci + ent->pending < 2 * ent->limit && - !READ_ONCE(dev->fill_delay)) { - spin_unlock_irq(&ent->mkeys_queue.lock); - err = add_keys(ent, 1); - spin_lock_irq(&ent->mkeys_queue.lock); - if (ent->disabled) - goto out; - if (err) { - /* - * EAGAIN only happens if there are pending MRs, so we - * will be rescheduled when storing them. The only - * failure path here is ENOMEM. - */ - if (err != -EAGAIN) { - mlx5_ib_warn( - dev, - "add keys command failed, err %d\n", - err); - queue_delayed_work(cache->wq, &ent->dwork, - secs_to_jiffies(1)); - } - } - } else if (ent->mkeys_queue.ci > 2 * ent->limit) { - bool need_delay; - - /* - * The remove_cache_mr() logic is performed as garbage - * collection task. Such task is intended to be run when no - * other active processes are running. - * - * The need_resched() will return TRUE if there are user tasks - * to be activated in near future. - * - * In such case, we don't execute remove_cache_mr() and postpone - * the garbage collection work to try to run in next cycle, in - * order to free CPU resources to other tasks. 
- */ - spin_unlock_irq(&ent->mkeys_queue.lock); - need_delay = need_resched() || someone_adding(cache) || - !time_after(jiffies, - READ_ONCE(cache->last_add) + 300 * HZ); - spin_lock_irq(&ent->mkeys_queue.lock); - if (ent->disabled) - goto out; - if (need_delay) { - queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); - goto out; - } - remove_cache_mr_locked(ent); - queue_adjust_cache_locked(ent); - } -out: - spin_unlock_irq(&ent->mkeys_queue.lock); -} - -static void delayed_cache_work_func(struct work_struct *work) -{ - struct mlx5_cache_ent *ent; - - ent = container_of(work, struct mlx5_cache_ent, dwork.work); - /* temp entries are never filled, only cleaned */ - if (ent->is_tmp) - clean_keys(ent->dev, ent); - else - __cache_work_func(ent); -} - -static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, - struct mlx5r_cache_rb_key key2) -{ - int res; - - res = key1.ats - key2.ats; - if (res) - return res; - - res = key1.access_mode - key2.access_mode; - if (res) - return res; - - res = key1.access_flags - key2.access_flags; - if (res) - return res; - - res = key1.st_index - key2.st_index; - if (res) - return res; - - res = key1.ph - key2.ph; - if (res) - return res; - - /* - * keep ndescs the last in the compare table since the find function - * searches for an exact match on all properties and only closest - * match in size. - */ - return key1.ndescs - key2.ndescs; -} - -static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, - struct mlx5_cache_ent *ent) -{ - struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; - struct mlx5_cache_ent *cur; - int cmp; - - /* Figure out where to put new node */ - while (*new) { - cur = rb_entry(*new, struct mlx5_cache_ent, node); - parent = *new; - cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); - if (cmp > 0) - new = &((*new)->rb_left); - if (cmp < 0) - new = &((*new)->rb_right); - if (cmp == 0) - return -EEXIST; - } - - /* Add new node and rebalance tree. */ - rb_link_node(&ent->node, parent, new); - rb_insert_color(&ent->node, &cache->rb_root); - - return 0; -} - -static struct mlx5_cache_ent * -mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, - struct mlx5r_cache_rb_key rb_key) -{ - struct rb_node *node = dev->cache.rb_root.rb_node; - struct mlx5_cache_ent *cur, *smallest = NULL; - u64 ndescs_limit; - int cmp; - - /* - * Find the smallest ent with order >= requested_order. - */ - while (node) { - cur = rb_entry(node, struct mlx5_cache_ent, node); - cmp = cache_ent_key_cmp(cur->rb_key, rb_key); - if (cmp > 0) { - smallest = cur; - node = node->rb_left; - } - if (cmp < 0) - node = node->rb_right; - if (cmp == 0) - return cur; - } - - /* - * Limit the usage of mkeys larger than twice the required size while - * also allowing the usage of smallest cache entry for small MRs. - */ - ndescs_limit = max_t(u64, rb_key.ndescs * 2, - MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); - - return (smallest && - smallest->rb_key.access_mode == rb_key.access_mode && - smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats && - smallest->rb_key.st_index == rb_key.st_index && - smallest->rb_key.ph == rb_key.ph && - smallest->rb_key.ndescs <= ndescs_limit) ? 
- smallest : - NULL; -} - -static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent) -{ - struct mlx5_ib_mr *mr; - int err; - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return ERR_PTR(-ENOMEM); - - spin_lock_irq(&ent->mkeys_queue.lock); - ent->in_use++; - - if (!ent->mkeys_queue.ci) { - queue_adjust_cache_locked(ent); - ent->miss++; - spin_unlock_irq(&ent->mkeys_queue.lock); - err = create_cache_mkey(ent, &mr->mmkey.key); - if (err) { - spin_lock_irq(&ent->mkeys_queue.lock); - ent->in_use--; - spin_unlock_irq(&ent->mkeys_queue.lock); - kfree(mr); - return ERR_PTR(err); - } - } else { - mr->mmkey.key = pop_mkey_locked(ent); - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - } - mr->mmkey.cache_ent = ent; - mr->mmkey.type = MLX5_MKEY_MR; - mr->mmkey.rb_key = ent->rb_key; - mr->mmkey.cacheable = true; - init_waitqueue_head(&mr->mmkey.wait); - return mr; -} - static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, int access_flags) { @@ -798,256 +161,195 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, return ret; } -struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - int access_flags, int access_mode, - int ndescs) -{ - struct mlx5r_cache_rb_key rb_key = { - .ndescs = ndescs, - .access_mode = access_mode, - .access_flags = get_unchangeable_access_flags(dev, access_flags), - .ph = MLX5_IB_NO_PH, - }; - struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); +#define MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK 1 +#define MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT 16 +#define MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK 0xFF0000 +#define MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK 0xFFFF - if (!ent) - return ERR_PTR(-EOPNOTSUPP); - - return _mlx5_mr_cache_alloc(dev, ent); -} - -static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) -{ - if (!mlx5_debugfs_root || dev->is_rep) - return; - - debugfs_remove_recursive(dev->cache.fs_root); - dev->cache.fs_root = NULL; -} - -static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent) +static struct mlx5_ib_mr * +_mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int access_flags, int access_mode, + unsigned long page_size, u16 st_index, u8 ph) { - int order = order_base_2(ent->rb_key.ndescs); - struct dentry *dir; - - if (!mlx5_debugfs_root || dev->is_rep) - return; - - if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) - order = MLX5_IMR_KSM_CACHE_ENTRY + 2; - - sprintf(ent->name, "%d", order); - dir = debugfs_create_dir(ent->name, dev->cache.fs_root); - debugfs_create_file("size", 0600, dir, ent, &size_fops); - debugfs_create_file("limit", 0600, dir, ent, &limit_fops); - debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci); - debugfs_create_u32("miss", 0600, dir, &ent->miss); -} + struct mlx5_ib_mr *mr; + int err; -static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) -{ - struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev); - struct mlx5_mkey_cache *cache = &dev->cache; + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); - if (!mlx5_debugfs_root || dev->is_rep) - return; + mr->ibmr.frmr.key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); + mr->ibmr.frmr.key.access_flags = + get_unchangeable_access_flags(dev, access_flags); + mr->ibmr.frmr.key.num_dma_blocks = + ib_umem_num_dma_blocks(umem, page_size); + mr->ibmr.frmr.key.vendor_key = + access_mode == MLX5_MKC_ACCESS_MODE_KSM ? 
+ MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK : + 0; + + /* Normalize ph: swap 0 and MLX5_IB_NO_PH */ + if (ph == MLX5_IB_NO_PH || ph == 0) + ph ^= MLX5_IB_NO_PH; + + mr->ibmr.frmr.key.kernel_vendor_key = + st_index | (ph << MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT); + err = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr); + if (err) { + kfree(mr); + return ERR_PTR(err); + } + mr->mmkey.key = mr->ibmr.frmr.handle; + init_waitqueue_head(&mr->mmkey.wait); - cache->fs_root = debugfs_create_dir("mr_cache", dbg_root); + return mr; } -static void delay_time_func(struct timer_list *t) +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + int access_flags, int access_mode, + int ndescs) { - struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer); - - WRITE_ONCE(dev->fill_delay, 0); -} + struct ib_frmr_key key = { + .access_flags = + get_unchangeable_access_flags(dev, access_flags), + .vendor_key = access_mode == MLX5_MKC_ACCESS_MODE_MTT ? + 0 : + MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK, + .num_dma_blocks = ndescs, + .kernel_vendor_key = 0, /* no PH and no ST index */ + }; + struct mlx5_ib_mr *mr; + int ret; -static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent) -{ - struct mlx5_mkeys_page *page; + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); - page = kzalloc(sizeof(*page), GFP_KERNEL); - if (!page) - return -ENOMEM; - INIT_LIST_HEAD(&ent->mkeys_queue.pages_list); - spin_lock_init(&ent->mkeys_queue.lock); - list_add_tail(&page->list, &ent->mkeys_queue.pages_list); - ent->mkeys_queue.num_pages++; - return 0; -} + init_waitqueue_head(&mr->mmkey.wait); -static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent) -{ - struct mlx5_mkeys_page *page; + mr->ibmr.frmr.key = key; + ret = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr); + if (ret) { + kfree(mr); + return ERR_PTR(ret); + } + mr->mmkey.key = mr->ibmr.frmr.handle; + mr->mmkey.type = MLX5_MKEY_MR; - WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1); - page = list_last_entry(&ent->mkeys_queue.pages_list, - struct mlx5_mkeys_page, list); - list_del(&page->list); - kfree(page); + return mr; } -struct mlx5_cache_ent * -mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, - struct mlx5r_cache_rb_key rb_key, - bool persistent_entry) +static int mlx5r_create_mkeys(struct ib_device *device, struct ib_frmr_key *key, + u32 *handles, unsigned int count) { - struct mlx5_cache_ent *ent; - int order; - int ret; + int access_mode = + key->vendor_key & MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK ? 
+ MLX5_MKC_ACCESS_MODE_KSM : + MLX5_MKC_ACCESS_MODE_MTT; - ent = kzalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - return ERR_PTR(-ENOMEM); + struct mlx5_ib_dev *dev = to_mdev(device); + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + u16 st_index; + void *mkc; + u32 *in; + int err, i; + u8 ph; - ret = mlx5r_mkeys_init(ent); - if (ret) - goto mkeys_err; - ent->rb_key = rb_key; - ent->dev = dev; - ent->is_tmp = !persistent_entry; + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + set_mkc_access_pd_addr_fields(mkc, key->access_flags, 0, dev->umrc.pd); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, ma_translation_mode, !!key->ats); + MLX5_SET(mkc, mkc, translations_octword_size, + get_mkc_octo_size(access_mode, key->num_dma_blocks)); + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - ret = mlx5_cache_ent_insert(&dev->cache, ent); - if (ret) - goto ent_insert_err; - - if (persistent_entry) { - if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) - order = MLX5_IMR_KSM_CACHE_ENTRY; - else - order = order_base_2(rb_key.ndescs) - 2; - - if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && - !dev->is_rep && mlx5_core_is_pf(dev->mdev) && - mlx5r_umr_can_load_pas(dev, 0)) - ent->limit = dev->mdev->profile.mr_cache[order].limit; - else - ent->limit = 0; - - mlx5_mkey_cache_debugfs_add_ent(dev, ent); + st_index = key->kernel_vendor_key & + MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK; + ph = key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK; + if (ph) { + /* Normalize ph: swap MLX5_IB_NO_PH for 0 */ + if (ph == MLX5_IB_NO_PH) + ph = 0; + MLX5_SET(mkc, mkc, pcie_tph_en, 1); + MLX5_SET(mkc, mkc, pcie_tph_ph, ph); + if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, + st_index); } - return ent; -ent_insert_err: - mlx5r_mkeys_uninit(ent); -mkeys_err: - kfree(ent); - return ERR_PTR(ret); -} - -static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev) -{ - struct rb_root *root = &dev->cache.rb_root; - struct mlx5_cache_ent *ent; - struct rb_node *node; - - mutex_lock(&dev->cache.rb_lock); - node = rb_first(root); - while (node) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - node = rb_next(node); - clean_keys(dev, ent); - rb_erase(&ent->node, root); - mlx5r_mkeys_uninit(ent); - kfree(ent); + for (i = 0; i < count; i++) { + assign_mkey_variant(dev, handles + i, in); + err = mlx5_core_create_mkey(dev->mdev, handles + i, in, inlen); + if (err) + goto free_in; } - mutex_unlock(&dev->cache.rb_lock); +free_in: + kfree(in); + if (err) + for (; i > 0; i--) + mlx5_core_destroy_mkey(dev->mdev, handles[i]); + return err; } -int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) +static void mlx5r_destroy_mkeys(struct ib_device *device, u32 *handles, + unsigned int count) { - struct mlx5_mkey_cache *cache = &dev->cache; - struct rb_root *root = &dev->cache.rb_root; - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - .ph = MLX5_IB_NO_PH, - }; - struct mlx5_cache_ent *ent; - struct rb_node *node; - int ret; - int i; + struct mlx5_ib_dev *dev = to_mdev(device); + int i, err; - mutex_init(&dev->slow_path_mutex); - mutex_init(&dev->cache.rb_lock); - dev->cache.rb_root = RB_ROOT; - cache->wq = 
alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); - if (!cache->wq) { - mlx5_ib_warn(dev, "failed to create work queue\n"); - return -ENOMEM; + for (i = 0; i < count; i++) { + err = mlx5_core_destroy_mkey(dev->mdev, handles[i]); + if (err) + pr_warn_ratelimited( + "mlx5_ib: failed to destroy mkey %d: %d", + handles[i], err); } +} - mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); - timer_setup(&dev->delay_timer, delay_time_func, 0); - mlx5_mkey_cache_debugfs_init(dev); - mutex_lock(&cache->rb_lock); - for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; - ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); - if (IS_ERR(ent)) { - ret = PTR_ERR(ent); - goto err; - } - } +static int mlx5r_build_frmr_key(struct ib_device *device, + const struct ib_frmr_key *in, + struct ib_frmr_key *out) +{ + struct mlx5_ib_dev *dev = to_mdev(device); - ret = mlx5_odp_init_mkey_cache(dev); - if (ret) - goto err; + /* check HW capabilities of users requested frmr key */ + if ((in->ats && !MLX5_CAP_GEN(dev->mdev, ats)) || + ilog2(in->num_dma_blocks) > mkey_max_umr_order(dev)) + return -EOPNOTSUPP; - mutex_unlock(&cache->rb_lock); - for (node = rb_first(root); node; node = rb_next(node)) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - spin_lock_irq(&ent->mkeys_queue.lock); - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - } + out->ats = in->ats; + out->access_flags = + get_unchangeable_access_flags(dev, in->access_flags); + out->vendor_key = in->vendor_key; + out->num_dma_blocks = in->num_dma_blocks; return 0; - -err: - mutex_unlock(&cache->rb_lock); - mlx5_mkey_cache_debugfs_cleanup(dev); - mlx5r_destroy_cache_entries(dev); - destroy_workqueue(cache->wq); - mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); - return ret; } -void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) -{ - struct rb_root *root = &dev->cache.rb_root; - struct mlx5_cache_ent *ent; - struct rb_node *node; - - if (!dev->cache.wq) - return; - - mutex_lock(&dev->cache.rb_lock); - for (node = rb_first(root); node; node = rb_next(node)) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - spin_lock_irq(&ent->mkeys_queue.lock); - ent->disabled = true; - spin_unlock_irq(&ent->mkeys_queue.lock); - cancel_delayed_work(&ent->dwork); - } - mutex_unlock(&dev->cache.rb_lock); - - /* - * After all entries are disabled and will not reschedule on WQ, - * flush it and all async commands. - */ - flush_workqueue(dev->cache.wq); +static struct ib_frmr_pool_ops mlx5r_frmr_pool_ops = { + .create_frmrs = mlx5r_create_mkeys, + .destroy_frmrs = mlx5r_destroy_mkeys, + .build_key = mlx5r_build_frmr_key, +}; - mlx5_mkey_cache_debugfs_cleanup(dev); - mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); +int mlx5r_frmr_pools_init(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); - /* At this point all entries are disabled and have no concurrent work. 
*/ - mlx5r_destroy_cache_entries(dev); + mutex_init(&dev->slow_path_mutex); + return ib_frmr_pools_init(device, &mlx5r_frmr_pool_ops); +} - destroy_workqueue(dev->cache.wq); - timer_delete_sync(&dev->delay_timer); +void mlx5r_frmr_pools_cleanup(struct ib_device *device) +{ + ib_frmr_pools_cleanup(device); } struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) @@ -1109,13 +411,6 @@ static int get_octo_len(u64 addr, u64 len, int page_shift) return (npages + 1) / 2; } -static int mkey_cache_max_order(struct mlx5_ib_dev *dev) -{ - if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MKEY_CACHE_LAST_STD_ENTRY; - return MLX5_MAX_UMR_SHIFT; -} - static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 length, int access_flags, u64 iova) { @@ -1144,8 +439,6 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, u16 st_index, u8 ph) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5r_cache_rb_key rb_key = {}; - struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned long page_size; @@ -1157,33 +450,12 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); - rb_key.access_mode = access_mode; - rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); - rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); - rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); - rb_key.st_index = st_index; - rb_key.ph = ph; - ent = mkey_cache_ent_from_rb_key(dev, rb_key); - /* - * If the MR can't come from the cache then synchronously create an uncached - * one. - */ - if (!ent) { - mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode, - st_index, ph); - mutex_unlock(&dev->slow_path_mutex); - if (IS_ERR(mr)) - return mr; - mr->mmkey.rb_key = rb_key; - mr->mmkey.cacheable = true; - return mr; - } - - mr = _mlx5_mr_cache_alloc(dev, ent); + mr = _mlx5_frmr_pool_alloc(dev, umem, access_flags, access_mode, + page_size, st_index, ph); if (IS_ERR(mr)) return mr; + mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.pd = pd; mr->umem = umem; mr->page_shift = order_base_2(page_size); @@ -1812,18 +1084,24 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, unsigned long *page_size) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + u8 access_mode; - /* We only track the allocated sizes of MRs from the cache */ - if (!mr->mmkey.cache_ent) + /* We only track the allocated sizes of MRs from the frmr pools */ + if (!mr->ibmr.frmr.pool) return false; if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = mlx5_umem_mkc_find_best_pgsz( - dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode); + access_mode = mr->ibmr.frmr.key.vendor_key & + MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK ? 
+ MLX5_MKC_ACCESS_MODE_KSM : + MLX5_MKC_ACCESS_MODE_MTT; + + *page_size = + mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova, access_mode); if (WARN_ON(!*page_size)) return false; - return (mr->mmkey.cache_ent->rb_key.ndescs) >= + return (mr->ibmr.frmr.key.num_dma_blocks) >= ib_umem_num_dma_blocks(new_umem, *page_size); } @@ -1884,7 +1162,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct || - mr->mmkey.rb_key.ph != MLX5_IB_NO_PH) + (mr->ibmr.frmr.key.kernel_vendor_key & + MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK) != 0) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -2025,47 +1304,6 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, - struct mlx5_ib_mr *mr) -{ - struct mlx5_mkey_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent; - int ret; - - if (mr->mmkey.cache_ent) { - spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - goto end; - } - - mutex_lock(&cache->rb_lock); - ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); - if (ent) { - if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { - if (ent->disabled) { - mutex_unlock(&cache->rb_lock); - return -EOPNOTSUPP; - } - mr->mmkey.cache_ent = ent; - spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - mutex_unlock(&cache->rb_lock); - goto end; - } - } - - ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false); - mutex_unlock(&cache->rb_lock); - if (IS_ERR(ent)) - return PTR_ERR(ent); - - mr->mmkey.cache_ent = ent; - spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - -end: - ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key); - spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - return ret; -} - static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); @@ -2131,33 +1369,12 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) bool is_odp_dma_buf = is_dmabuf_mr(mr) && !to_ib_umem_dmabuf(mr->umem)->pinned; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; bool is_odp = is_odp_mr(mr); - bool from_cache = !!ent; int ret; - if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) && - !cache_ent_find_and_store(dev, mr)) { - ent = mr->mmkey.cache_ent; - /* upon storing to a clean temp entry - schedule its cleanup */ - spin_lock_irq(&ent->mkeys_queue.lock); - if (from_cache) - ent->in_use--; - if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { - mod_delayed_work(ent->dev->cache.wq, &ent->dwork, - secs_to_jiffies(30)); - ent->tmp_cleanup_scheduled = true; - } - spin_unlock_irq(&ent->mkeys_queue.lock); + if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) && + !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr)) return 0; - } - - if (ent) { - spin_lock_irq(&ent->mkeys_queue.lock); - ent->in_use--; - mr->mmkey.cache_ent = NULL; - spin_unlock_irq(&ent->mkeys_queue.lock); - } if (is_odp) mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); @@ -2241,7 +1458,7 @@ static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) mlx5_ib_free_odp_mr(mr); } - if (!mr->mmkey.cache_ent) + if (!mr->ibmr.frmr.pool) mlx5_free_priv_descs(mr); kfree(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 6441abdf1f3b688ab25174885a8b843e7afb65bd..aefc9506a634f1e9a0dd51f09aa4f319684e0c16 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1878,25 +1878,6 @@ 
mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) return err; } -int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) -{ - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_KSM, - .ndescs = mlx5_imr_ksm_entries, - .ph = MLX5_IB_NO_PH, - }; - struct mlx5_cache_ent *ent; - - if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) - return 0; - - ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); - if (IS_ERR(ent)) - return PTR_ERR(ent); - - return 0; -} - static const struct ib_device_ops mlx5_ib_dev_odp_ops = { .advise_mr = mlx5_ib_advise_mr, }; diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index e9361f0140e7b49ea3cf59a6093bf766d8dfebbb..7eeaf6a94c9743ac10f7f62069f603468da75565 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -9,6 +9,7 @@ #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) +#define MLX5_MAX_UMR_EXTENDED_SHIFT 43 #define MLX5_IB_UMR_OCTOWORD 16 #define MLX5_IB_UMR_XLT_ALIGNMENT 64 -- 2.47.1 From: Michael Guralnik Following mlx5_ib move to using FRMR pools, drop all unused code of MR cache. Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 67 +------------------------- include/linux/mlx5/driver.h | 11 ----- 2 files changed, 1 insertion(+), 77 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index df93625c9dfa3a11b769acdcab1320a6a4aeb4b0..cb2a58c789e992f8b06e9108c3ecc41e14276d65 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -110,74 +110,9 @@ static struct mlx5_profile profile[] = { }, [2] = { - .mask = MLX5_PROF_MASK_QP_SIZE | - MLX5_PROF_MASK_MR_CACHE, + .mask = MLX5_PROF_MASK_QP_SIZE, .log_max_qp = LOG_MAX_SUPPORTED_QPS, .num_cmd_caches = MLX5_NUM_COMMAND_CACHES, - .mr_cache[0] = { - .size = 500, - .limit = 250 - }, - .mr_cache[1] = { - .size = 500, - .limit = 250 - }, - .mr_cache[2] = { - .size = 500, - .limit = 250 - }, - .mr_cache[3] = { - .size = 500, - .limit = 250 - }, - .mr_cache[4] = { - .size = 500, - .limit = 250 - }, - .mr_cache[5] = { - .size = 500, - .limit = 250 - }, - .mr_cache[6] = { - .size = 500, - .limit = 250 - }, - .mr_cache[7] = { - .size = 500, - .limit = 250 - }, - .mr_cache[8] = { - .size = 500, - .limit = 250 - }, - .mr_cache[9] = { - .size = 500, - .limit = 250 - }, - .mr_cache[10] = { - .size = 500, - .limit = 250 - }, - .mr_cache[11] = { - .size = 500, - .limit = 250 - }, - .mr_cache[12] = { - .size = 64, - .limit = 32 - }, - .mr_cache[13] = { - .size = 32, - .limit = 16 - }, - .mr_cache[14] = { - .size = 16, - .limit = 8 - }, - .mr_cache[15] = { - .size = 8, - .limit = 4 - }, }, [3] = { .mask = MLX5_PROF_MASK_QP_SIZE, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5405ca1038f9ea175ea5bc028e801bb8d7de9311..975cd8705a58f68f2ff101b72c893b8a882b2806 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -699,23 +699,12 @@ struct mlx5_st; enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -}; - -enum { - MKEY_CACHE_LAST_STD_ENTRY = 20, - MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MKEY_CACHE_ENTRIES }; struct mlx5_profile { u64 mask; u8 log_max_qp; u8 num_cmd_caches; - struct { - int size; - int limit; - } mr_cache[MAX_MKEY_CACHE_ENTRIES]; }; struct mlx5_hca_cap { -- 2.47.1 From: 
Michael Guralnik Add support for a new command in netlink to dump to user the state of the FRMR pools on the devices. Expose each pool with its key and the usage statistics for it. Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji --- drivers/infiniband/core/nldev.c | 254 +++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 17 +++ 2 files changed, 271 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 2220a2dfab240eaef2eb64d8e45cb221dfa25614..6cdf6073fdf9c51ee291a63bb86ac690b094aa9f 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -37,11 +37,13 @@ #include #include #include +#include #include "core_priv.h" #include "cma_priv.h" #include "restrack.h" #include "uverbs.h" +#include "frmr_pools.h" /* * This determines whether a non-privileged user is allowed to specify a @@ -172,6 +174,16 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_FRMR_POOLS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2637,6 +2649,244 @@ static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, return ib_del_sub_device_and_put(device); } +static int fill_frmr_pool_key(struct sk_buff *msg, struct ib_frmr_key *key) +{ + struct nlattr *key_attr; + + key_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY); + if (!key_attr) + return -EMSGSIZE; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS, key->ats)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS, + key->access_flags)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY, + key->vendor_key, RDMA_NLDEV_ATTR_PAD)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS, + key->num_dma_blocks, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, key_attr); + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_frmr_pool_entry(struct sk_buff *msg, struct ib_frmr_pool *pool) +{ + if (fill_frmr_pool_key(msg, &pool->key)) + return -EMSGSIZE; + + spin_lock(&pool->lock); + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, + pool->queue.ci + pool->inactive_queue.ci)) + goto err_unlock; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, + pool->max_in_use, RDMA_NLDEV_ATTR_PAD)) + goto err_unlock; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, + pool->in_use, RDMA_NLDEV_ATTR_PAD)) + goto err_unlock; + spin_unlock(&pool->lock); + + return 0; + +err_unlock: + spin_unlock(&pool->lock); + return -EMSGSIZE; +} + +static int fill_frmr_pools_info(struct sk_buff *msg, struct ib_device *device) +{ + struct ib_frmr_pools *pools = 
device->frmr_pools; + struct ib_frmr_pool *pool; + struct nlattr *table_attr; + struct rb_node *node; + + if (!pools) + return 0; + + read_lock(&pools->rb_lock); + if (RB_EMPTY_ROOT(&pools->rb_root)) { + read_unlock(&pools->rb_lock); + return 0; + } + read_unlock(&pools->rb_lock); + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_FRMR_POOLS); + if (!table_attr) + return -EMSGSIZE; + + read_lock(&pools->rb_lock); + for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { + pool = rb_entry(node, struct ib_frmr_pool, node); + if (fill_frmr_pool_entry(msg, pool)) + goto err; + } + read_unlock(&pools->rb_lock); + + nla_nest_end(msg, table_attr); + return 0; + +err: + read_unlock(&pools->rb_lock); + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int nldev_frmr_pools_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + NL_VALIDATE_LIBERAL, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_FRMR_POOLS_GET), + 0, 0); + if (!nlh || fill_nldev_handle(msg, device)) { + ret = -EMSGSIZE; + goto err_free; + } + + ret = fill_frmr_pools_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + ib_device_put(device); + + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_frmr_pools *pools; + int err, ret = 0, idx = 0; + struct ib_frmr_pool *pool; + struct nlattr *table_attr; + struct nlattr *entry_attr; + struct ib_device *device; + int start = cb->args[0]; + struct rb_node *node; + struct nlmsghdr *nlh; + bool filled = false; + + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + device = ib_device_get_by_index( + sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); + if (!device) + return -EINVAL; + + pools = device->frmr_pools; + if (!pools) { + ib_device_put(device); + return 0; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_FRMR_POOLS_GET), + 0, NLM_F_MULTI); + + if (!nlh || fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start_noflag(skb, RDMA_NLDEV_ATTR_FRMR_POOLS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + read_lock(&pools->rb_lock); + for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { + pool = rb_entry(node, struct ib_frmr_pool, node); + if (pool->key.kernel_vendor_key) + continue; + + if (idx < start) { + idx++; + continue; + } + + filled = true; + + entry_attr = nla_nest_start_noflag( + skb, RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY); + if (!entry_attr) { + ret = -EMSGSIZE; + goto end_msg; + } + + if 
(fill_frmr_pool_entry(skb, pool)) { + nla_nest_cancel(skb, entry_attr); + ret = -EMSGSIZE; + goto end_msg; + } + + nla_nest_end(skb, entry_attr); + idx++; + } +end_msg: + read_unlock(&pools->rb_lock); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more entries to fill, cancel the message and + * return 0 to mark end of dumpit. + */ + if (!filled) + goto err; + + ib_device_put(device); + return skb->len; + +err: + nlmsg_cancel(skb, nlh); + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2743,6 +2993,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_deldev, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_FRMR_POOLS_GET] = { + .doit = nldev_frmr_pools_get_doit, + .dump = nldev_frmr_pools_get_dumpit, + }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f41f0228fcd0e0b74e74b4d87611546b00f799a1..8f17ffe0190cb86131109209c45caec155ab36da 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -308,6 +308,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_MONITOR, + RDMA_NLDEV_CMD_FRMR_POOLS_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -582,6 +584,21 @@ enum rdma_nldev_attr { RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, /* u8 */ RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, /* u8 */ + + /* + * FRMR Pools attributes + */ + RDMA_NLDEV_ATTR_FRMR_POOLS, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS, /* u8 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ + /* * Always the end */ -- 2.47.1 From: Michael Guralnik Allow users to set FRMR pools aging timer through netlink. This functionality will allow user to control how long handles reside in the kernel before being destroyed, thus being able to tune the tradeoff between memory and HW object consumption and memory registration optimization. Since FRMR pools is highly beneficial for application restart scenarios, this command allows users to modify the aging timer to their application restart time, making sure the FRMR handles deregistered on application teardown are kept for long enough in the pools for reuse in the application startup. 
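As an illustration only and not part of this patch, a minimal libnl-3 sketch of how a privileged userspace tool might issue this command is shown below. The helper name, the way dev_index is obtained and the trimmed error handling are assumptions about a hypothetical consumer; only the RDMA_NLDEV_* identifiers come from this series, and the command needs CAP_NET_ADMIN since it is registered with RDMA_NL_ADMIN_PERM.

/* Hypothetical userspace sketch, assuming libnl-3 and this series' uapi. */
#include <stdint.h>
#include <linux/netlink.h>
#include <netlink/netlink.h>
#include <netlink/msg.h>
#include <netlink/attr.h>
#include <rdma/rdma_netlink.h>

static int set_frmr_aging_period(uint32_t dev_index, uint32_t period_sec)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int err = -1;

	if (!sk || !msg)
		goto out;

	/* nldev commands travel over the NETLINK_RDMA family. */
	if (nl_connect(sk, NETLINK_RDMA))
		goto out;

	nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
		  RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_FRMR_POOLS_SET),
		  0, NLM_F_REQUEST | NLM_F_ACK);
	nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, dev_index);
	nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, period_sec);

	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_wait_for_ack(sk);
out:
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0 ? err : 0;
}
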
Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji --- drivers/infiniband/core/frmr_pools.c | 31 ++++++++++++++++++++++++++++-- drivers/infiniband/core/frmr_pools.h | 2 ++ drivers/infiniband/core/nldev.c | 37 ++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 3 +++ 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 254113d2442d5d6956587a1c444dc74cd48204fb..b150bb78de3c4fd89990f7aed7874e4db94eac0a 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -174,7 +174,7 @@ static void pool_aging_work(struct work_struct *work) if (has_work) queue_delayed_work( pools->aging_wq, &pool->aging_work, - secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); + secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); } static void destroy_frmr_pool(struct ib_device *device, @@ -214,6 +214,8 @@ int ib_frmr_pools_init(struct ib_device *device, return -ENOMEM; } + pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS; + device->frmr_pools = pools; return 0; } @@ -249,6 +251,31 @@ void ib_frmr_pools_cleanup(struct ib_device *device) } EXPORT_SYMBOL(ib_frmr_pools_cleanup); +int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + struct rb_node *node; + + if (!pools) + return -EINVAL; + + if (period_sec == 0) + return -EINVAL; + + WRITE_ONCE(pools->aging_period_sec, period_sec); + + read_lock(&pools->rb_lock); + for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { + pool = rb_entry(node, struct ib_frmr_pool, node); + mod_delayed_work(pools->aging_wq, &pool->aging_work, + secs_to_jiffies(period_sec)); + } + read_unlock(&pools->rb_lock); + + return 0; +} + static int compare_keys(struct ib_frmr_key *key1, struct ib_frmr_key *key2) { int res; @@ -518,7 +545,7 @@ int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) if (ret == 0 && schedule_aging) queue_delayed_work(pools->aging_wq, &pool->aging_work, - secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); + secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); return ret; } diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index b144273ee34785623d2254d19f5af40869e00e83..81149ff15e003358b6d060c98fb68120c9a0e8b9 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -54,8 +54,10 @@ struct ib_frmr_pools { const struct ib_frmr_pool_ops *pool_ops; struct workqueue_struct *aging_wq; + u32 aging_period_sec; }; int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, u32 pinned_handles); +int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec); #endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 6cdf6073fdf9c51ee291a63bb86ac690b094aa9f..e22c999d164120ac070b435e92f53c15f976bf5c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -184,6 +184,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff 
*msg, const char *name, @@ -2887,6 +2888,38 @@ static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, return ret; } +static int nldev_frmr_pools_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 aging_period; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err) + return err; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) + return -EINVAL; + + device = ib_device_get_by_index( + sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); + if (!device) + return -EINVAL; + + aging_period = nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + + err = ib_frmr_pools_set_aging_period(device, aging_period); + + ib_device_put(device); + return err; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2997,6 +3030,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_frmr_pools_get_doit, .dump = nldev_frmr_pools_get_dumpit, }, + [RDMA_NLDEV_CMD_FRMR_POOLS_SET] = { + .doit = nldev_frmr_pools_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 8f17ffe0190cb86131109209c45caec155ab36da..f9c295caf2b1625e3636d4279a539d481fdeb4ac 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -310,6 +310,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_FRMR_POOLS_GET, /* can dump */ + RDMA_NLDEV_CMD_FRMR_POOLS_SET, + RDMA_NLDEV_NUM_OPS }; @@ -598,6 +600,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, /* u32 */ RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, /* u32 */ /* * Always the end -- 2.47.1 From: Michael Guralnik Allow users to set through netlink, for a specific FRMR pool, the amount of handles that are not aged, and fill the pool to this amount. This allows users to warm-up the FRMR pools to an expected amount of handles with specific attributes that fits their expected usage. 
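As with the aging-period command, the sketch below (illustration only, not part of this patch) shows how the same RDMA_NLDEV_CMD_FRMR_POOLS_SET request would carry the nested pool key together with the pinned-handles count. Socket setup, send and ack are identical to the earlier aging-period sketch, the key values are placeholders, and OR-ing NLA_F_NESTED into the nest type is a defensive assumption for strict attribute validation.

/* Hypothetical userspace sketch, assuming libnl-3 and this series' uapi;
 * build and send the message as in the aging-period example.
 */
static int fill_frmr_pinned_req(struct nl_msg *msg, uint32_t dev_index,
				uint32_t pinned_handles,
				uint32_t access_flags, uint64_t num_dma_blocks)
{
	struct nlattr *key;

	if (!nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
		       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
					RDMA_NLDEV_CMD_FRMR_POOLS_SET),
		       0, NLM_F_REQUEST | NLM_F_ACK))
		return -1;

	nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, dev_index);
	nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES,
		    pinned_handles);

	/* The nested key identifies the pool to warm up and pin. */
	key = nla_nest_start(msg, NLA_F_NESTED | RDMA_NLDEV_ATTR_FRMR_POOL_KEY);
	if (!key)
		return -1;
	nla_put_u8(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS, 0);
	nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS,
		    access_flags);
	nla_put_u64(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS,
		    num_dma_blocks);
	nla_nest_end(msg, key);

	return 0;
}
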
Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji --- drivers/infiniband/core/frmr_pools.c | 1 + drivers/infiniband/core/nldev.c | 66 +++++++++++++++++++++++++++++++++--- include/uapi/rdma/rdma_netlink.h | 1 + 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index b150bb78de3c4fd89990f7aed7874e4db94eac0a..9a27ff2d9aec20b415c187909ba660a94590b2d7 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -452,6 +452,7 @@ int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, kfree(handles); schedule_aging: + /* Ensure aging is scheduled to adjust to new pinned handles count */ mod_delayed_work(pools->aging_wq, &pool->aging_work, 0); return ret; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index e22c999d164120ac070b435e92f53c15f976bf5c..5c8a4e19fdf8e82e78237d4e6ced9c519613505e 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -185,6 +185,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2692,6 +2693,9 @@ static int fill_frmr_pool_entry(struct sk_buff *msg, struct ib_frmr_pool *pool) if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, pool->in_use, RDMA_NLDEV_ATTR_PAD)) goto err_unlock; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES, + pool->pinned_handles)) + goto err_unlock; spin_unlock(&pool->lock); return 0; @@ -2789,6 +2793,54 @@ static int nldev_frmr_pools_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } +static void nldev_frmr_pools_parse_key(struct nlattr *tb[], + struct ib_frmr_key *key, + struct netlink_ext_ack *extack) +{ + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]) + key->ats = nla_get_u8(tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS]) + key->access_flags = nla_get_u32( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY]) + key->vendor_key = nla_get_u64( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]) + key->num_dma_blocks = nla_get_u64( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]); +} + +static int nldev_frmr_pools_set_pinned(struct ib_device *device, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + struct nlattr *key_tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_frmr_key key = { 0 }; + u32 pinned_handles = 0; + int err = 0; + + pinned_handles = + nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES]); + + if (!tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY]) + return -EINVAL; + + err = nla_parse_nested(key_tb, RDMA_NLDEV_ATTR_MAX - 1, + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY], nldev_policy, + extack); + if (err) + return err; + + nldev_frmr_pools_parse_key(key_tb, &key, extack); + + err = ib_frmr_pools_set_pinned(device, &key, pinned_handles); + + return err; +} + static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2904,18 +2956,22 @@ static int nldev_frmr_pools_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if 
(!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; - if (!tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) - return -EINVAL; - device = ib_device_get_by_index( sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); if (!device) return -EINVAL; - aging_period = nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + if (tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) { + aging_period = nla_get_u32( + tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + err = ib_frmr_pools_set_aging_period(device, aging_period); + goto done; + } - err = ib_frmr_pools_set_aging_period(device, aging_period); + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES]) + err = nldev_frmr_pools_set_pinned(device, tb, extack); +done: ib_device_put(device); return err; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f9c295caf2b1625e3636d4279a539d481fdeb4ac..39178df104f01d19a8135554adece66be881fd15 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -601,6 +601,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES, /* u32 */ /* * Always the end -- 2.47.1
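
For reference, the sketch below (illustration only, not part of any patch in this series) shows the shape of the driver-side contract: a vendor driver supplies the three pool ops, then pops a handle on registration and pushes it back on deregistration. The foo_* names are hypothetical; the ib_frmr_* symbols and the ib_mr frmr fields are the ones added by the series, modeled on the mlx5 conversion above.

/* Hypothetical driver-side sketch of wiring up the FRMR pools API. */
#include <rdma/ib_verbs.h>
#include <rdma/frmr_pools.h>

static int foo_create_frmrs(struct ib_device *device, struct ib_frmr_key *key,
			    u32 *handles, unsigned int count)
{
	/* Create 'count' reusable HW registration objects matching the
	 * unmodifiable properties in 'key' and return their handles
	 * (compare mlx5r_create_mkeys() above). Stubbed here.
	 */
	return -EOPNOTSUPP;
}

static void foo_destroy_frmrs(struct ib_device *device, u32 *handles,
			      unsigned int count)
{
	/* Free the HW objects behind 'handles' (compare mlx5r_destroy_mkeys()). */
}

static int foo_build_frmr_key(struct ib_device *device,
			      const struct ib_frmr_key *in,
			      struct ib_frmr_key *out)
{
	/* Reject keys the HW cannot honor, then keep only the unmodifiable
	 * properties so equivalent requests land in the same pool.
	 */
	*out = *in;
	return 0;
}

static const struct ib_frmr_pool_ops foo_frmr_pool_ops = {
	.create_frmrs = foo_create_frmrs,
	.destroy_frmrs = foo_destroy_frmrs,
	.build_key = foo_build_frmr_key,
};

/* Device init / teardown. */
static int foo_init_frmr_pools(struct ib_device *device)
{
	return ib_frmr_pools_init(device, &foo_frmr_pool_ops);
}

static void foo_cleanup_frmr_pools(struct ib_device *device)
{
	ib_frmr_pools_cleanup(device);
}

/* Memory registration: describe the request, then pop a handle. */
static int foo_reg_mr_from_pool(struct ib_device *device, struct ib_mr *ibmr,
				int access_flags, u64 num_dma_blocks)
{
	ibmr->frmr.key.access_flags = access_flags;
	ibmr->frmr.key.num_dma_blocks = num_dma_blocks;
	return ib_frmr_pool_pop(device, ibmr);	/* sets ibmr->frmr.handle */
}

/* Deregistration: return the handle to its pool for reuse. */
static int foo_dereg_mr_to_pool(struct ib_mr *ibmr)
{
	return ib_frmr_pool_push(ibmr->device, ibmr);
}

The real mlx5 implementation above additionally encodes the access mode in vendor_key and the PH and steering-tag index in kernel_vendor_key, so that these unmodifiable HW properties partition the pools without being exposed as generic key fields.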