When DRBD's metadata device resides on persistent memory (PMEM/NVDIMM),
accessing it by reading and writing full blocks is unnecessarily costly.
Add a DAX-based metadata path that directly maps the metadata region,
enabling byte-granular, IRQ-safe access without having to go through the
block layer.

The PMEM path also introduces a more efficient activity log layout:
instead of writing journal transactions, the in-memory LRU-cache hash
table is stored directly in persistent memory and updated in-place.
Similarly, the resync bitmap is accessed directly from PMEM rather than
being loaded into and flushed from DRAM.

This is compiled in only when CONFIG_DEV_DAX_PMEM is enabled.

Co-developed-by: Philipp Reisner
Signed-off-by: Philipp Reisner
Co-developed-by: Lars Ellenberg
Signed-off-by: Lars Ellenberg
Co-developed-by: Joel Colledge
Signed-off-by: Joel Colledge
Co-developed-by: Christoph Böhmwalder
Signed-off-by: Christoph Böhmwalder
---
 drivers/block/drbd/Makefile        |   1 +
 drivers/block/drbd/drbd_dax_pmem.c | 221 +++++++++++++++++++++++++++++
 drivers/block/drbd/drbd_dax_pmem.h |  89 ++++++++++++
 3 files changed, 311 insertions(+)
 create mode 100644 drivers/block/drbd/drbd_dax_pmem.c
 create mode 100644 drivers/block/drbd/drbd_dax_pmem.h

diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 7f2655a206aa..4b58eb83fc22 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -5,6 +5,7 @@ drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
 drbd-y += drbd_transport.o
+drbd-$(CONFIG_DEV_DAX_PMEM) += drbd_dax_pmem.o
 drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_dax_pmem.c b/drivers/block/drbd/drbd_dax_pmem.c
new file mode 100644
index 000000000000..6f29dfd763a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_dax_pmem.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+   drbd_dax_pmem.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2017, LINBIT HA-Solutions GmbH.
+
+
+ */
+
+/*
+   In case DRBD's meta-data resides in persistent memory do a few things
+   differently.
+
+   1 Directly access the bitmap in place. Do not load it into DRAM, do not
+     write it back from DRAM.
+   2 Use a better fitting format for the on-disk activity log. Instead of
+     writing transactions, the unmangled LRU-cache hash table is there.
+*/
+
+#include <linux/blkdev.h>
+#include <linux/dax.h>
+#include <linux/libnvdimm.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include "drbd_int.h"
+#include "drbd_dax_pmem.h"
+#include "drbd_meta_data.h"
+
+/*
+ * Translate a sector on the metadata block device into a page offset on its
+ * dax device. dax_direct_access() addresses the whole dax device, whereas
+ * metadata sectors are relative to md_bdev, which may be a partition.
+ */
+static pgoff_t drbd_dax_pgoff(struct drbd_backing_dev *bdev, sector_t sector)
+{
+	return (get_start_sect(bdev->md_bdev) + sector) >> (PAGE_SHIFT - SECTOR_SHIFT);
+}
+
+/*
+ * Map the page holding the metadata superblock (sector md.md_offset) and
+ * store its kernel address in bdev->md_on_pmem.
+ *
+ * Return: 0 on success, -EIO if dax_direct_access() fails or maps less
+ * than one page.
+ */
+static int map_superblock_for_dax(struct drbd_backing_dev *bdev, struct dax_device *dax_dev)
+{
+	long want = 1;
+	pgoff_t pgoff = drbd_dax_pgoff(bdev, bdev->md.md_offset);
+	void *kaddr;
+	long len;
+	int id;
+
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, want, DAX_ACCESS, &kaddr, NULL);
+	dax_read_unlock(id);
+
+	/* also catches a negative errno from dax_direct_access() */
+	if (len < want)
+		return -EIO;
+
+	bdev->md_on_pmem = kaddr;
+
+	return 0;
+}
+
+/**
+ * drbd_dax_open() - Open device for dax and map metadata superblock
+ * @bdev: backing device to be opened
+ *
+ * On success the dax device reference is stored in @bdev->dax_dev; on
+ * failure it is dropped again.
+ *
+ * Return: 0 on success, -ENODEV if no dax device was found for
+ * @bdev->md_bdev, -EIO if the superblock could not be mapped.
+ */
+int drbd_dax_open(struct drbd_backing_dev *bdev)
+{
+	struct dax_device *dax_dev;
+	int err;
+	u64 part_off; /* mandatory out parameter; see drbd_dax_pgoff() */
+
+	dax_dev = fs_dax_get_by_bdev(bdev->md_bdev, &part_off, NULL, NULL);
+	if (!dax_dev)
+		return -ENODEV;
+
+	err = map_superblock_for_dax(bdev, dax_dev);
+	if (!err)
+		bdev->dax_dev = dax_dev;
+	else
+		put_dax(dax_dev);
+
+	return err;
+}
+
+/**
+ * drbd_dax_close() - Drop the dax device reference held in @bdev->dax_dev
+ * @bdev: backing device to be closed
+ */
+void drbd_dax_close(struct drbd_backing_dev *bdev)
+{
+	put_dax(bdev->dax_dev);
+}
+
+/**
+ * drbd_dax_map() - Map metadata for dax
+ * @bdev: backing device whose metadata is to be mapped
+ *
+ * Maps the pages from drbd_md_first_sector() through drbd_md_last_sector()
+ * and points @bdev->md_on_pmem and @bdev->al_on_pmem into that mapping.
+ *
+ * Return: 0 on success, -EIO if the range could not be mapped completely.
+ */
+int drbd_dax_map(struct drbd_backing_dev *bdev)
+{
+	struct dax_device *dax_dev = bdev->dax_dev;
+	sector_t first_sector = drbd_md_first_sector(bdev);
+	sector_t al_sector = bdev->md.md_offset + bdev->md.al_offset;
+	long want = (drbd_md_last_sector(bdev) + 1 - first_sector) >> (PAGE_SHIFT - SECTOR_SHIFT);
+	pgoff_t pgoff = drbd_dax_pgoff(bdev, first_sector);
+	long md_offset_byte = (bdev->md.md_offset - first_sector) << SECTOR_SHIFT;
+	long al_offset_byte = (al_sector - first_sector) << SECTOR_SHIFT;
+	void *kaddr;
+	long len;
+	int id;
+
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, want, DAX_ACCESS, &kaddr, NULL);
+	dax_read_unlock(id);
+
+	if (len < want)
+		return -EIO;
+
+	bdev->md_on_pmem = kaddr + md_offset_byte;
+	bdev->al_on_pmem = kaddr + al_offset_byte;
+
+	return 0;
+}
+
+/**
+ * drbd_dax_al_update() - Store one activity log element in its PMEM slot
+ * @device: DRBD device
+ * @al_ext: element whose lc_new_number is written to slot lc_index
+ *
+ * The slot is written back to persistent memory before returning.
+ */
+void drbd_dax_al_update(struct drbd_device *device, struct lc_element *al_ext)
+{
+	struct al_on_pmem *al_on_pmem = device->ldev->al_on_pmem;
+	__be32 *slot = &al_on_pmem->slots[al_ext->lc_index];
+
+	*slot = cpu_to_be32(al_ext->lc_new_number);
+	arch_wb_cache_pmem(slot, sizeof(*slot));
+}
+
+/**
+ * drbd_dax_al_begin_io_commit() - Write pending activity log changes to PMEM
+ * @device: DRBD device
+ *
+ * With al_lock held, updates the PMEM slot of every element on the
+ * to_be_changed list and then calls lc_committed() on the activity log.
+ */
+void drbd_dax_al_begin_io_commit(struct drbd_device *device)
+{
+	struct lc_element *e;
+
+	spin_lock_irq(&device->al_lock);
+
+	list_for_each_entry(e, &device->act_log->to_be_changed, list)
+		drbd_dax_al_update(device, e);
+
+	lc_committed(device->act_log);
+
+	spin_unlock_irq(&device->al_lock);
+}
+
+/**
+ * drbd_dax_al_initialize() - Write a complete activity log image to PMEM
+ * @device: DRBD device
+ *
+ * Return: always 0.
+ */
+int drbd_dax_al_initialize(struct drbd_device *device)
+{
+	struct al_on_pmem *al_on_pmem = device->ldev->al_on_pmem;
+	__be32 *slots = al_on_pmem->slots;
+	int i, al_slots = (device->ldev->md.al_size_4k << (12 - 2)) - 1;
+
+	al_on_pmem->magic = cpu_to_be32(DRBD_AL_PMEM_MAGIC);
+	/* initialize all slots rather than just the configured number in case
+	 * the configuration is later changed */
+	for (i = 0; i < al_slots; i++) {
+		unsigned int extent_nr = i < device->act_log->nr_elements ?
+			lc_element_by_index(device->act_log, i)->lc_number :
+			LC_FREE;
+		slots[i] = cpu_to_be32(extent_nr);
+	}
+
+	/* write back magic and all slots, as drbd_dax_al_update() does for a
+	 * single slot */
+	arch_wb_cache_pmem(al_on_pmem, (char *)&slots[al_slots] - (char *)al_on_pmem);
+
+	return 0;
+}
+
+/**
+ * drbd_dax_bitmap() - Address of the bitmap inside the mapped metadata
+ * @device: DRBD device
+ * @want: unused
+ *
+ * Return: @device->ldev->md_on_pmem offset by md.bm_offset sectors.
+ */
+void *drbd_dax_bitmap(struct drbd_device *device, unsigned long want)
+{
+	struct drbd_backing_dev *bdev = device->ldev;
+	unsigned char *md_on_pmem = (unsigned char *)bdev->md_on_pmem;
+
+	return md_on_pmem + (long)bdev->md.bm_offset * SECTOR_SIZE;
+}
diff --git a/drivers/block/drbd/drbd_dax_pmem.h b/drivers/block/drbd/drbd_dax_pmem.h
new file mode 100644
index 000000000000..9a929969ff27
--- /dev/null
+++ b/drivers/block/drbd/drbd_dax_pmem.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef DRBD_DAX_H
+#define DRBD_DAX_H
+
+#include <linux/errno.h>
+#include <linux/kconfig.h>
+#include <linux/types.h>
+
+struct drbd_backing_dev;
+struct drbd_device;
+struct lc_element;
+struct meta_data_on_disk_9;
+
+#if IS_ENABLED(CONFIG_DEV_DAX_PMEM)
+
+int drbd_dax_open(struct drbd_backing_dev *bdev);
+void drbd_dax_close(struct drbd_backing_dev *bdev);
+int drbd_dax_map(struct drbd_backing_dev *bdev);
+void drbd_dax_al_update(struct drbd_device *device, struct lc_element *al_ext);
+void drbd_dax_al_begin_io_commit(struct drbd_device *device);
+int drbd_dax_al_initialize(struct drbd_device *device);
+void *drbd_dax_bitmap(struct drbd_device *device, unsigned long want);
+
+/* True if @bdev->dax_dev is set; drbd_dax_open() sets it on success. */
+static inline bool drbd_md_dax_active(struct drbd_backing_dev *bdev)
+{
+	return bdev->dax_dev != NULL;
+}
+
+/* Returns @bdev->md_on_pmem typed as the on-disk metadata structure. */
+static inline struct meta_data_on_disk_9 *drbd_dax_md_addr(struct drbd_backing_dev *bdev)
+{
+	return bdev->md_on_pmem;
+}
+
+#else
+
+/*
+ * Without CONFIG_DEV_DAX_PMEM the DAX path is compiled out. The stubs are
+ * typed inline functions so that callers build the same way in both
+ * configurations, whether or not they use the return value.
+ */
+static inline int drbd_dax_open(struct drbd_backing_dev *bdev)
+{
+	return -ENODEV;
+}
+
+static inline void drbd_dax_close(struct drbd_backing_dev *bdev)
+{
+}
+
+static inline int drbd_dax_map(struct drbd_backing_dev *bdev)
+{
+	return -ENOTSUPP;
+}
+
+static inline void drbd_dax_al_update(struct drbd_device *device, struct lc_element *al_ext)
+{
+}
+
+static inline void drbd_dax_al_begin_io_commit(struct drbd_device *device)
+{
+}
+
+static inline int drbd_dax_al_initialize(struct drbd_device *device)
+{
+	return -EIO;
+}
+
+static inline void *drbd_dax_bitmap(struct drbd_device *device, unsigned long want)
+{
+	return NULL;
+}
+
+static inline bool drbd_md_dax_active(struct drbd_backing_dev *bdev)
+{
+	return false;
+}
+
+static inline struct meta_data_on_disk_9 *drbd_dax_md_addr(struct drbd_backing_dev *bdev)
+{
+	return NULL;
+}
+
+#define arch_wb_cache_pmem(A, L) do { } while (0)
+
+#endif /* IS_ENABLED(CONFIG_DEV_DAX_PMEM) */
+
+#endif /* DRBD_DAX_H */
-- 
2.53.0