Similar to live migration, loading a VM from some saved state (aka a
snapshot) is also an event that calls for clock adjustments in the
guest. However, guests might want to take further actions in response
to such events, e.g. discarding UUIDs, resetting network connections,
reseeding entropy pools, etc. These are actions that guests don't
typically take during live migration, so add a new field to struct
vmclock_abi, vm_generation_counter, which informs the guest about such
events.

Signed-off-by: Babis Chalios
---
 include/uapi/linux/vmclock-abi.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 2d99b29ac44a..fbf1c5928273 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -115,6 +115,12 @@ struct vmclock_abi {
 	 * bit again after the update, using the about-to-be-valid fields.
 	 */
 #define VMCLOCK_FLAG_TIME_MONOTONIC	(1 << 7)
+	/*
+	 * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will
+	 * bump the vm_generation_counter field every time the guest is
+	 * loaded from some saved state (restored from a snapshot).
+	 */
+#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT	(1 << 8)
 
 	__u8 pad[2];
 	__u8 clock_status;
@@ -177,6 +183,19 @@ struct vmclock_abi {
 	__le64 time_frac_sec;	/* Units of 1/2^64 of a second */
 	__le64 time_esterror_nanosec;
 	__le64 time_maxerror_nanosec;
+
+	/*
+	 * This field changes to a new, non-repeating value when the VM
+	 * is loaded from a snapshot. This event typically represents a
+	 * "jump" forward in time, so in this case too the guest needs
+	 * to discard any calibration against external sources. Loading
+	 * a snapshot also has different semantics than other VM events
+	 * such as live migration: apart from re-adjusting guest clocks,
+	 * guest user space might want to discard UUIDs, reset network
+	 * connections or reseed entropy pools, etc. We therefore use a
+	 * dedicated marker for such events.
+	 */
+	__le64 vm_generation_counter;
 };
 
 #endif /* __VMCLOCK_ABI_H__ */
-- 
2.34.1
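As an illustration of how guest user space might consume the new field,
here is a minimal sketch (illustration only, not part of the patch). It
assumes the driver's miscdev is exposed as /dev/vmclock and that the
uapi header above is installed; with the poll() support added in the
next patch, the periodic check could instead block on the device.

/*
 * Hypothetical guest-side consumer; the /dev/vmclock node name and
 * this usage pattern are assumptions, not part of the patch.
 */
#include <endian.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

#include <linux/vmclock-abi.h>

int main(void)
{
	const struct vmclock_abi *clk;
	uint64_t gen;
	int fd;

	fd = open("/dev/vmclock", O_RDONLY);
	if (fd < 0)
		return 1;

	/* The driver only permits read-only shared mappings. */
	clk = mmap(NULL, sizeof(*clk), PROT_READ, MAP_SHARED, fd, 0);
	if (clk == MAP_FAILED)
		return 1;

	/*
	 * A real consumer should first verify that the hypervisor sets
	 * VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT in clk->flags.
	 */
	gen = le64toh(clk->vm_generation_counter);
	for (;;) {
		uint64_t now = le64toh(clk->vm_generation_counter);

		if (now != gen) {
			/* Restored from a snapshot: discard UUIDs, reset
			 * network connections, reseed entropy pools, etc. */
			gen = now;
		}
		sleep(1);
	}
}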
VMClock now expects the hypervisor to send a device notification every
time the seq_count field changes to a new (even) value. Moreover, add
support for poll() in VMClock as a means to propagate this notification
to user space.

poll() will notify listeners every time seq_count has changed to a new
(even) value since the last time read() (or open()) was called on the
device. This means that when poll() returns a (POLLIN) event, listeners
need to use read() to observe what has changed and to update their view
of seq_count. In other words, once poll() has returned, all subsequent
calls to poll() will return immediately with a POLLIN event until the
listener calls read().

Signed-off-by: Babis Chalios
---
 drivers/ptp/ptp_vmclock.c | 85 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 5 deletions(-)

diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index b3a83b03d9c1..efcdcc5c40cf 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -5,6 +5,9 @@
  * Copyright © 2024 Amazon.com, Inc. or its affiliates.
  */
 
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/wait.h>
 #include <linux/acpi.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -39,6 +42,7 @@ struct vmclock_state {
 	struct resource res;
 	struct vmclock_abi *clk;
 	struct miscdevice miscdev;
+	wait_queue_head_t disrupt_wait;
 	struct ptp_clock_info ptp_clock_info;
 	struct ptp_clock *ptp_clock;
 	enum clocksource_ids cs_id, sys_cs_id;
@@ -357,10 +361,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
 	return ptp_clock_register(&st->ptp_clock_info, dev);
 }
 
+struct vmclock_file_state {
+	struct vmclock_state *st;
+	uint32_t seq;
+};
+
 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
 
 	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
 		return -EROFS;
@@ -379,8 +388,9 @@
 
 static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 				    size_t count, loff_t *ppos)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
 	size_t max_count;
 	uint32_t seq;
@@ -402,8 +412,10 @@
 
 		/* Pairs with hypervisor wmb */
 		virt_rmb();
-		if (seq == le32_to_cpu(st->clk->seq_count))
+		if (seq == le32_to_cpu(st->clk->seq_count)) {
+			fst->seq = seq;
 			break;
+		}
 
 		if (ktime_after(ktime_get(), deadline))
 			return -ETIMEDOUT;
@@ -413,10 +425,51 @@
 	return count;
 }
 
+static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
+{
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	uint32_t seq;
+
+	poll_wait(fp, &st->disrupt_wait, wait);
+
+	seq = le32_to_cpu(st->clk->seq_count);
+	if (fst->seq != seq)
+		return EPOLLIN | EPOLLRDNORM;
+
+	return 0;
+}
+
+static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL);
+
+	if (!fst)
+		return -ENOMEM;
+
+	fst->st = st;
+	fst->seq = le32_to_cpu(st->clk->seq_count);
+
+	fp->private_data = fst;
+
+	return 0;
+}
+
+static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
+{
+	kfree(fp->private_data);
+	return 0;
+}
+
 static const struct file_operations vmclock_miscdev_fops = {
 	.owner = THIS_MODULE,
+	.open = vmclock_miscdev_open,
+	.release = vmclock_miscdev_release,
 	.mmap = vmclock_miscdev_mmap,
 	.read = vmclock_miscdev_read,
+	.poll = vmclock_miscdev_poll,
 };
 
 /* module operations */
@@ -459,6 +512,16 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data
 	return AE_ERROR;
 }
 
+static void
+vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
+				  u32 __always_unused event, void *dev)
+{
+	struct device *device = dev;
+	struct vmclock_state *st = dev_get_drvdata(device);
+
+	wake_up_interruptible(&st->disrupt_wait);
+}
+
 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
 {
 	struct acpi_device *adev = ACPI_COMPANION(dev);
@@ -479,6 +542,14 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
 		return -ENODEV;
 	}
 
+	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+					     vmclock_acpi_notification_handler,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "failed to install notification handler\n");
+		return -ENODEV;
+	}
+
 	return 0;
 }
 
@@ -549,6 +620,8 @@ static int vmclock_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	init_waitqueue_head(&st->disrupt_wait);
+
 	/*
 	 * If the structure is big enough, it can be mapped to userspace.
 	 * Theoretically a guest OS even using larger pages could still
@@ -581,6 +654,8 @@ static int vmclock_probe(struct platform_device *pdev)
 		return -ENODEV;
 	}
 
+	dev_set_drvdata(dev, st);
+
 	dev_info(dev, "%s: registered %s%s%s\n", st->name,
 		 st->miscdev.minor ? "miscdev" : "",
 		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
-- 
2.34.1
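For reference, here is a minimal user-space sketch of the listener flow
described in the commit message (illustration only, not part of the
patch; the /dev/vmclock node name is an assumption). The listener
blocks in poll() and uses read() to observe the change, which also
re-arms the notification for this file descriptor:

/*
 * Hypothetical listener sketch. poll() keeps reporting POLLIN after a
 * disruption until read() refreshes this descriptor's view of
 * seq_count.
 */
#include <endian.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

#include <linux/vmclock-abi.h>

int main(void)
{
	struct vmclock_abi clk;
	struct pollfd pfd = { .events = POLLIN };

	pfd.fd = open("/dev/vmclock", O_RDONLY);
	if (pfd.fd < 0)
		return 1;

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			return 1;

		if (pfd.revents & POLLIN) {
			/*
			 * read() both fetches a consistent copy of the
			 * structure and acknowledges the notification,
			 * re-arming poll() for the next disruption.
			 */
			if (pread(pfd.fd, &clk, sizeof(clk), 0) < 0)
				return 1;
			printf("clock disrupted, seq_count now %u\n",
			       le32toh(clk.seq_count));
		}
	}
}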