Similar to live migration, loading a VM from some saved state (aka snapshot) is also an event that calls for clock adjustments in the guest. However, guests might want to take more actions as a response to such events, e.g. as discarding UUIDs, resetting network connections, reseeding entropy pools, etc. These are actions that guests don't typically take during live migration, so add a new field in the vmclock_abi called vm_generation_counter which informs the guest about such events. Hypervisor advertises support for vm_generation_counter through the VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the presence of this bit in vmclock_abi flags field before using this flag. Signed-off-by: Babis Chalios Reviewed-by: David Woodhouse --- include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h index 2d99b29ac44a..937fe00e4f33 100644 --- a/include/uapi/linux/vmclock-abi.h +++ b/include/uapi/linux/vmclock-abi.h @@ -115,6 +115,12 @@ struct vmclock_abi { * bit again after the update, using the about-to-be-valid fields. */ #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + /* + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will + * bump the vm_generation_counter field every time the guest is + * loaded from some save state (restored from a snapshot). + */ +#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) __u8 pad[2]; __u8 clock_status; @@ -177,6 +183,15 @@ struct vmclock_abi { __le64 time_frac_sec; /* Units of 1/2^64 of a second */ __le64 time_esterror_nanosec; __le64 time_maxerror_nanosec; + + /* + * This field changes to another non-repeating value when the guest + * has been loaded from a snapshot. In addition to handling a + * disruption in time (which will also be signalled through the + * disruption_marker field), a guest may wish to discard UUIDs, + * reset network connections, reseed entropy, etc. + */ + __le64 vm_generation_counter; }; #endif /* __VMCLOCK_ABI_H__ */ -- 2.34.1 Add optional support for device notifications in VMClock. When supported, the hypervisor will send a device notification every time it updates the seq_count to a new even value. Moreover, add support for poll() in VMClock as a means to propagate this notification to user space. poll() will return a POLLIN event to listeners every time seq_count changes to a value different than the one last seen (since open() or last read()/pread()). This means that when poll() returns a POLLIN event, listeners need to use read() to observe what has changed and update the reader's view of seq_count. In other words, after a poll() returned, all subsequent calls to poll() will immediately return with a POLLIN event until the listener calls read(). The device advertises support for the notification mechanism by setting flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If the flag is not present the driver won't setup the ACPI notification handler and poll() will always immediately return POLLHUP. Signed-off-by: Babis Chalios Reviewed-by: David Woodhouse --- drivers/ptp/ptp_vmclock.c | 126 +++++++++++++++++++++++++++++-- include/uapi/linux/vmclock-abi.h | 5 ++ 2 files changed, 124 insertions(+), 7 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index b3a83b03d9c1..a203386c4b09 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -5,6 +5,9 @@ * Copyright © 2024 Amazon.com, Inc. or its affiliates. */ +#include "linux/poll.h" +#include "linux/types.h" +#include "linux/wait.h" #include #include #include @@ -39,6 +42,7 @@ struct vmclock_state { struct resource res; struct vmclock_abi *clk; struct miscdevice miscdev; + wait_queue_head_t disrupt_wait; struct ptp_clock_info ptp_clock_info; struct ptp_clock *ptp_clock; enum clocksource_ids cs_id, sys_cs_id; @@ -357,10 +361,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev, return ptp_clock_register(&st->ptp_clock_info, dev); } +struct vmclock_file_state { + struct vmclock_state *st; + atomic_t seq; +}; + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) { - struct vmclock_state *st = container_of(fp->private_data, - struct vmclock_state, miscdev); + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) return -EROFS; @@ -379,11 +388,12 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { - struct vmclock_state *st = container_of(fp->private_data, - struct vmclock_state, miscdev); + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); size_t max_count; - uint32_t seq; + uint32_t seq, old_seq; if (*ppos >= PAGE_SIZE) return 0; @@ -392,6 +402,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, if (count > max_count) count = max_count; + old_seq = atomic_read(&fst->seq); while (1) { seq = le32_to_cpu(st->clk->seq_count) & ~1U; /* Pairs with hypervisor wmb */ @@ -402,8 +413,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, /* Pairs with hypervisor wmb */ virt_rmb(); - if (seq == le32_to_cpu(st->clk->seq_count)) - break; + if (seq == le32_to_cpu(st->clk->seq_count)) { + /* + * Either we updated fst->seq to seq (the latest version we observed) + * or someone else did (old_seq == seq), so we can break. + */ + if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || + old_seq == seq) { + break; + } + } if (ktime_after(ktime_get(), deadline)) return -ETIMEDOUT; @@ -413,10 +432,58 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, return count; } +static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) +{ + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; + uint32_t seq; + + /* + * Hypervisor will not send us any notifications, so fail immediately + * to avoid having caller sleeping for ever. + */ + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) + return POLLHUP; + + poll_wait(fp, &st->disrupt_wait, wait); + + seq = le32_to_cpu(st->clk->seq_count); + if (atomic_read(&fst->seq) != seq) + return POLLIN | POLLRDNORM; + + return 0; +} + +static int vmclock_miscdev_open(struct inode *inode, struct file *fp) +{ + struct vmclock_state *st = container_of(fp->private_data, + struct vmclock_state, miscdev); + struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); + + if (!fst) + return -ENOMEM; + + fst->st = st; + atomic_set(&fst->seq, 0); + + fp->private_data = fst; + + return 0; +} + +static int vmclock_miscdev_release(struct inode *inode, struct file *fp) +{ + kfree(fp->private_data); + return 0; +} + static const struct file_operations vmclock_miscdev_fops = { .owner = THIS_MODULE, + .open = vmclock_miscdev_open, + .release = vmclock_miscdev_release, .mmap = vmclock_miscdev_mmap, .read = vmclock_miscdev_read, + .poll = vmclock_miscdev_poll, }; /* module operations */ @@ -459,6 +526,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data return AE_ERROR; } +static void +vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + u32 __always_unused event, void *dev) +{ + struct device *device = dev; + struct vmclock_state *st = device->driver_data; + + wake_up_interruptible(&st->disrupt_wait); +} + +static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) +{ + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; + + /* + * This should never happen as this function is only called when + * has_acpi_companion(dev) is true, but the logic is sufficiently + * complex that Coverity can't see the tautology. + */ + if (!adev) + return -ENODEV; + + /* The device does not support notifications. Nothing else to do */ + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) + return 0; + + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); + if (ACPI_FAILURE(status)) { + dev_err(dev, "failed to install notification handler"); + return -ENODEV; + } + + return 0; +} + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) { struct acpi_device *adev = ACPI_COMPANION(dev); @@ -549,6 +654,11 @@ static int vmclock_probe(struct platform_device *pdev) if (ret) return ret; + init_waitqueue_head(&st->disrupt_wait); + ret = vmclock_setup_notification(dev, st); + if (ret) + return ret; + /* * If the structure is big enough, it can be mapped to userspace. * Theoretically a guest OS even using larger pages could still @@ -581,6 +691,8 @@ static int vmclock_probe(struct platform_device *pdev) return -ENODEV; } + dev->driver_data = st; + dev_info(dev, "%s: registered %s%s%s\n", st->name, st->miscdev.minor ? "miscdev" : "", (st->miscdev.minor && st->ptp_clock) ? ", " : "", diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h index 937fe00e4f33..d320623b0118 100644 --- a/include/uapi/linux/vmclock-abi.h +++ b/include/uapi/linux/vmclock-abi.h @@ -121,6 +121,11 @@ struct vmclock_abi { * loaded from some save state (restored from a snapshot). */ #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + /* + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send + * a notification every time it updates seq_count to a new even number. + */ +#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) __u8 pad[2]; __u8 clock_status; -- 2.34.1 From: David Woodhouse The vmclock device provides a PTP clock source and precise timekeeping across live migration and snapshot/restore operations. The binding has a required memory region containing the vmclock_abi structure and an optional interrupt for clock disruption notifications. The full spec is at https://uapi-group.org/specifications/specs/vmclock/ Signed-off-by: David Woodhouse Signed-off-by: Babis Chalios Reviewed-by: Krzysztof Kozlowski --- .../bindings/ptp/amazon,vmclock.yaml | 46 +++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 47 insertions(+) create mode 100644 Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml new file mode 100644 index 000000000000..357790df876f --- /dev/null +++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Virtual Machine Clock + +maintainers: + - David Woodhouse + +description: + The vmclock device provides a precise clock source and allows for + accurate timekeeping across live migration and snapshot/restore + operations. The full specification of the shared data structure is + available at https://uapi-group.org/specifications/specs/vmclock/ + +properties: + compatible: + const: amazon,vmclock + + reg: + description: + Specifies the shared memory region containing the vmclock_abi structure. + maxItems: 1 + + interrupts: + description: + Interrupt used to notify when the contents of the vmclock_abi structure + have been updated. + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + #include + ptp@80000000 { + compatible = "amazon,vmclock"; + reg = <0x80000000 0x1000>; + interrupts = ; + }; diff --git a/MAINTAINERS b/MAINTAINERS index e8f06145fb54..171813ea76a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20728,6 +20728,7 @@ PTP VMCLOCK SUPPORT M: David Woodhouse L: netdev@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml F: drivers/ptp/ptp_vmclock.c F: include/uapi/linux/vmclock-abi.h -- 2.34.1 From: David Woodhouse Add device tree support to the ptp_vmclock driver, allowing it to probe via device tree in addition to ACPI. Handle optional interrupt for clock disruption notifications, mirroring the ACPI notification behaviour. Signed-off-by: David Woodhouse Signed-off-by: Babis Chalios --- drivers/ptp/ptp_vmclock.c | 71 ++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index a203386c4b09..831668efabf9 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -14,10 +14,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -536,7 +538,7 @@ vmclock_acpi_notification_handler(acpi_handle __always_unused handle, wake_up_interruptible(&st->disrupt_wait); } -static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) +static int vmclock_setup_acpi_notification(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); acpi_status status; @@ -549,10 +551,6 @@ static int vmclock_setup_notification(struct device *dev, struct vmclock_state * if (!adev) return -ENODEV; - /* The device does not support notifications. Nothing else to do */ - if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) - return 0; - status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, vmclock_acpi_notification_handler, dev); @@ -587,6 +585,57 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) return 0; } +static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st) +{ + struct vmclock_state *st = _st; + + wake_up_interruptible(&st->disrupt_wait); + return IRQ_HANDLED; +} + +static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) +{ + struct platform_device *pdev = to_platform_device(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + st->res = *res; + + return 0; +} + +static int vmclock_setup_of_notification(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + int irq; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, + "vmclock", dev->driver_data); +} + +static int vmclock_setup_notification(struct device *dev, + struct vmclock_state *st) +{ + /* The device does not support notifications. Nothing else to do */ + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) + return 0; + + if (has_acpi_companion(dev)) { + return vmclock_setup_acpi_notification(dev); + } else { + return vmclock_setup_of_notification(dev); + } + +} + + static void vmclock_put_idx(void *data) { struct vmclock_state *st = data; @@ -607,7 +656,7 @@ static int vmclock_probe(struct platform_device *pdev) if (has_acpi_companion(dev)) ret = vmclock_probe_acpi(dev, st); else - ret = -EINVAL; /* Only ACPI for now */ + ret = vmclock_probe_dt(dev, st); if (ret) { dev_info(dev, "Failed to obtain physical address: %d\n", ret); @@ -655,6 +704,7 @@ static int vmclock_probe(struct platform_device *pdev) return ret; init_waitqueue_head(&st->disrupt_wait); + dev->driver_data = st; ret = vmclock_setup_notification(dev, st); if (ret) return ret; @@ -691,8 +741,6 @@ static int vmclock_probe(struct platform_device *pdev) return -ENODEV; } - dev->driver_data = st; - dev_info(dev, "%s: registered %s%s%s\n", st->name, st->miscdev.minor ? "miscdev" : "", (st->miscdev.minor && st->ptp_clock) ? ", " : "", @@ -707,11 +755,18 @@ static const struct acpi_device_id vmclock_acpi_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); +static const struct of_device_id vmclock_of_ids[] = { + { .compatible = "amazon,vmclock", }, + { }, +}; +MODULE_DEVICE_TABLE(of, vmclock_of_ids); + static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, + .of_match_table = vmclock_of_ids, }, }; -- 2.34.1 From: David Woodhouse As we finalised the spec, we spotted that vmgenid actually says that the _HID is supposed to be hypervisor-specific. Although in the 13 years since the original vmgenid doc was published, nobody seems to have cared about using _HID to distinguish between implementations on different hypervisors, and we only ever use the _CID. For consistency, match the _CID of "VMCLOCK" too. Signed-off-by: David Woodhouse Signed-off-by: Babis Chalios --- drivers/ptp/ptp_vmclock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 831668efabf9..ef945e8248ee 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -751,6 +751,7 @@ static int vmclock_probe(struct platform_device *pdev) static const struct acpi_device_id vmclock_acpi_ids[] = { { "AMZNC10C", 0 }, + { "VMCLOCK", 0 }, {} }; MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); -- 2.34.1 From: David Woodhouse Signed-off-by: David Woodhouse Signed-off-by: Babis Chalios --- drivers/ptp/Kconfig | 2 +- drivers/ptp/ptp_vmclock.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 5f8ea34d11d6..b93640ca08b7 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -134,7 +134,7 @@ config PTP_1588_CLOCK_KVM config PTP_1588_CLOCK_VMCLOCK tristate "Virtual machine PTP clock" depends on X86_TSC || ARM_ARCH_TIMER - depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 + depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 default PTP_1588_CLOCK_KVM help This driver adds support for using a virtual precision clock diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index ef945e8248ee..7f342e5a6a92 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -501,6 +501,7 @@ static void vmclock_remove(void *data) misc_deregister(&st->miscdev); } +#ifdef CONFIG_ACPI static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) { struct vmclock_state *st = data; @@ -584,6 +585,7 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) return 0; } +#endif /* CONFIG_ACPI */ static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st) { @@ -627,12 +629,11 @@ static int vmclock_setup_notification(struct device *dev, if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) return 0; - if (has_acpi_companion(dev)) { +#ifdef CONFIG_ACPI + if (has_acpi_companion(dev)) return vmclock_setup_acpi_notification(dev); - } else { - return vmclock_setup_of_notification(dev); - } - +#endif + return vmclock_setup_of_notification(dev); } @@ -653,9 +654,11 @@ static int vmclock_probe(struct platform_device *pdev) if (!st) return -ENOMEM; +#ifdef CONFIG_ACPI if (has_acpi_companion(dev)) ret = vmclock_probe_acpi(dev, st); else +#endif ret = vmclock_probe_dt(dev, st); if (ret) { -- 2.34.1 From: David Woodhouse To output UTC would involve complex calculations about whether the time elapsed since the reference time has crossed the end of the month when a leap second takes effect. I've prototyped that, but it made me sad. Much better to report TAI, which is what PHCs should do anyway. And much much simpler. Signed-off-by: David Woodhouse Signed-off-by: Babis Chalios --- drivers/ptp/ptp_vmclock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 7f342e5a6a92..deab3205601b 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -82,13 +82,13 @@ static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) { - if (likely(clk->time_type == VMCLOCK_TIME_UTC)) + if (clk->time_type == VMCLOCK_TIME_TAI) return true; - if (clk->time_type == VMCLOCK_TIME_TAI && + if (clk->time_type == VMCLOCK_TIME_UTC && (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { if (sec) - *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); + *sec -= (int16_t)le16_to_cpu(clk->tai_offset_sec); return true; } return false; @@ -349,9 +349,9 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev, return NULL; } - /* Only UTC, or TAI with offset */ + /* Accept TAI directly, or UTC with valid offset for conversion to TAI */ if (!tai_adjust(st->clk, NULL)) { - dev_info(dev, "vmclock does not provide unambiguous UTC\n"); + dev_info(dev, "vmclock does not provide unambiguous time\n"); return NULL; } -- 2.34.1