The current reset process saves the device's config space state before reset and restores it afterward. However, when a device is in an error state before reset, config space reads may return error values instead of valid data. This results in saving corrupted values that get written back to the device during state restoration. Avoid saving the state of the config space when the device is in error. While restoring we only restore the state that can be restored through kernel data such as BARs or doesn't depend on the saved state. Signed-off-by: Farhan Ali --- drivers/pci/pci.c | 25 ++++++++++++++++++++++--- drivers/pci/pcie/aer.c | 3 +++ drivers/pci/pcie/dpc.c | 3 +++ drivers/pci/pcie/ptm.c | 3 +++ drivers/pci/tph.c | 3 +++ drivers/pci/vc.c | 3 +++ 6 files changed, 37 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cd..a3d93d1baee7 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1720,6 +1720,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev) struct pci_cap_saved_state *save_state; u16 *cap; + if (!dev->state_saved) + return; + /* * Restore max latencies (in the LTR capability) before enabling * LTR itself in PCI_EXP_DEVCTL2. @@ -1775,6 +1778,9 @@ static void pci_restore_pcix_state(struct pci_dev *dev) struct pci_cap_saved_state *save_state; u16 *cap; + if (!dev->state_saved) + return; + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!save_state || !pos) @@ -1792,6 +1798,14 @@ static void pci_restore_pcix_state(struct pci_dev *dev) int pci_save_state(struct pci_dev *dev) { int i; + u32 val; + + pci_read_config_dword(dev, PCI_COMMAND, &val); + if (PCI_POSSIBLE_ERROR(val)) { + pci_warn(dev, "Device config space inaccessible, will only be partially restored\n"); + return -EIO; + } + /* XXX: 100% dword access ok here? */ for (i = 0; i < 16; i++) { pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]); @@ -1854,6 +1868,14 @@ static void pci_restore_config_space_range(struct pci_dev *pdev, static void pci_restore_config_space(struct pci_dev *pdev) { + if (!pdev->state_saved) { + pci_warn(pdev, "No saved config space, restoring BARs\n"); + pci_restore_bars(pdev); + pci_write_config_word(pdev, PCI_COMMAND, + PCI_COMMAND_MEMORY | PCI_COMMAND_IO); + return; + } + if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { pci_restore_config_space_range(pdev, 10, 15, 0, false); /* Restore BARs before the command register. */ @@ -1906,9 +1928,6 @@ static void pci_restore_rebar_state(struct pci_dev *pdev) */ void pci_restore_state(struct pci_dev *dev) { - if (!dev->state_saved) - return; - pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); pci_restore_pri_state(dev); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e286c197d716..e6023ffbf94d 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -361,6 +361,9 @@ void pci_restore_aer_state(struct pci_dev *dev) if (!aer) return; + if (!dev->state_saved) + return; + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); if (!save_state) return; diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index fc18349614d7..e0d7034c2cd8 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -67,6 +67,9 @@ void pci_restore_dpc_state(struct pci_dev *dev) if (!pci_is_pcie(dev)) return; + if (!dev->state_saved) + return; + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); if (!save_state) return; diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 65e4b008be00..78613000acfb 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -112,6 +112,9 @@ void pci_restore_ptm_state(struct pci_dev *dev) if (!ptm) return; + if (!dev->state_saved) + return; + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); if (!save_state) return; diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index cc64f93709a4..c0fa1b9a7a78 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -435,6 +435,9 @@ void pci_restore_tph_state(struct pci_dev *pdev) if (!pdev->tph_enabled) return; + if (!pdev->state_saved) + return; + save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH); if (!save_state) return; diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index a4ff7f5f66dd..00609c7e032a 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -391,6 +391,9 @@ void pci_restore_vc_state(struct pci_dev *dev) { int i; + if (!dev->state_saved) + return; + for (i = 0; i < ARRAY_SIZE(vc_caps); i++) { int pos; struct pci_cap_saved_state *save_state; -- 2.43.0 If a device is in an error state, then any reads of device registers can return error value. Add addtional checks to validate if a device is in an error state before doing an flr reset. Signed-off-by: Farhan Ali --- drivers/pci/pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a3d93d1baee7..327fefc6a1eb 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4576,12 +4576,19 @@ EXPORT_SYMBOL_GPL(pcie_flr); */ int pcie_reset_flr(struct pci_dev *dev, bool probe) { + u32 reg; + if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) return -ENOTTY; if (!(dev->devcap & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; + if (pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, ®)) { + pci_warn(dev, "Device unable to do an FLR\n"); + return -ENOTTY; + } + if (probe) return 0; -- 2.43.0 On s390 systems, which use a machine level hypervisor, PCI devices are always accessed through a form of PCI pass-through which fundamentally operates on a per PCI function granularity. This is also reflected in the s390 PCI hotplug driver which creates hotplug slots for individual PCI functions. Its reset_slot() function, which is a wrapper for zpci_hot_reset_device(), thus also resets individual functions. Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object to multifunction devices. This approach worked fine on s390 systems that only exposed virtual functions as individual PCI domains to the operating system. Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions") s390 supports exposing the topology of multifunction PCI devices by grouping them in a shared PCI domain. When attempting to reset a function through the hotplug driver, the shared slot assignment causes the wrong function to be reset instead of the intended one. It also leaks memory as we do create a pci_slot object for the function, but don't correctly free it in pci_slot_release(). Add a flag for struct pci_slot to allow per function PCI slots for functions managed through a hypervisor, which exposes individual PCI functions while retaining the topology. Fixes: 44510d6fa0c0 ("s390/pci: Handling multifunctions") Cc: Suggested-by: Niklas Schnelle Signed-off-by: Farhan Ali --- drivers/pci/hotplug/s390_pci_hpc.c | 10 ++++++++-- drivers/pci/pci.c | 5 +++-- drivers/pci/slot.c | 14 +++++++++++--- include/linux/pci.h | 1 + 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index d9996516f49e..8b547de464bf 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -126,14 +126,20 @@ static const struct hotplug_slot_ops s390_hotplug_slot_ops = { int zpci_init_slot(struct zpci_dev *zdev) { + int ret; char name[SLOT_NAME_SIZE]; struct zpci_bus *zbus = zdev->zbus; zdev->hotplug_slot.ops = &s390_hotplug_slot_ops; snprintf(name, SLOT_NAME_SIZE, "%08x", zdev->fid); - return pci_hp_register(&zdev->hotplug_slot, zbus->bus, - zdev->devfn, name); + ret = pci_hp_register(&zdev->hotplug_slot, zbus->bus, + zdev->devfn, name); + if (ret) + return ret; + + zdev->hotplug_slot.pci_slot->per_func_slot = 1; + return 0; } void zpci_exit_slot(struct zpci_dev *zdev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 327fefc6a1eb..530a793a332c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5057,8 +5057,9 @@ static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, bool probe) static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe) { - if (dev->multifunction || dev->subordinate || !dev->slot || - dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) + if (dev->subordinate || !dev->slot || + dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || + (dev->multifunction && !dev->slot->per_func_slot)) return -ENOTTY; return pci_reset_hotplug_slot(dev->slot->hotplug, probe); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 50fb3eb595fe..51ee59e14393 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -63,6 +63,14 @@ static ssize_t cur_speed_read_file(struct pci_slot *slot, char *buf) return bus_speed_read(slot->bus->cur_bus_speed, buf); } +static bool pci_dev_matches_slot(struct pci_dev *dev, struct pci_slot *slot) +{ + if (slot->per_func_slot) + return dev->devfn == slot->number; + + return PCI_SLOT(dev->devfn) == slot->number; +} + static void pci_slot_release(struct kobject *kobj) { struct pci_dev *dev; @@ -73,7 +81,7 @@ static void pci_slot_release(struct kobject *kobj) down_read(&pci_bus_sem); list_for_each_entry(dev, &slot->bus->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (pci_dev_matches_slot(dev, slot)) dev->slot = NULL; up_read(&pci_bus_sem); @@ -166,7 +174,7 @@ void pci_dev_assign_slot(struct pci_dev *dev) mutex_lock(&pci_slot_mutex); list_for_each_entry(slot, &dev->bus->slots, list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (pci_dev_matches_slot(dev, slot)) dev->slot = slot; mutex_unlock(&pci_slot_mutex); } @@ -285,7 +293,7 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, down_read(&pci_bus_sem); list_for_each_entry(dev, &parent->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot_nr) + if (pci_dev_matches_slot(dev, slot)) dev->slot = slot; up_read(&pci_bus_sem); diff --git a/include/linux/pci.h b/include/linux/pci.h index 59876de13860..9265f32d9786 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -78,6 +78,7 @@ struct pci_slot { struct list_head list; /* Node in list of slots */ struct hotplug_slot *hotplug; /* Hotplug info (move here) */ unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ + unsigned int per_func_slot:1; /* Allow per function slot */ struct kobject kobj; }; -- 2.43.0 On s390 today we overwrite the PCI BAR resource address to either an artificial cookie address or MIO address. However this address is different from the bus address of the BARs programmed by firmware. The artificial cookie address was created to index into an array of function handles (zpci_iomap_start). The MIO (mapped I/O) addresses are provided by firmware but maybe different from the bus address. This creates an issue when trying to convert the BAR resource address to bus address using the generic pcibios_resource_to_bus(). Implement an architecture specific pcibios_resource_to_bus() function to correctly translate PCI BAR resource addresses to bus addresses for s390. Similarly add architecture specific pcibios_bus_to_resource function to do the reverse translation. Signed-off-by: Farhan Ali --- arch/s390/pci/pci.c | 74 +++++++++++++++++++++++++++++++++++++++ drivers/pci/host-bridge.c | 4 +-- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index cd6676c2d602..3992ba44e1cf 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -264,6 +264,80 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return 0; } +void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, + struct resource *res) +{ + struct zpci_bus *zbus = bus->sysdata; + struct zpci_bar_struct *zbar; + struct zpci_dev *zdev; + + region->start = res->start; + region->end = res->end; + + for (int i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + int j = 0; + + zbar = NULL; + zdev = zbus->function[i]; + if (!zdev) + continue; + + for (j = 0; j < PCI_STD_NUM_BARS; j++) { + if (zdev->bars[j].res->start == res->start && + zdev->bars[j].res->end == res->end && + res->flags & IORESOURCE_MEM) { + zbar = &zdev->bars[j]; + break; + } + } + + if (zbar) { + /* only MMIO is supported */ + region->start = zbar->val & PCI_BASE_ADDRESS_MEM_MASK; + if (zbar->val & PCI_BASE_ADDRESS_MEM_TYPE_64) + region->start |= (u64)zdev->bars[j + 1].val << 32; + + region->end = region->start + (1UL << zbar->size) - 1; + return; + } + } +} + +void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, + struct pci_bus_region *region) +{ + struct zpci_bus *zbus = bus->sysdata; + struct zpci_dev *zdev; + resource_size_t start, end; + + res->start = region->start; + res->end = region->end; + + for (int i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (!zdev || !zdev->has_resources) + continue; + + for (int j = 0; j < PCI_STD_NUM_BARS; j++) { + if (!zdev->bars[j].size) + continue; + + /* only MMIO is supported */ + start = zdev->bars[j].val & PCI_BASE_ADDRESS_MEM_MASK; + if (zdev->bars[j].val & PCI_BASE_ADDRESS_MEM_TYPE_64) + start |= (u64)zdev->bars[j + 1].val << 32; + + end = start + (1UL << zdev->bars[j].size) - 1; + + if (start == region->start && end == region->end) { + res->start = zdev->bars[j].res->start; + res->end = zdev->bars[j].res->end; + return; + } + } + } +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) { diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index afa50b446567..56d62afb3afe 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -48,7 +48,7 @@ void pci_set_host_bridge_release(struct pci_host_bridge *bridge, } EXPORT_SYMBOL_GPL(pci_set_host_bridge_release); -void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, +void __weak pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { struct pci_host_bridge *bridge = pci_find_host_bridge(bus); @@ -73,7 +73,7 @@ static bool region_contains(struct pci_bus_region *region1, return region1->start <= region2->start && region1->end >= region2->end; } -void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, +void __weak pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, struct pci_bus_region *region) { struct pci_host_bridge *bridge = pci_find_host_bridge(bus); -- 2.43.0 Commit c1e18c17bda6 ("s390/pci: add zpci_set_irq()/zpci_clear_irq()"), introduced the zpci_set_irq() and zpci_clear_irq(), to be used while resetting a zPCI device. Commit da995d538d3a ("s390/pci: implement reset_slot for hotplug slot"), mentions zpci_clear_irq() being called in the path for zpci_hot_reset_device(). But that is not the case anymore and these functions are not called outside of this file. Instead zpci_hot_reset_device() relies on zpci_disable_device() also clearing the IRQs, but misses to reset the zdev->irqs_registered flag. However after a CLP disable/enable reset, the device's IRQ are unregistered, but the flag zdev->irq_registered does not get cleared. It creates an inconsistent state and so arch_restore_msi_irqs() doesn't correctly restore the device's IRQ. This becomes a problem when a PCI driver tries to restore the state of the device through pci_restore_state(). Restore IRQ unconditionally for the device and remove the irq_registered flag as its redundant. Signed-off-by: Farhan Ali Reviewed-by: Niklas Schnelle --- arch/s390/include/asm/pci.h | 1 - arch/s390/pci/pci_irq.c | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 41f900f693d9..aed19a1aa9d7 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -145,7 +145,6 @@ struct zpci_dev { u8 has_resources : 1; u8 is_physfn : 1; u8 util_str_avail : 1; - u8 irqs_registered : 1; u8 tid_avail : 1; u8 rtr_avail : 1; /* Relaxed translation allowed */ unsigned int devfn; /* DEVFN part of the RID*/ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 84482a921332..e73be96ce5fe 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -107,9 +107,6 @@ static int zpci_set_irq(struct zpci_dev *zdev) else rc = zpci_set_airq(zdev); - if (!rc) - zdev->irqs_registered = 1; - return rc; } @@ -123,9 +120,6 @@ static int zpci_clear_irq(struct zpci_dev *zdev) else rc = zpci_clear_airq(zdev); - if (!rc) - zdev->irqs_registered = 0; - return rc; } @@ -427,8 +421,7 @@ bool arch_restore_msi_irqs(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); - if (!zdev->irqs_registered) - zpci_set_irq(zdev); + zpci_set_irq(zdev); return true; } -- 2.43.0 We can now have userspace drivers (vfio-pci based) on s390x. The userspace drivers will not have any KVM fd and so no kzdev associated with them. So we need to update the logic for detecting passthrough devices to not depend on struct kvm_zdev. Signed-off-by: Farhan Ali Reviewed-by: Niklas Schnelle --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci_event.c | 14 ++++---------- drivers/vfio/pci/vfio_pci_zdev.c | 9 ++++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index aed19a1aa9d7..f47f62fc3bfd 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -169,6 +169,7 @@ struct zpci_dev { char res_name[16]; bool mio_capable; + bool mediated_recovery; struct zpci_bar_struct bars[PCI_STD_NUM_BARS]; u64 start_dma; /* Start of available DMA addresses */ diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d930416d4c90..541d536be052 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -61,16 +61,10 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) } } -static bool is_passed_through(struct pci_dev *pdev) +static bool needs_mediated_recovery(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); - bool ret; - - mutex_lock(&zdev->kzdev_lock); - ret = !!zdev->kzdev; - mutex_unlock(&zdev->kzdev_lock); - - return ret; + return zdev->mediated_recovery; } static bool is_driver_supported(struct pci_driver *driver) @@ -194,7 +188,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) } pdev->error_state = pci_channel_io_frozen; - if (is_passed_through(pdev)) { + if (needs_mediated_recovery(pdev)) { pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", pci_name(pdev)); status_str = "failed (pass-through)"; @@ -277,7 +271,7 @@ static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) * we will inject the error event and let the guest recover the device * itself. */ - if (is_passed_through(pdev)) + if (needs_mediated_recovery(pdev)) goto out; driver = to_pci_driver(pdev->dev.driver); if (driver && driver->err_handler && driver->err_handler->error_detected) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 0990fdb146b7..a7bc23ce8483 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -148,6 +148,8 @@ int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) if (!zdev) return -ENODEV; + zdev->mediated_recovery = true; + if (!vdev->vdev.kvm) return 0; @@ -161,7 +163,12 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) { struct zpci_dev *zdev = to_zpci(vdev->pdev); - if (!zdev || !vdev->vdev.kvm) + if (!zdev) + return; + + zdev->mediated_recovery = false; + + if (!vdev->vdev.kvm) return; if (zpci_kvm_hook.kvm_unregister) -- 2.43.0 For a passthrough device we need co-operation from user space to recover the device. This would require to bubble up any error information to user space. Let's store this error information for passthrough devices, so it can be retrieved later. Signed-off-by: Farhan Ali --- arch/s390/include/asm/pci.h | 28 ++++++++++ arch/s390/pci/pci.c | 1 + arch/s390/pci/pci_event.c | 95 +++++++++++++++++++------------- drivers/vfio/pci/vfio_pci_zdev.c | 2 + 4 files changed, 88 insertions(+), 38 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index f47f62fc3bfd..40bfe5721109 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -116,6 +116,31 @@ struct zpci_bus { enum pci_bus_speed max_bus_speed; }; +/* Content Code Description for PCI Function Error */ +struct zpci_ccdf_err { + u32 reserved1; + u32 fh; /* function handle */ + u32 fid; /* function id */ + u32 ett : 4; /* expected table type */ + u32 mvn : 12; /* MSI vector number */ + u32 dmaas : 8; /* DMA address space */ + u32 reserved2 : 6; + u32 q : 1; /* event qualifier */ + u32 rw : 1; /* read/write */ + u64 faddr; /* failing address */ + u32 reserved3; + u16 reserved4; + u16 pec; /* PCI event code */ +} __packed; + +#define ZPCI_ERR_PENDING_MAX 4 +struct zpci_ccdf_pending { + u8 count; + u8 head; + u8 tail; + struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX]; +}; + /* Private data per function */ struct zpci_dev { struct zpci_bus *zbus; @@ -191,6 +216,8 @@ struct zpci_dev { struct iommu_domain *s390_domain; /* attached IOMMU domain */ struct kvm_zdev *kzdev; struct mutex kzdev_lock; + struct zpci_ccdf_pending pending_errs; + struct mutex pending_errs_lock; spinlock_t dom_lock; /* protect s390_domain change */ }; @@ -316,6 +343,7 @@ void zpci_debug_exit_device(struct zpci_dev *); int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); int zpci_clear_error_state(struct zpci_dev *zdev); int zpci_reset_load_store_blocked(struct zpci_dev *zdev); +void zpci_cleanup_pending_errors(struct zpci_dev *zdev); #ifdef CONFIG_NUMA diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 3992ba44e1cf..759dfd9e0fb8 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -897,6 +897,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) mutex_init(&zdev->state_lock); mutex_init(&zdev->fmb_lock); mutex_init(&zdev->kzdev_lock); + mutex_init(&zdev->pending_errs_lock); return zdev; diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 541d536be052..9f396a2847ab 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -18,23 +18,6 @@ #include "pci_bus.h" #include "pci_report.h" -/* Content Code Description for PCI Function Error */ -struct zpci_ccdf_err { - u32 reserved1; - u32 fh; /* function handle */ - u32 fid; /* function id */ - u32 ett : 4; /* expected table type */ - u32 mvn : 12; /* MSI vector number */ - u32 dmaas : 8; /* DMA address space */ - u32 : 6; - u32 q : 1; /* event qualifier */ - u32 rw : 1; /* read/write */ - u64 faddr; /* failing address */ - u32 reserved3; - u16 reserved4; - u16 pec; /* PCI event code */ -} __packed; - /* Content Code Description for PCI Function Availability */ struct zpci_ccdf_avail { u32 reserved1; @@ -76,6 +59,41 @@ static bool is_driver_supported(struct pci_driver *driver) return true; } +static void zpci_store_pci_error(struct pci_dev *pdev, + struct zpci_ccdf_err *ccdf) +{ + struct zpci_dev *zdev = to_zpci(pdev); + int i; + + mutex_lock(&zdev->pending_errs_lock); + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) { + pr_err("%s: Maximum number (%d) of pending error events queued", + pci_name(pdev), ZPCI_ERR_PENDING_MAX); + mutex_unlock(&zdev->pending_errs_lock); + return; + } + + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX; + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err)); + zdev->pending_errs.tail++; + zdev->pending_errs.count++; + mutex_unlock(&zdev->pending_errs_lock); +} + +void zpci_cleanup_pending_errors(struct zpci_dev *zdev) +{ + struct pci_dev *pdev = NULL; + + mutex_lock(&zdev->pending_errs_lock); + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + if (zdev->pending_errs.count) + pr_info("%s: Unhandled PCI error events count=%d", + pci_name(pdev), zdev->pending_errs.count); + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending)); + mutex_unlock(&zdev->pending_errs_lock); +} +EXPORT_SYMBOL_GPL(zpci_cleanup_pending_errors); + static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, struct pci_driver *driver) { @@ -169,7 +187,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, * and the platform determines which functions are affected for * multi-function devices. */ -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev, + struct zpci_ccdf_err *ccdf) { pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; struct zpci_dev *zdev = to_zpci(pdev); @@ -188,13 +207,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) } pdev->error_state = pci_channel_io_frozen; - if (needs_mediated_recovery(pdev)) { - pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", - pci_name(pdev)); - status_str = "failed (pass-through)"; - goto out_unlock; - } - driver = to_pci_driver(pdev->dev.driver); if (!is_driver_supported(driver)) { if (!driver) { @@ -210,12 +222,23 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) goto out_unlock; } + if (needs_mediated_recovery(pdev)) + zpci_store_pci_error(pdev, ccdf); + ers_res = zpci_event_notify_error_detected(pdev, driver); if (ers_result_indicates_abort(ers_res)) { status_str = "failed (abort on detection)"; goto out_unlock; } + if (needs_mediated_recovery(pdev)) { + pr_info("%s: Leaving recovery of pass-through device to user-space\n", + pci_name(pdev)); + ers_res = PCI_ERS_RESULT_RECOVERED; + status_str = "in progress"; + goto out_unlock; + } + if (ers_res != PCI_ERS_RESULT_NEED_RESET) { ers_res = zpci_event_do_error_state_clear(pdev, driver); if (ers_result_indicates_abort(ers_res)) { @@ -258,25 +281,20 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) * @pdev: PCI function for which to report * @es: PCI channel failure state to report */ -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es, + struct zpci_ccdf_err *ccdf) { struct pci_driver *driver; pci_dev_lock(pdev); pdev->error_state = es; - /** - * While vfio-pci's error_detected callback notifies user-space QEMU - * reacts to this by freezing the guest. In an s390 environment PCI - * errors are rarely fatal so this is overkill. Instead in the future - * we will inject the error event and let the guest recover the device - * itself. - */ + if (needs_mediated_recovery(pdev)) - goto out; + zpci_store_pci_error(pdev, ccdf); driver = to_pci_driver(pdev->dev.driver); if (driver && driver->err_handler && driver->err_handler->error_detected) driver->err_handler->error_detected(pdev, pdev->error_state); -out: + pci_dev_unlock(pdev); } @@ -322,12 +340,13 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) break; case 0x0040: /* Service Action or Error Recovery Failed */ case 0x003b: - zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf); break; default: /* PCI function left in the error state attempt to recover */ - ers_res = zpci_event_attempt_error_recovery(pdev); + ers_res = zpci_event_attempt_error_recovery(pdev, ccdf); if (ers_res != PCI_ERS_RESULT_RECOVERED) - zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, + ccdf); break; } pci_dev_put(pdev); diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index a7bc23ce8483..2be37eab9279 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -168,6 +168,8 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) zdev->mediated_recovery = false; + zpci_cleanup_pending_errors(zdev); + if (!vdev->vdev.kvm) return; -- 2.43.0 For zPCI devices, we have platform specific error information. The platform firmware provides this error information to the operating system in an architecture specific mechanism. To enable recovery from userspace for these devices, we want to expose this error information to userspace. Add a new device feature to expose this information. Signed-off-by: Farhan Ali --- drivers/vfio/pci/vfio_pci_core.c | 2 ++ drivers/vfio/pci/vfio_pci_priv.h | 8 ++++++++ drivers/vfio/pci/vfio_pci_zdev.c | 34 ++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 15 ++++++++++++++ 4 files changed, 59 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..378adb3226db 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1514,6 +1514,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_ZPCI_ERROR: + return vfio_pci_zdev_feature_err(device, flags, arg, argsz); default: return -ENOTTY; } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..808d92aab76b 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -86,6 +86,8 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps); int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); +int vfio_pci_zdev_feature_err(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz); #else static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) @@ -100,6 +102,12 @@ static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) {} + +static int vfio_pci_zdev_feature_err(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + return -ENODEV; +} #endif static inline bool vfio_pci_is_vga(struct pci_dev *pdev) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 2be37eab9279..261954039aa9 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -141,6 +141,40 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, return ret; } +int vfio_pci_zdev_feature_err(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_device_feature_zpci_err err; + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + struct zpci_dev *zdev = to_zpci(vdev->pdev); + int ret; + int head = 0; + + if (!zdev) + return -ENODEV; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(err)); + if (ret != 1) + return ret; + + mutex_lock(&zdev->pending_errs_lock); + if (zdev->pending_errs.count) { + head = zdev->pending_errs.head % ZPCI_ERR_PENDING_MAX; + err.pec = zdev->pending_errs.err[head].pec; + zdev->pending_errs.head++; + zdev->pending_errs.count--; + err.pending_errors = zdev->pending_errs.count; + } + mutex_unlock(&zdev->pending_errs_lock); + + if (copy_to_user(arg, &err, sizeof(err))) + return -EFAULT; + + return 0; +} + int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) { struct zpci_dev *zdev = to_zpci(vdev->pdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009ba..d72177bc3961 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1478,6 +1478,21 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * VFIO_DEVICE_FEATURE_ZPCI_ERROR feature provides PCI error information to + * userspace for vfio-pci devices on s390x. On s390x PCI error recovery involves + * platform firmware and notification to operating system is done by + * architecture specific mechanism. Exposing this information to userspace + * allows userspace to take appropriate actions to handle an error on the + * device. + */ +struct vfio_device_feature_zpci_err { + __u16 pec; + __u8 pending_errors; + __u8 pad; +}; +#define VFIO_DEVICE_FEATURE_ZPCI_ERROR 11 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- 2.43.0 On error recovery for a PCI device bound to vfio-pci driver, we want to recover the state of the device to its last known saved state. The callback restores the state of the device to its initial saved state. Signed-off-by: Farhan Ali --- drivers/vfio/pci/vfio_pci_core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 378adb3226db..f2fcb81b3e69 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2241,6 +2241,17 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, } EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected); +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + + if (!vdev->pci_saved_state) + return; + + pci_load_saved_state(pdev, vdev->pci_saved_state); + pci_restore_state(pdev); +} + int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, int nr_virtfn) { @@ -2305,6 +2316,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); const struct pci_error_handlers vfio_pci_core_err_handlers = { .error_detected = vfio_pci_core_aer_err_detected, + .reset_done = vfio_pci_core_aer_reset_done, }; EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); -- 2.43.0 We are configuring the error signaling on the vast majority of devices and it's extremely rare that it fires anyway. This allows userspace to be notified on errors for legacy PCI devices. The Internal Share Memory (ISM) device on s390x is one such device. For PCI devices on IBM s390x error recovery involves platform firmware and notification to operating system is done by architecture specific way. So the ISM device can still be recovered when notified of an error. Signed-off-by: Farhan Ali --- drivers/vfio/pci/vfio_pci_core.c | 6 ++---- drivers/vfio/pci/vfio_pci_intrs.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f2fcb81b3e69..d125471fd5ea 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -749,8 +749,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_typ return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { - if (pci_is_pcie(vdev->pdev)) - return 1; + return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { return 1; } @@ -1150,8 +1149,7 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; + break; fallthrough; default: return -EINVAL; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 123298a4dc8f..f2d13b6eb28f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -838,8 +838,7 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, case VFIO_PCI_ERR_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_TRIGGER: - if (pci_is_pcie(vdev->pdev)) - func = vfio_pci_set_err_trigger; + func = vfio_pci_set_err_trigger; break; } break; -- 2.43.0