The next patch wants to use this constant, share it. Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 16 +++------------- include/uapi/linux/pci_regs.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 060ebe330ee163..2a47ddb01799c1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1408,16 +1408,6 @@ EXPORT_SYMBOL_GPL(iommu_group_id); static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, unsigned long *devfns); -/* - * To consider a PCI device isolated, we require ACS to support Source - * Validation, Request Redirection, Completer Redirection, and Upstream - * Forwarding. This effectively means that devices cannot spoof their - * requester ID, requests and completions cannot be redirected, and all - * transactions are forwarded upstream, even as it passes through a - * bridge where the target device is downstream. - */ -#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF) - /* * For multifunction devices which are not isolated from each other, find * all the other non-isolated functions and look for existing groups. 
For @@ -1430,13 +1420,13 @@ static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev, struct pci_dev *tmp = NULL; struct iommu_group *group; - if (!pdev->multifunction || pci_acs_enabled(pdev, REQ_ACS_FLAGS)) + if (!pdev->multifunction || pci_acs_enabled(pdev, PCI_ACS_ISOLATED)) return NULL; for_each_pci_dev(tmp) { if (tmp == pdev || tmp->bus != pdev->bus || PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) || - pci_acs_enabled(tmp, REQ_ACS_FLAGS)) + pci_acs_enabled(tmp, PCI_ACS_ISOLATED)) continue; group = get_pci_alias_group(tmp, devfns); @@ -1580,7 +1570,7 @@ struct iommu_group *pci_device_group(struct device *dev) if (!bus->self) continue; - if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS)) + if (pci_acs_path_enabled(bus->self, NULL, PCI_ACS_ISOLATED)) break; pdev = bus->self; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de607d..6095e7d7d4cc48 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1009,6 +1009,16 @@ #define PCI_ACS_CTRL 0x06 /* ACS Control Register */ #define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */ +/* + * To consider a PCI device isolated, we require ACS to support Source + * Validation, Request Redirection, Completer Redirection, and Upstream + * Forwarding. This effectively means that devices cannot spoof their + * requester ID, requests and completions cannot be redirected, and all + * transactions are forwarded upstream, even as it passes through a + * bridge where the target device is downstream. + */ +#define PCI_ACS_ISOLATED (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF) + /* SATA capability */ #define PCI_SATA_REGS 4 /* SATA REGs specifier */ #define PCI_SATA_REGS_MASK 0xF /* location - BAR#/inline */ -- 2.43.0 Prepare to move the calculation of the bus P2P isolation out of the iommu code and into the PCI core. 
This allows using the faster list iteration under the pci_bus_sem, and the code has a kinship with the logic in pci_for_each_dma_alias(). Bus isolation is the concept that drives the iommu_groups for the purposes of VFIO. Stated simply, if device A can send traffic to device B then they must be in the same group. Only PCIe provides isolation. The multi-drop electrical topology in classical PCI allows any bus member to claim the transaction. In PCIe isolation comes out of ACS. If a PCIe Switch and Root Complex has ACS flags that prevent peer to peer traffic and funnel all operations to the IOMMU then devices can be isolated. Multi-function devices also have an isolation concern with self loopback between the functions, though pci_bus_isolated() does not deal with devices. As a property of a bus, there are several positive cases: - The point to point "bus" on a physical PCIe link is isolated if the bridge/root device has something preventing self-access to its own MMIO. - A Root Port is usually isolated - A PCIe switch can be isolated if all its Downstream Ports have good ACS flags pci_bus_isolated() implements these rules and returns an enum indicating the level of isolation the bus has, with five possibilities: PCIE_ISOLATED: Traffic on this PCIE bus cannot do any P2P. PCIE_SWITCH_DSP_NON_ISOLATED: The bus is the internal bus of a PCIE switch and the USP is isolated but the DSPs are not. PCIE_NON_ISOLATED: The PCIe bus has no isolation between the bridge or any downstream devices. PCI_BUS_NON_ISOLATED: It is a PCI/PCI-X bus but the bridge is PCIe, has no aliases and the bridge is isolated from the bus. PCI_BRIDGE_NON_ISOLATED: It is a PCI/PCI-X bus and has no isolation, the bridge is part of the group. The calculation is done per-bus, so it is possible for a transaction from a PCI device to travel through different bus isolation types on its way upstream. 
PCIE_SWITCH_DSP_NON_ISOLATED/PCI_BUS_NON_ISOLATED and PCIE_NON_ISOLATED/PCI_BRIDGE_NON_ISOLATED are the same for the purposes of creating iommu groups. The distinction between PCIe and PCI allows for easier understanding and debugging as to why the groups are chosen. For the iommu groups if all busses on the upstream path are PCIE_ISOLATED then the end device has a chance to have a single-device iommu_group. Once any non-isolated bus segment is found that bus segment will have an iommu_group that captures all downstream devices, and sometimes the upstream bridge. pci_bus_isolated() is principally about isolation, but there is an overlap with grouping requirements for legacy PCI aliasing. For purely legacy PCI environments pci_bus_isolated() returns PCI_BRIDGE_NON_ISOLATED for everything and all devices within a hierarchy are in one group. No need to worry about bridge aliasing. Signed-off-by: Jason Gunthorpe --- drivers/pci/search.c | 174 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 31 ++++++++ 2 files changed, 205 insertions(+) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 53840634fbfc2b..fe6c07e67cb8ce 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -113,6 +113,180 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, return ret; } +static enum pci_bus_isolation pcie_switch_isolated(struct pci_bus *bus) +{ + struct pci_dev *pdev; + + /* + * Within a PCIe switch we have an interior bus that has the Upstream + * port as the bridge and a set of Downstream port bridging to the + * egress ports. + * + * Each DSP has an ACS setting which controls where its traffic is + * permitted to go. Any DSP with a permissive ACS setting can send + * traffic flowing upstream back downstream through another DSP. + * + * Thus any non-permissive DSP spoils the whole bus. 
+ */ + guard(rwsem_read)(&pci_bus_sem); + list_for_each_entry(pdev, &bus->devices, bus_list) { + /* Don't understand what this is, be conservative */ + if (!pci_is_pcie(pdev) || + pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM || + pdev->dma_alias_mask) + return PCIE_NON_ISOLATED; + + if (!pci_acs_enabled(pdev, PCI_ACS_ISOLATED)) + return PCIE_SWITCH_DSP_NON_ISOLATED; + } + return PCIE_ISOLATED; +} + +static bool pci_has_mmio(struct pci_dev *pdev) +{ + unsigned int i; + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *res = pci_resource_n(pdev, i); + + if (resource_size(res) && resource_type(res) == IORESOURCE_MEM) + return true; + } + return false; +} + +/** + * pci_bus_isolated - Determine how isolated connected devices are + * @bus: The bus to check + * + * Isolation is the ability of devices to talk to each other. Full isolation + * means that a device can only communicate with the IOMMU and can not do peer + * to peer within the fabric. + * + * We consider isolation on a bus by bus basis. If the bus will permit a + * transaction originated downstream to complete on anything other than the + * IOMMU then the bus is not isolated. + * + * Non-isolation includes all the downstream devices on this bus, and it may + * include the upstream bridge or port that is creating this bus. + * + * The various cases are returned in an enum. + * + * Broadly speaking this function evaluates the ACS settings in a PCI switch to + * determine if a PCI switch is configured to have full isolation. + * + * Old PCI/PCI-X busses cannot have isolation due to their physical properties, + * but they do have some aliasing properties that effect group creation. + * + * pci_bus_isolated() does not consider loopback internal to devices, like + * multi-function devices performing a self-loopback. The caller must check + * this separately. It does not considering alasing within the bus. 
+ * + * It does not currently support the ACS P2P Egress Control Vector, Linux does + * not yet have any way to enable this feature. EC will create subsets of the + * bus that are isolated from other subsets. + */ +enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus) +{ + struct pci_dev *bridge = bus->self; + int type; + + /* + * This bus was created by pci_register_host_bridge(). The spec provides + * no way to tell what kind of bus this is, for PCIe we expect this to + * be internal to the root complex and not covered by any spec behavior. + * Linux has historically been optimistic about this bus and treated it + * as isolating. Given that the behavior of the root complex and the ACS + * behavior of RCiEP's is explicitly not specified we hope that the + * implementation is directing everything that reaches the root bus to + * the IOMMU. + */ + if (pci_is_root_bus(bus)) + return PCIE_ISOLATED; + + /* + * bus->self is only NULL for SRIOV VFs, it represents a "virtual" bus + * within Linux to hold any bus numbers consumed by VF RIDs. Caller must + * use pci_physfn() to get the bus for calling this function. + */ + if (WARN_ON(!bridge)) + return PCI_BRIDGE_NON_ISOLATED; + + /* + * The bridge is not a PCIe bridge therefore this bus is PCI/PCI-X. + * + * PCI does not have anything like ACS. Any down stream device can bus + * master an address that any other downstream device can claim. No + * isolation is possible. + */ + if (!pci_is_pcie(bridge)) { + if (bridge->dev_flags & PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS) + type = PCI_EXP_TYPE_PCI_BRIDGE; + else + return PCI_BRIDGE_NON_ISOLATED; + } else { + type = pci_pcie_type(bridge); + } + + switch (type) { + /* + * Since PCIe links are point to point root ports are isolated if there + * is no internal loopback to the root port's MMIO. Like MFDs assume if + * there is no ACS cap then there is no loopback. 
+ */ + case PCI_EXP_TYPE_ROOT_PORT: + if (bridge->acs_cap && + !pci_acs_enabled(bridge, PCI_ACS_ISOLATED)) + return PCIE_NON_ISOLATED; + return PCIE_ISOLATED; + + /* + * Since PCIe links are point to point a DSP is always considered + * isolated. The internal bus of the switch will be non-isolated if the + * DSP's have any ACS that allows upstream traffic to flow back + * downstream to any DSP, including back to this DSP or its MMIO. + */ + case PCI_EXP_TYPE_DOWNSTREAM: + return PCIE_ISOLATED; + + /* + * bus is the interior bus of a PCI-E switch where ACS rules apply. + */ + case PCI_EXP_TYPE_UPSTREAM: + return pcie_switch_isolated(bus); + + /* + * PCIe to PCI/PCI-X - this bus is PCI. + */ + case PCI_EXP_TYPE_PCI_BRIDGE: + /* + * A PCIe express bridge will use the subordinate bus number + * with a 0 devfn as the RID in some cases. This causes all + * subordinate devfns to alias with 0, which is the same + * grouping as PCI_BUS_NON_ISOLATED. The RID of the bridge + * itself is only used by the bridge. + * + * However, if the bridge has MMIO then we will assume the MMIO + * is not isolated due to no ACS controls on this bridge type. + */ + if (pci_has_mmio(bridge)) + return PCI_BRIDGE_NON_ISOLATED; + return PCI_BUS_NON_ISOLATED; + + /* + * PCI/PCI-X to PCIe - this bus is PCIe. We already know there must be a + * PCI bus upstream of this bus, so just return non-isolated. If + * upstream is PCI-X the PCIe RID should be preserved, but for PCI the + * RID will be lost. 
+ */ + case PCI_EXP_TYPE_PCIE_BRIDGE: + return PCI_BRIDGE_NON_ISOLATED; + + default: + return PCI_BRIDGE_NON_ISOLATED; + } +} + static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr) { struct pci_bus *child; diff --git a/include/linux/pci.h b/include/linux/pci.h index 59876de13860db..c36fff9d2254f8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -855,6 +855,32 @@ struct pci_dynids { struct list_head list; /* For IDs added at runtime */ }; +enum pci_bus_isolation { + /* + * The bus is off a root port and the root port has isolated ACS flags + * or the bus is part of a PCIe switch and the switch has isolated ACS + * flags. + */ + PCIE_ISOLATED, + /* + * The switch's DSP's are not isolated from each other but are isolated + * from the USP. + */ + PCIE_SWITCH_DSP_NON_ISOLATED, + /* The above and the USP's MMIO is not isolated. */ + PCIE_NON_ISOLATED, + /* + * A PCI/PCI-X bus, no isolation. This is like + * PCIE_SWITCH_DSP_NON_ISOLATED in that the upstream bridge is isolated + * from the bus. The bus itself may also have a shared alias of devfn=0. + */ + PCI_BUS_NON_ISOLATED, + /* + * The above and the bridge's MMIO is not isolated and the bridge's RID + * may be an alias. + */ + PCI_BRIDGE_NON_ISOLATED, +}; /* * PCI Error Recovery System (PCI-ERS). 
If a PCI device driver provides @@ -1243,6 +1269,8 @@ struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus, struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from); struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from); +enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus); + int pci_dev_present(const struct pci_device_id *ids); int pci_bus_read_config_byte(struct pci_bus *bus, unsigned int devfn, @@ -2056,6 +2084,9 @@ static inline struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from) { return NULL; } +static inline enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus) +{ return PCIE_NON_ISOLATED; } + static inline int pci_dev_present(const struct pci_device_id *ids) { return 0; } -- 2.43.0 The current algorithm does not work if ACS is turned off, and it is not clear how this has been missed for so long. I think it has been avoided because the kernel command line options to target specific devices and disable ACS are rarely used. For discussion lets consider a simple topology like the below: -- DSP 02:00.0 -> End Point A Root 00:00.0 -> USP 01:00.0 --| -- DSP 02:03.0 -> End Point B If ACS is fully activated we expect 00:00.0, 01:00.0, 02:00.0, 02:03.0, A, B to all have unique single device groups. If both DSPs have ACS off then we expect 00:00.0 and 01:00.0 to have unique single device groups while 02:00.0, 02:03.0, A, B are part of one multi-device group. If the DSPs have asymmetric ACS, with one fully isolating and one non-isolating we also expect the above multi-device group result. Instead the current algorithm always creates unique single device groups for this topology. It happens because the pci_device_group(DSP) immediately moves to the USP and computes pci_acs_path_enabled(USP) == true and decides the DSP can get a unique group. The pci_device_group(A) immediately moves to the DSP, sees pci_acs_path_enabled(DSP) == false and then takes the DSPs group. 
For root-ports a PCIe topology like: -- Dev 01:00.0 Root 00:00.00 --- Root Port 00:01.0 --| | -- Dev 01:00.1 |- Dev 00:17.0 Previously would group [00:01.0, 01:00.0, 01:00.1] together if there is no ACS capability in the root port. While ACS on root ports is underspecified in the spec, it should still function as an egress control and limit access to either the MMIO of the root port itself, or perhaps some other devices upstream of the root complex - 00:17.0 perhaps in this example. Historically the grouping in Linux has assumed the root port routes all traffic into the TA/IOMMU and never bypasses the TA to go to other functions in the root complex. Following the new understanding that ACS is required for internal loopback also treat root ports with no ACS capability as lacking internal loopback as well. The current algorithm has several issues: 1) It implicitly depends on ordering. Since the existing group discovery only goes in the upstream direction discovering a downstream device before its upstream will cause the wrong creation of narrower groups. 2) It assumes that if the path from the end point to the root is entirely ACS isolated then that end point is isolated. This misses cross-traffic in the asymmetric ACS case. 3) When evaluating a non-isolated DSP it does not check peer DSPs for an already established group unless the multi-function feature does it. 4) It does not understand the aliasing rule for PCIe to PCI bridges where the alias is to the subordinate bus. The bridge's RID on the primary bus is not aliased. This causes the PCIe to PCI bridge to be wrongly joined to the group with the downstream devices. As grouping is a security property for VFIO creating incorrectly narrowed groups is a security problem for the system. Revise the design to solve these problems. Explicitly require ordering, or return EPROBE_DEFER if things are out of order. This avoids silent errors that created smaller groups and solves problem #1. Work on busses, not devices. 
Isolation is a property of the bus, and the first non-isolated bus should form a group containing all devices downstream of that bus. If all busses on the path to an end device are isolated then the end device has a chance to make a single-device group. Use pci_bus_isolated() to compute the bus's isolation status based on the ACS flags and technology. pci_bus_isolated() touches a lot of PCI internals to get the information in the right format. Add a new flag in the iommu_group to record that the group contains a non-isolated bus. Any downstream pci_device_group() will see bus->self->iommu_group is non-isolated and unconditionally join it. This makes the first non-isolation apply to all downstream devices and solves problem #2. The bus's non-isolated iommu_group will be stored in either the DSP of a PCIe switch or the bus->self upstream device, depending on the situation. When storing in the DSP all the DSPs are checked first for a pre-existing non-isolated iommu_group. When stored in the upstream the flag forces it to all downstreams. This solves problem #3. Put the handling of end-device aliases and MFD into pci_get_function_group() and only call it in cases where we have a fully isolated path. Otherwise every downstream device on the bus is going to be joined to the group of bus->self. Finally, replace the initial pci_for_each_dma_alias() with a combination of: - Directly checking pci_real_dma_dev() and enforcing ordering. The group should contain both pdev and pci_real_dma_dev(pdev) which is only possible if pdev is ordered after real_dma_dev. This solves a case of #1. - Indirectly relying on pci_bus_isolated() to report legacy PCI busses as non-isolated, with the enum including the distinction of the PCIe to PCI bridge being isolated from the downstream. This solves problem #4. It is very likely this is going to expand iommu_group membership in existing systems. After all that is the security bug that is being fixed. 
Expanding the iommu_groups risks problems for users using VFIO. The intention is to have a more accurate reflection of the security properties in the system and should be seen as a security fix. However people who have ACS disabled may now need to enable it. As such users may have had good reason for ACS to be disabled I strongly recommend that backporting of this also include the new config_acs option so that such users can potentially minimally enable ACS only where needed. Fixes: 104a1c13ac66 ("iommu/core: Create central IOMMU group lookup/creation interface") Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 279 ++++++++++++++++++++++++++++++++---------- include/linux/pci.h | 3 + 2 files changed, 217 insertions(+), 65 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2a47ddb01799c1..1874bbdc73b75e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -65,8 +65,16 @@ struct iommu_group { struct list_head entry; unsigned int owner_cnt; void *owner; + + /* Used by the device_group() callbacks */ + u32 bus_data; }; +/* + * Everything downstream of this group should share it. + */ +#define BUS_DATA_PCI_NON_ISOLATED BIT(0) + struct group_device { struct list_head list; struct device *dev; @@ -1484,25 +1492,6 @@ static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, return NULL; } -struct group_for_pci_data { - struct pci_dev *pdev; - struct iommu_group *group; -}; - -/* - * DMA alias iterator callback, return the last seen device. Stop and return - * the IOMMU group if we find one along the way. - */ -static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque) -{ - struct group_for_pci_data *data = opaque; - - data->pdev = pdev; - data->group = iommu_group_get(&pdev->dev); - - return data->group != NULL; -} - /* * Generic device_group call-back function. It just allocates one * iommu-group per device. 
@@ -1534,57 +1523,31 @@ struct iommu_group *generic_single_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(generic_single_device_group); -/* - * Use standard PCI bus topology, isolation features, and DMA alias quirks - * to find or create an IOMMU group for a device. - */ -struct iommu_group *pci_device_group(struct device *dev) +static struct iommu_group *pci_group_alloc_non_isolated(void) { - struct pci_dev *pdev = to_pci_dev(dev); - struct group_for_pci_data data; - struct pci_bus *bus; - struct iommu_group *group = NULL; - u64 devfns[4] = { 0 }; + struct iommu_group *group; - if (WARN_ON(!dev_is_pci(dev))) - return ERR_PTR(-EINVAL); + group = iommu_group_alloc(); + if (IS_ERR(group)) + return group; + group->bus_data |= BUS_DATA_PCI_NON_ISOLATED; + return group; +} - /* - * Find the upstream DMA alias for the device. A device must not - * be aliased due to topology in order to have its own IOMMU group. - * If we find an alias along the way that already belongs to a - * group, use it. - */ - if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data)) - return data.group; - - pdev = data.pdev; - - /* - * Continue upstream from the point of minimum IOMMU granularity - * due to aliases to the point where devices are protected from - * peer-to-peer DMA by PCI ACS. Again, if we find an existing - * group, use it. - */ - for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { - if (!bus->self) - continue; - - if (pci_acs_path_enabled(bus->self, NULL, PCI_ACS_ISOLATED)) - break; - - pdev = bus->self; - - group = iommu_group_get(&pdev->dev); - if (group) - return group; - } +/* + * Return a group if the function has isolation restrictions related to + * aliases or MFD ACS. + */ +static struct iommu_group *pci_get_function_group(struct pci_dev *pdev) +{ + struct iommu_group *group; + DECLARE_BITMAP(devfns, 256) = {}; /* * Look for existing groups on device aliases. If we alias another * device or another device aliases us, use the same group. 
*/ - group = get_pci_alias_group(pdev, (unsigned long *)devfns); + group = get_pci_alias_group(pdev, devfns); if (group) return group; @@ -1593,12 +1556,198 @@ struct iommu_group *pci_device_group(struct device *dev) * slot and aliases of those funcions, if any. No need to clear * the search bitmap, the tested devfns are still valid. */ - group = get_pci_function_alias_group(pdev, (unsigned long *)devfns); + group = get_pci_function_alias_group(pdev, devfns); if (group) return group; - /* No shared group found, allocate new */ - return iommu_group_alloc(); + /* + * When MFD's are included in the set due to ACS we assume that if ACS + * permits an internal loopback between functions it also permits the + * loopback to go downstream if a function is a bridge. + * + * It is less clear what aliases mean when applied to a bridge. For now + * be conservative and also propagate the group downstream. + */ + __clear_bit(pdev->devfn & 0xFF, devfns); + if (!bitmap_empty(devfns, sizeof(devfns) * BITS_PER_BYTE)) + return pci_group_alloc_non_isolated(); + return NULL; +} + +/* Return a group if the upstream hierarchy has isolation restrictions. */ +static struct iommu_group *pci_hierarchy_group(struct pci_dev *pdev) +{ + /* + * SRIOV functions may reside on a virtual bus, jump directly to the PFs + * bus in all cases. + */ + struct pci_bus *bus = pci_physfn(pdev)->bus; + struct iommu_group *group; + + /* Nothing upstream of this */ + if (pci_is_root_bus(bus)) + return NULL; + + /* + * !self is only for SRIOV virtual busses which should have been + * excluded by pci_physfn() + */ + if (WARN_ON(!bus->self)) + return ERR_PTR(-EINVAL); + + group = iommu_group_get(&bus->self->dev); + if (!group) { + /* + * If the upstream bridge needs the same group as pdev then + * there is no way for it's pci_device_group() to discover it. 
+ */ + dev_err(&pdev->dev, + "PCI device is probing out of order, upstream bridge device of %s is not probed yet\n", + pci_name(bus->self)); + return ERR_PTR(-EPROBE_DEFER); + } + if (group->bus_data & BUS_DATA_PCI_NON_ISOLATED) + return group; + iommu_group_put(group); + return NULL; +} + +/* + * For legacy PCI we have two main considerations when forming groups: + * + * 1) In PCI we can loose the RID inside the fabric, or some devices will use + * the wrong RID. The PCI core calls this aliasing, but from an IOMMU + * perspective it means that a PCI device may have multiple RIDs and a + * single RID may represent many PCI devices. This effectively means all the + * aliases must share a translation, thus group, because the IOMMU cannot + * tell devices apart. + * + * 2) PCI permits a bus segment to claim an address even if the transaction + * originates from an end point not the CPU. When it happens it is called + * peer to peer. Claiming a transaction in the middle of the bus hierarchy + * bypasses the IOMMU translation. The IOMMU subsystem rules require these + * devices to be placed in the same group because they lack isolation from + * each other. In PCI Express the ACS system can be used to inhibit this and + * force transactions to go to the IOMMU. + * + * From a PCI perspective any given PCI bus is either isolating or + * non-isolating. Isolating means downstream originated transactions always + * progress toward the CPU and do not go to other devices on the bus + * segment, while non-isolating means downstream originated transactions can + * progress back downstream through another device on the bus segment. + * + * Beyond buses a multi-function device or bridge can also allow + * transactions to loop back internally from one function to another. + * + * Once a PCI bus becomes non isolating the entire downstream hierarchy of + * that bus becomes a single group. 
+ */ +struct iommu_group *pci_device_group(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct iommu_group *group; + struct pci_dev *real_pdev; + + if (WARN_ON(!dev_is_pci(dev))) + return ERR_PTR(-EINVAL); + + /* + * Arches can supply a completely different PCI device that actually + * does DMA. + */ + real_pdev = pci_real_dma_dev(pdev); + if (real_pdev != pdev) { + group = iommu_group_get(&real_pdev->dev); + if (!group) { + /* + * The real_pdev has not had an iommu probed to it. We + * can't create a new group here because there is no way + * for pci_device_group(real_pdev) to pick it up. + */ + dev_err(dev, + "PCI device is probing out of order, real device of %s is not probed yet\n", + pci_name(real_pdev)); + return ERR_PTR(-EPROBE_DEFER); + } + return group; + } + + if (pdev->dev_flags & PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT) + return iommu_group_alloc(); + + /* Anything upstream of this enforcing non-isolated? */ + group = pci_hierarchy_group(pdev); + if (group) + return group; + + switch (pci_bus_isolated(pci_physfn(pdev)->bus)) { + case PCIE_ISOLATED: + /* Check multi-function groups and same-bus devfn aliases */ + group = pci_get_function_group(pdev); + if (group) + return group; + + /* No shared group found, allocate new */ + return iommu_group_alloc(); + + /* + * On legacy PCI there is no RID at an electrical level. On PCI-X the + * RID of the bridge may be used in some cases instead of the + * downstream's RID. This creates aliasing problems. PCI/PCI-X doesn't + * provide isolation either. The end result is that as soon as we hit a + * PCI/PCI-X bus we switch to non-isolated for the whole downstream for + * both aliasing and isolation reasons. The bridge has to be included in + * the group because of the aliasing. + */ + case PCI_BRIDGE_NON_ISOLATED: + /* A PCIe switch where the USP has MMIO and is not isolated. 
*/ + case PCIE_NON_ISOLATED: + group = iommu_group_get(&pdev->bus->self->dev); + if (WARN_ON(!group)) + return ERR_PTR(-EINVAL); + /* + * No need to be concerned with aliases here since we are going + * to put the entire downstream tree in the bridge/USP's group. + */ + group->bus_data |= BUS_DATA_PCI_NON_ISOLATED; + return group; + + /* + * It is a PCI bus and the upstream bridge/port does not alias or allow + * P2P. + */ + case PCI_BUS_NON_ISOLATED: + /* + * It is a PCIe switch and the DSP cannot reach the USP. The DSP's + * are not isolated from each other and share a group. + */ + case PCIE_SWITCH_DSP_NON_ISOLATED: { + struct pci_dev *piter = NULL; + + /* + * All the downstream devices on the bus share a group. If this + * is a PCIe switch then they will all be DSPs + */ + for_each_pci_dev(piter) { + if (piter->bus != pdev->bus) + continue; + group = iommu_group_get(&piter->dev); + if (group) { + pci_dev_put(piter); + if (WARN_ON(!(group->bus_data & + BUS_DATA_PCI_NON_ISOLATED))) + group->bus_data |= + BUS_DATA_PCI_NON_ISOLATED; + return group; + } + } + return pci_group_alloc_non_isolated(); + } + default: + break; + } + WARN_ON(true); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(pci_device_group); diff --git a/include/linux/pci.h b/include/linux/pci.h index c36fff9d2254f8..fb9adf0562f8ef 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2093,6 +2093,9 @@ static inline int pci_dev_present(const struct pci_device_id *ids) #define no_pci_devices() (1) #define pci_dev_put(dev) do { } while (0) +static inline struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) +{ return dev; } + static inline void pci_set_master(struct pci_dev *dev) { } static inline void pci_clear_master(struct pci_dev *dev) { } static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } -- 2.43.0 To avoid some internal padding. 
Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1874bbdc73b75e..543d6347c0e5e3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -58,13 +58,13 @@ struct iommu_group { void *iommu_data; void (*iommu_data_release)(void *iommu_data); char *name; - int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; struct iommu_domain *domain; struct list_head entry; - unsigned int owner_cnt; void *owner; + unsigned int owner_cnt; + int id; /* Used by the device_group() callbacks */ u32 bus_data; -- 2.43.0 Implement pci_reachable_set() to efficiently compute a set of devices on the same bus that are "reachable" from a starting device. The meaning of reachability is defined by the caller through a callback function. This is a faster implementation of the same logic in pci_device_group(). Being inside the PCI core allows use of pci_bus_sem so it can use list_for_each_entry() on a small list of devices instead of the expensive for_each_pci_dev(). Server systems can now have hundreds of PCI devices, but typically only a very small number of devices per bus. An example of a reachability function would be pci_devs_are_dma_aliases() which would compute a set of devices on the same bus that are aliases. This would also be useful in future support for the ACS P2P Egress Vector which has a similar reachability problem. This is effectively a graph algorithm where the set of devices on the bus are vertexes and the reachable() function defines the edges. It returns a set of vertexes that form a connected graph. 
Signed-off-by: Jason Gunthorpe --- drivers/pci/search.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 12 ++++++ 2 files changed, 102 insertions(+) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index fe6c07e67cb8ce..dac6b042fd5f5d 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -595,3 +595,93 @@ int pci_dev_present(const struct pci_device_id *ids) return 0; } EXPORT_SYMBOL(pci_dev_present); + +/** + * pci_reachable_set - Generate a bitmap of devices within a reachability set + * @start: First device in the set + * @devfns: The set of devices on the bus + * @reachable: Callback to tell if two devices can reach each other + * + * Compute a bitmap where every set bit is a device on the bus that is reachable + * from the start device, including the start device. Reachability between two + * devices is determined by a callback function. + * + * This is a non-recursive implementation that invokes the callback once per + * pair. The callback must be commutative: + * reachable(a, b) == reachable(b, a) + * reachable() can form a cyclic graph: + * reachable(a,b) == reachable(b,c) == reachable(c,a) == true + * + * Since this function is limited to a single bus the largest set can be 256 + * devices large. 
+ */ +void pci_reachable_set(struct pci_dev *start, struct pci_reachable_set *devfns, + bool (*reachable)(struct pci_dev *deva, + struct pci_dev *devb)) +{ + struct pci_reachable_set todo_devfns = {}; + struct pci_reachable_set next_devfns = {}; + struct pci_bus *bus = start->bus; + bool again; + + /* Assume devfn of all PCI devices is bounded by MAX_NR_DEVFNS */ + static_assert(sizeof(next_devfns.devfns) * BITS_PER_BYTE >= + MAX_NR_DEVFNS); + + memset(devfns, 0, sizeof(devfns->devfns)); + __set_bit(start->devfn, devfns->devfns); + __set_bit(start->devfn, next_devfns.devfns); + + down_read(&pci_bus_sem); + while (true) { + unsigned int devfna; + unsigned int i; + + /* + * For each device that hasn't been checked compare every + * device on the bus against it. + */ + again = false; + for_each_set_bit(devfna, next_devfns.devfns, MAX_NR_DEVFNS) { + struct pci_dev *deva = NULL; + struct pci_dev *devb; + + list_for_each_entry(devb, &bus->devices, bus_list) { + if (devb->devfn == devfna) + deva = devb; + + if (test_bit(devb->devfn, devfns->devfns)) + continue; + + if (!deva) { + deva = devb; + list_for_each_entry_continue( + deva, &bus->devices, bus_list) + if (deva->devfn == devfna) + break; + } + + if (!reachable(deva, devb)) + continue; + + __set_bit(devb->devfn, todo_devfns.devfns); + again = true; + } + } + + if (!again) + break; + + /* + * Every new bit adds a new deva to check, reloop the whole + * thing. Expect this to be rare. 
+ */ + for (i = 0; i != ARRAY_SIZE(devfns->devfns); i++) { + devfns->devfns[i] |= todo_devfns.devfns[i]; + next_devfns.devfns[i] = todo_devfns.devfns[i]; + todo_devfns.devfns[i] = 0; + } + } + up_read(&pci_bus_sem); +} +EXPORT_SYMBOL_GPL(pci_reachable_set); diff --git a/include/linux/pci.h b/include/linux/pci.h index fb9adf0562f8ef..21f6b20b487f8d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -855,6 +855,10 @@ struct pci_dynids { struct list_head list; /* For IDs added at runtime */ }; +struct pci_reachable_set { + DECLARE_BITMAP(devfns, 256); +}; + enum pci_bus_isolation { /* * The bus is off a root port and the root port has isolated ACS flags @@ -1269,6 +1273,9 @@ struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus, struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from); struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from); +void pci_reachable_set(struct pci_dev *start, struct pci_reachable_set *devfns, + bool (*reachable)(struct pci_dev *deva, + struct pci_dev *devb)); enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus); int pci_dev_present(const struct pci_device_id *ids); @@ -2084,6 +2091,11 @@ static inline struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from) { return NULL; } +static inline void +pci_reachable_set(struct pci_dev *start, struct pci_reachable_set *devfns, + bool (*reachable)(struct pci_dev *deva, struct pci_dev *devb)) +{ } + static inline enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus) { return PCIE_NON_ISOLATED; } -- 2.43.0 Like with switches the current MFD algorithm does not consider asymmetric ACS within a MFD. If any MFD function has ACS that permits P2P the spec says it can reach through the MFD internal loopback any other function in the device. 
For discussion let's consider a simple MFD topology like the below: -- MFD 00:1f.0 ACS != REQ_ACS_FLAGS Root 00:00.00 --|- MFD 00:1f.2 ACS != REQ_ACS_FLAGS |- MFD 00:1f.6 ACS = REQ_ACS_FLAGS This asymmetric ACS could be created using the config_acs kernel command line parameter, from quirks, or from a poorly thought out device that has ACS flags only on some functions. Since ACS is an egress property the asymmetric flags allow for 00:1f.0 to do memory accesses into 00:1f.6's BARs, but 00:1f.6 cannot reach any other function. Thus we expect an iommu_group to contain all three devices. Instead the current algorithm gives a group of [1f.0, 1f.2] and a single device group of 1f.6. The current algorithm sees the good ACS flags on 00:1f.6 and does not consider ACS on any other MFD functions. For path properties the ACS flags say that 00:1f.6 is safe to use with PASID and supports SVA as it will not have any portions of its address space routed away from the IOMMU, this part of the ACS system is working correctly. Further, if one of the MFD functions is a bridge, eg like 1f.2: -- MFD 00:1f.0 Root 00:00.00 --|- MFD 00:1f.2 Root Port --- 01:01.0 |- MFD 00:1f.6 Then the correct grouping will include 01:01.0, 00:1f.0/2/6 together in a group if there is any internal loopback within the MFD 00:1f. The current algorithm does not understand this and gives 01:01.0 its own group even if it thinks there is an internal loopback in the MFD. Unfortunately this detail makes it hard to fix. Currently the code assumes that any MFD without an ACS cap has an internal loopback which will cause a large number of modern real systems to group in a pessimistic way. However, the PCI spec does not really support this: PCI Section 6.12.1.2 ACS Functions in SR-IOV, SIOV, and Multi-Function Devices ACS P2P Request Redirect: must be implemented by Functions that support peer-to-peer traffic with other Functions. 
Meaning from a spec perspective the absence of ACS indicates the absence of internal loopback. Granted I think we are aware of older real devices that ignore this, but it seems to be the only way forward. So, rely on 6.12.1.2 and assume functions without ACS do not have internal loopback. This resolves the common issue with modern systems and MFD root ports, but it makes the ACS quirks system less used. Instead we'd want quirks that say self-loopback is actually present, not like today's quirks that say it is absent. This is surely negative for older hardware, but positive for new HW that complies with the spec. Use pci_reachable_set() in pci_device_group() to make the resulting algorithm faster and easier to understand. Add pci_mfds_are_same_group() which specifically looks pair-wise at all functions in the MFDs. Any function with ACS capabilities and non-isolated ACS flags will become reachable to all other functions. pci_reachable_set() does the calculations for figuring out the set of devices under the pci_bus_sem, which is better than repeatedly searching across all PCI devices. Once the set of devices is determined and the set has more than one device use pci_get_slot() to search for any existing groups in the reachable set. 
Fixes: 104a1c13ac66 ("iommu/core: Create central IOMMU group lookup/creation interface") Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 189 +++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 102 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 543d6347c0e5e3..fc3c71b243a850 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1413,85 +1413,6 @@ int iommu_group_id(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_id); -static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, - unsigned long *devfns); - -/* - * For multifunction devices which are not isolated from each other, find - * all the other non-isolated functions and look for existing groups. For - * each function, we also need to look for aliases to or from other devices - * that may already have a group. - */ -static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev, - unsigned long *devfns) -{ - struct pci_dev *tmp = NULL; - struct iommu_group *group; - - if (!pdev->multifunction || pci_acs_enabled(pdev, PCI_ACS_ISOLATED)) - return NULL; - - for_each_pci_dev(tmp) { - if (tmp == pdev || tmp->bus != pdev->bus || - PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) || - pci_acs_enabled(tmp, PCI_ACS_ISOLATED)) - continue; - - group = get_pci_alias_group(tmp, devfns); - if (group) { - pci_dev_put(tmp); - return group; - } - } - - return NULL; -} - -/* - * Look for aliases to or from the given device for existing groups. DMA - * aliases are only supported on the same bus, therefore the search - * space is quite small (especially since we're really only looking at pcie - * device, and therefore only expect multiple slots on the root complex or - * downstream switch ports). It's conceivable though that a pair of - * multifunction devices could have aliases between them that would cause a - * loop. To prevent this, we use a bitmap to track where we've been. 
- */ -static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, - unsigned long *devfns) -{ - struct pci_dev *tmp = NULL; - struct iommu_group *group; - - if (test_and_set_bit(pdev->devfn & 0xff, devfns)) - return NULL; - - group = iommu_group_get(&pdev->dev); - if (group) - return group; - - for_each_pci_dev(tmp) { - if (tmp == pdev || tmp->bus != pdev->bus) - continue; - - /* We alias them or they alias us */ - if (pci_devs_are_dma_aliases(pdev, tmp)) { - group = get_pci_alias_group(tmp, devfns); - if (group) { - pci_dev_put(tmp); - return group; - } - - group = get_pci_function_alias_group(tmp, devfns); - if (group) { - pci_dev_put(tmp); - return group; - } - } - } - - return NULL; -} - /* * Generic device_group call-back function. It just allocates one * iommu-group per device. @@ -1534,44 +1455,108 @@ static struct iommu_group *pci_group_alloc_non_isolated(void) return group; } +/* + * All functions in the MFD need to be isolated from each other and get their + * own groups, otherwise the whole MFD will share a group. + */ +static bool pci_mfds_are_same_group(struct pci_dev *deva, struct pci_dev *devb) +{ + /* + * SRIOV VFs will use the group of the PF if it has + * BUS_DATA_PCI_NON_ISOLATED. We don't support VFs that also have ACS + * that are set to non-isolating. + */ + if (deva->is_virtfn || devb->is_virtfn) + return false; + + /* Are deva/devb functions in the same MFD? */ + if (PCI_SLOT(deva->devfn) != PCI_SLOT(devb->devfn)) + return false; + /* Don't understand what is happening, be conservative */ + if (deva->multifunction != devb->multifunction) + return true; + if (!deva->multifunction) + return false; + + /* + * PCI Section 6.12.1.2 ACS Functions in SR-IOV, SIOV, and + * Multi-Function Devices + * ... + * ACS P2P Request Redirect: must be implemented by Functions that + * support peer-to-peer traffic with other Functions. + * + * Therefore assume if a MFD has no ACS capability then it does not + * support a loopback. 
This is the reverse of what Linux <= v6.16 + * assumed - that any MFD was capable of P2P and used quirks to identify + * devices that complied with the above. + */ + if (deva->acs_cap && !pci_acs_enabled(deva, PCI_ACS_ISOLATED)) + return true; + if (devb->acs_cap && !pci_acs_enabled(devb, PCI_ACS_ISOLATED)) + return true; + return false; +} + +static bool pci_devs_are_same_group(struct pci_dev *deva, struct pci_dev *devb) +{ + /* + * This is allowed to return cycles: a,b -> b,c -> c,a can be aliases. + */ + if (pci_devs_are_dma_aliases(deva, devb)) + return true; + + return pci_mfds_are_same_group(deva, devb); +} + /* * Return a group if the function has isolation restrictions related to * aliases or MFD ACS. */ static struct iommu_group *pci_get_function_group(struct pci_dev *pdev) { - struct iommu_group *group; - DECLARE_BITMAP(devfns, 256) = {}; + struct pci_reachable_set devfns; + const unsigned int NR_DEVFNS = sizeof(devfns.devfns) * BITS_PER_BYTE; + unsigned int devfn; /* - * Look for existing groups on device aliases. If we alias another - * device or another device aliases us, use the same group. + * Look for existing groups on device aliases and multi-function ACS. If + * we alias another device or another device aliases us, use the same + * group. + * + * pci_reachable_set() should return the same bitmap if called for any + * device in the set and we want all devices in the set to have the same + * group. */ - group = get_pci_alias_group(pdev, devfns); - if (group) - return group; + pci_reachable_set(pdev, &devfns, pci_devs_are_same_group); + /* start is known to have iommu_group_get() == NULL */ + __clear_bit(pdev->devfn, devfns.devfns); /* - * Look for existing groups on non-isolated functions on the same - * slot and aliases of those funcions, if any. No need to clear - * the search bitmap, the tested devfns are still valid. 
- */ - group = get_pci_function_alias_group(pdev, devfns); - if (group) - return group; - - /* - * When MFD's are included in the set due to ACS we assume that if ACS - * permits an internal loopback between functions it also permits the - * loopback to go downstream if a function is a bridge. + * When MFD functions are included in the set due to ACS we assume that + * if ACS permits an internal loopback between functions it also permits + * the loopback to go downstream if any function is a bridge. * * It is less clear what aliases mean when applied to a bridge. For now * be conservative and also propagate the group downstream. */ - __clear_bit(pdev->devfn & 0xFF, devfns); - if (!bitmap_empty(devfns, sizeof(devfns) * BITS_PER_BYTE)) - return pci_group_alloc_non_isolated(); - return NULL; + if (bitmap_empty(devfns.devfns, NR_DEVFNS)) + return NULL; + + for_each_set_bit(devfn, devfns.devfns, NR_DEVFNS) { + struct iommu_group *group; + struct pci_dev *pdev_slot; + + pdev_slot = pci_get_slot(pdev->bus, devfn); + group = iommu_group_get(&pdev_slot->dev); + pci_dev_put(pdev_slot); + if (group) { + if (WARN_ON(!(group->bus_data & + BUS_DATA_PCI_NON_ISOLATED))) + group->bus_data |= BUS_DATA_PCI_NON_ISOLATED; + return group; + } + } + return pci_group_alloc_non_isolated(); } /* Return a group if the upstream hierarchy has isolation restrictions. */ -- 2.43.0 Directly check that the devices touched by pci_for_each_dma_alias() match the groups that were built by pci_device_group(). This helps validate that pci_for_each_dma_alias() and pci_bus_isolated() are consistent. This should eventually be hidden behind a debug kconfig, but for now it is good to get feedback from more diverse systems if there are any problems. 
Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 76 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fc3c71b243a850..2bd43a5a9ad8d8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1627,7 +1627,7 @@ static struct iommu_group *pci_hierarchy_group(struct pci_dev *pdev) * Once a PCI bus becomes non isolating the entire downstream hierarchy of * that bus becomes a single group. */ -struct iommu_group *pci_device_group(struct device *dev) +static struct iommu_group *__pci_device_group(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct iommu_group *group; @@ -1734,6 +1734,80 @@ struct iommu_group *pci_device_group(struct device *dev) WARN_ON(true); return ERR_PTR(-EINVAL); } + +struct check_group_aliases_data { + struct pci_dev *pdev; + struct iommu_group *group; +}; + +static void pci_check_group(const struct check_group_aliases_data *data, + u16 alias, struct pci_dev *pdev) +{ + struct iommu_group *group; + + group = iommu_group_get(&pdev->dev); + if (!group) + return; + + if (group != data->group) + dev_err(&data->pdev->dev, + "During group construction alias processing needed dev %s alias %x to have the same group but %u != %u\n", + pci_name(pdev), alias, data->group->id, group->id); + iommu_group_put(group); +} + +static int pci_check_group_aliases(struct pci_dev *pdev, u16 alias, + void *opaque) +{ + const struct check_group_aliases_data *data = opaque; + + /* + * Sometimes when a PCIe-PCI bridge is performing transactions on behalf + * of its subordinate bus it uses devfn=0 on the subordinate bus as the + * alias. This means that 0 will alias with all devfns on the + * subordinate bus and so we expect to see those in the same group. pdev + * in this case is the bridge itself and pdev->bus is the primary bus of + * the bridge. 
+ */ + if (pdev->bus->number != PCI_BUS_NUM(alias)) { + struct pci_dev *piter = NULL; + + for_each_pci_dev(piter) { + if (pci_domain_nr(pdev->bus) == + pci_domain_nr(piter->bus) && + PCI_BUS_NUM(alias) == piter->bus->number) + pci_check_group(data, alias, piter); + } + } else { + pci_check_group(data, alias, pdev); + } + + return 0; +} + +struct iommu_group *pci_device_group(struct device *dev) +{ + struct check_group_aliases_data data = { + .pdev = to_pci_dev(dev), + }; + struct iommu_group *group; + + if (!IS_ENABLED(CONFIG_PCI)) + return ERR_PTR(-EINVAL); + + group = __pci_device_group(dev); + if (IS_ERR(group)) + return group; + + /* + * The IOMMU driver should use pci_for_each_dma_alias() to figure out + * what RIDs to program and the core requires all the RIDs to fall + * within the same group. Validate that everything worked properly. + */ + data.group = group; + pci_for_each_dma_alias(data.pdev, pci_check_group_aliases, &data); + return group; +} EXPORT_SYMBOL_GPL(pci_device_group); /* Get the IOMMU group for device on fsl-mc bus */ -- 2.43.0 This brings the definitions up to PCI Express revision 5.0: * ACS I/O Request Blocking Enable * ACS DSP Memory Target Access Control * ACS USP Memory Target Access Control * ACS Unclaimed Request Redirect Support for this entire grouping is advertised by the ACS Enhanced Capability bit. Signed-off-by: Jason Gunthorpe --- include/uapi/linux/pci_regs.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 6095e7d7d4cc48..54621e6e83572e 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1005,8 +1005,16 @@ #define PCI_ACS_UF 0x0010 /* Upstream Forwarding */ #define PCI_ACS_EC 0x0020 /* P2P Egress Control */ #define PCI_ACS_DT 0x0040 /* Direct Translated P2P */ +#define PCI_ACS_ENHANCED 0x0080 /* IORB, DSP_MT_xx, USP_MT_XX. 
Capability only */ +#define PCI_ACS_EGRESS_CTL_SZ GENMASK(15, 8) /* Egress Control Vector Size */ #define PCI_ACS_EGRESS_BITS 0x05 /* ACS Egress Control Vector Size */ #define PCI_ACS_CTRL 0x06 /* ACS Control Register */ +#define PCI_ACS_IORB 0x0080 /* I/O Request Blocking */ +#define PCI_ACS_DSP_MT_RB 0x0100 /* DSP Memory Target Access Control Request Blocking */ +#define PCI_ACS_DSP_MT_RR 0x0200 /* DSP Memory Target Access Control Request Redirect */ +#define PCI_ACS_USP_MT_RB 0x0400 /* USP Memory Target Access Control Request Blocking */ +#define PCI_ACS_USP_MT_RR 0x0800 /* USP Memory Target Access Control Request Redirect */ +#define PCI_ACS_UNCLAIMED_RR 0x1000 /* Unclaimed Request Redirect Control */ #define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */ /* -- 2.43.0 The ACS Enhanced bits are intended to address a lack of precision in the spec about what ACS P2P Request Redirect is supposed to do. While Linux has long assumed that PCI_ACS_RR would cover MMIO BARs located in the root port and PCIe Switch ports, the spec took the position that it is implementation specific. To get the behavior Linux has long assumed it should be setting: PCI_ACS_RR | PCI_ACS_DSP_MT_RR | PCI_ACS_USP_MT_RR | PCI_ACS_UNCLAIMED_RR Follow this guidance in enable_acs and set the additional bits if ACS Enhanced is supported. Allow config_acs to control these bits if the device has ACS Enhanced. The spec permits the HW to wire the bits, so after setting them pci_acs_flags_enabled() does do a pci_read_config_word() to read the actual value in effect. Note that currently Linux sets these bits to 0, so any new HW that comes supporting ACS Enhanced will end up with historical Linux disabling these functions. Devices wanting to be compatible with old Linux will need to wire the ctrl bits to follow ACS_RR. Devices that implement ACS Enhanced and support the ctrl=0 behavior will break PASID SVA support and VFIO isolation when ACS_RR is enabled. 
Due to the above I strongly encourage backporting this change otherwise old kernels may have issues with new generations of PCI switches. Signed-off-by: Jason Gunthorpe --- drivers/pci/pci.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cddd..983f71211f0055 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -957,6 +957,7 @@ static void __pci_config_acs(struct pci_dev *dev, struct pci_acs *caps, const char *p, const u16 acs_mask, const u16 acs_flags) { u16 flags = acs_flags; + u16 supported_flags; u16 mask = acs_mask; char *delimit; int ret = 0; @@ -1001,8 +1002,14 @@ static void __pci_config_acs(struct pci_dev *dev, struct pci_acs *caps, } } - if (mask & ~(PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | PCI_ACS_CR | - PCI_ACS_UF | PCI_ACS_EC | PCI_ACS_DT)) { + supported_flags = PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | + PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_EC | + PCI_ACS_DT; + if (caps->cap & PCI_ACS_ENHANCED) + supported_flags |= PCI_ACS_USP_MT_RR | + PCI_ACS_DSP_MT_RR | + PCI_ACS_UNCLAIMED_RR; + if (mask & ~supported_flags) { pci_err(dev, "Invalid ACS flags specified\n"); return; } @@ -1062,6 +1069,14 @@ static void pci_std_enable_acs(struct pci_dev *dev, struct pci_acs *caps) /* Upstream Forwarding */ caps->ctrl |= (caps->cap & PCI_ACS_UF); + /* + * USP/DSP Memory Target Access Control and Unclaimed Request Redirect + */ + if (caps->cap & PCI_ACS_ENHANCED) { + caps->ctrl |= PCI_ACS_USP_MT_RR | PCI_ACS_DSP_MT_RR | + PCI_ACS_UNCLAIMED_RR; + } + /* Enable Translation Blocking for external devices and noats */ if (pci_ats_disabled() || dev->external_facing || dev->untrusted) caps->ctrl |= (caps->cap & PCI_ACS_TB); -- 2.43.0 Switches ignore the PASID when routing TLPs. This means the path from the PASID issuing end point to the IOMMU must be direct with no possibility for another device to claim the addresses. This is done using ACS flags and pci_enable_pasid() checks for this. 
The new ACS Enhanced bits clarify some undefined behaviors in the spec around what P2P Request Redirect means. Linux has long assumed that PCI_ACS_RR implies PCI_ACS_DSP_MT_RR | PCI_ACS_USP_MT_RR | PCI_ACS_UNCLAIMED_RR. If the device supports ACS Enhanced then use the information it reports to determine if PASID SVA is supported or not. PCI_ACS_DSP_MT_RR: Prevents Downstream Port BAR's from claiming upstream flowing transactions PCI_ACS_USP_MT_RR: Prevents Upstream Port BAR's from claiming upstream flowing transactions PCI_ACS_UNCLAIMED_RR: Prevents a hole in the USP bridge window compared to all the DSP bridge windows from generating a error. Each of these cases would poke a hole in the PASID address space which is not permitted. Enhance the comments around pci_acs_flags_enabled() to better explain the reasoning for its logic. Continue to take the approach of assuming the device is doing the "right ACS" if it does not explicitly declare otherwise. Fixes: 201007ef707a ("PCI: Enable PASID only when ACS RR & UF enabled on upstream path") Signed-off-by: Jason Gunthorpe --- drivers/pci/ats.c | 4 +++- drivers/pci/pci.c | 54 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index ec6c8dbdc5e9c9..00603c2c4ff0ea 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -416,7 +416,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pasid) return -EINVAL; - if (!pci_acs_path_enabled(pdev, NULL, PCI_ACS_RR | PCI_ACS_UF)) + if (!pci_acs_path_enabled(pdev, NULL, + PCI_ACS_RR | PCI_ACS_UF | PCI_ACS_USP_MT_RR | + PCI_ACS_DSP_MT_RR | PCI_ACS_UNCLAIMED_RR)) return -EINVAL; pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 983f71211f0055..620b7f79093854 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3606,6 +3606,52 @@ void pci_configure_ari(struct pci_dev *dev) } } + +/* + * The spec is 
not clear what it means if the capability bit is 0. One view is + * that the device acts as though the ctrl bit is zero, another view is the + * device behavior is undefined. + * + * Historically Linux has taken the position that the capability bit as 0 means + * the device supports the most favorable interpretation of the spec - ie that + * things like P2P RR are always on. As this is security sensitive we expect + * devices that do not follow this rule to be quirked. + * + * ACS Enhanced eliminated undefined areas of the spec around MMIO in root ports + * and switch ports. If those ports have no MMIO then it is not relevant. + * PCI_ACS_UNCLAIMED_RR eliminates the undefined area around an upstream switch + * window that is not fully decoded by the downstream windows. + * + * This takes the same approach with ACS Enhanced, if the device does not + * support it then we assume the ACS P2P RR has all the enhanced behaviors too. + * + * Due to ACS Enhanced bits being force set to 0 by older Linux kernels, and + * those values would break old kernels on the edge cases they cover, the only + * compatible thing for a new device to implement is ACS Enhanced supported with + * the control bits (except PCI_ACS_IORB) wired to follow ACS_RR. + */ +static u16 pci_acs_ctrl_mask(struct pci_dev *pdev, u16 hw_cap) +{ + /* + * Egress Control enables use of the Egress Control Vector which is not + * present without the cap. + */ + u16 mask = PCI_ACS_EC; + + mask |= hw_cap & (PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | + PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT); + + /* + * If ACS Enhanced is supported the device reports what it is doing + * through these bits which may not be settable. 
+ */ + if (hw_cap & PCI_ACS_ENHANCED) + mask |= PCI_ACS_IORB | PCI_ACS_DSP_MT_RB | PCI_ACS_DSP_MT_RR | + PCI_ACS_USP_MT_RB | PCI_ACS_USP_MT_RR | + PCI_ACS_UNCLAIMED_RR; + return mask; +} + static bool pci_acs_flags_enabled(struct pci_dev *pdev, u16 acs_flags) { int pos; @@ -3615,15 +3661,9 @@ static bool pci_acs_flags_enabled(struct pci_dev *pdev, u16 acs_flags) if (!pos) return false; - /* - * Except for egress control, capabilities are either required - * or only required if controllable. Features missing from the - * capability field can therefore be assumed as hard-wired enabled. - */ pci_read_config_word(pdev, pos + PCI_ACS_CAP, &cap); - acs_flags &= (cap | PCI_ACS_EC); - pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl); + acs_flags &= pci_acs_ctrl_mask(pdev, cap); return (ctrl & acs_flags) == acs_flags; } -- 2.43.0 When looking at a PCIe switch we want to see that the USP/DSP MMIO have request redirect enabled. Detect the case where the USP is expressly not isolated from the DSP and ensure the USP is included in the group. The DSP Memory Target also applies to the Root Port, check it there too. If upstream directed transactions can reach the root port MMIO then it is not isolated. Signed-off-by: Jason Gunthorpe --- drivers/pci/search.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index dac6b042fd5f5d..cba417cbe3476e 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -127,6 +127,8 @@ static enum pci_bus_isolation pcie_switch_isolated(struct pci_bus *bus) * traffic flowing upstream back downstream through another DSP. * * Thus any non-permissive DSP spoils the whole bus. + * PCI_ACS_UNCLAIMED_RR is not required since rejecting requests with + * error is still isolation. 
*/ guard(rwsem_read)(&pci_bus_sem); list_for_each_entry(pdev, &bus->devices, bus_list) { @@ -136,8 +138,14 @@ static enum pci_bus_isolation pcie_switch_isolated(struct pci_bus *bus) pdev->dma_alias_mask) return PCIE_NON_ISOLATED; - if (!pci_acs_enabled(pdev, PCI_ACS_ISOLATED)) + if (!pci_acs_enabled(pdev, PCI_ACS_ISOLATED | + PCI_ACS_DSP_MT_RR | + PCI_ACS_USP_MT_RR)) { + /* The USP is isolated from the DSP */ + if (!pci_acs_enabled(pdev, PCI_ACS_USP_MT_RR)) + return PCIE_NON_ISOLATED; return PCIE_SWITCH_DSP_NON_ISOLATED; + } } return PCIE_ISOLATED; } @@ -232,11 +240,13 @@ enum pci_bus_isolation pci_bus_isolated(struct pci_bus *bus) /* * Since PCIe links are point to point root ports are isolated if there * is no internal loopback to the root port's MMIO. Like MFDs assume if - * there is no ACS cap then there is no loopback. + * there is no ACS cap then there is no loopback. The root port uses + * DSP_MT_RR for its own MMIO. */ case PCI_EXP_TYPE_ROOT_PORT: if (bridge->acs_cap && - !pci_acs_enabled(bridge, PCI_ACS_ISOLATED)) + !pci_acs_enabled(bridge, + PCI_ACS_ISOLATED | PCI_ACS_DSP_MT_RR)) return PCIE_NON_ISOLATED; return PCIE_ISOLATED; -- 2.43.0