From 21d1a13a903c50d94eb24a1220c47031edefde60 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:38 +0000 Subject: [PATCH 01/42] cxl: support Type2 when initializing cxl_dev_state BugLink: https://bugs.launchpad.net/bugs/2153819 In preparation for type2 drivers add function and macro for differentiating CXL memory expanders (type 3) from CXL device accelerators (type 2) helping drivers built from public headers to embed struct cxl_dev_state inside a private struct. Update type3 driver for using this same initialization. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-2-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 9a775c07bb04384f7c03a35dd04818ed818c1f71) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit b3b0bd6c29f1b77917b3e07dc09855f227bc8ad2) --- drivers/cxl/cxlmem.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 776c50d1db51..700798d2f23c 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -440,6 +440,37 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. 
+ * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) + enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, -- Gitee From d08195e26c7e901a822999223fd6aa134ca22fee Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:39 +0000 Subject: [PATCH 02/42] cxl: export internal structs for external Type2 drivers BugLink: https://bugs.launchpad.net/bugs/2153819 In preparation for type2 support, move structs and functions a type2 driver will need to access to into a new shared header file. Differentiate between public and private data to be preserved by type2 drivers. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Tested-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-3-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 005869886d1d370afb6c10cd40709d956960e9c2) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit c6ac38b966f9f6c122600da3360782c42e0e96f9) --- drivers/cxl/cxlmem.h | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 700798d2f23c..776c50d1db51 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -440,37 +440,6 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } -struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, - enum cxl_devtype type, - u64 serial, u16 dvsec, - size_t size, bool has_mbox); - -/** - * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a - * driver specific struct. - * - * @parent: device behind the request - * @type: CXL device type - * @serial: device identification - * @dvsec: dvsec capability offset - * @drv_struct: driver struct embedding a cxl_dev_state struct - * @member: name of the struct cxl_dev_state member in drv_struct - * @mbox: true if mailbox supported - * - * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state - * struct initialized. - * - * Introduced for Type2 driver support. 
- */ -#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ - ({ \ - static_assert(__same_type(struct cxl_dev_state, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member) == 0); \ - (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ - sizeof(drv_struct), mbox); \ - }) - enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, -- Gitee From f1e5255a054e9c0515c16ddb9c93004d9a5d8f87 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:40 +0000 Subject: [PATCH 03/42] cxl: Move pci generic code from cxl_pci to core/cxl_pci BugLink: https://bugs.launchpad.net/bugs/2153819 Inside cxl/core/pci.c there are helpers for CXL PCIe initialization meanwhile cxl/pci_drv.c implements the functionality for a Type3 device initialization. In preparation for type2 support, move helper functions from cxl/pci.c to cxl/core/pci.c in order to be exported and used by type2 drivers. [ dj: Clarified subject. ] Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: Gregory Price Link: https://patch.msgid.link/20260306164741.3796372-4-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 58f28930c7fb0e24cdf2972a9c3b7c91aeef4539) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit f20bbd18cc0d44efea3e09d21b55d238f749a788) --- drivers/cxl/core/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index d1f487b3d809..c32cc62c501d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -707,6 +707,11 @@ static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, .resource = CXL_RESOURCE_NONE, }; + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); if (component_reg_phys == CXL_RESOURCE_NONE) return -ENXIO; -- Gitee From d9d9f3f3fafd621466b218a7f531858a1b0f18f1 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 6 Mar 2026 16:47:41 +0000 Subject: [PATCH 04/42] cxl/pci: Remove redundant cxl_pci_find_port() call BugLink: https://bugs.launchpad.net/bugs/2153819 Remove the redundant port lookup from cxl_rcrb_get_comp_regs() and use the dport parameter directly. The caller has already validated the port is non-NULL before invoking this function, and dport is given as a param. This is simpler than getting dport in the callee and return the pointer to the caller what would require more changes. Signed-off-by: Gregory Price Reviewed-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20260306164741.3796372-5-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit d537d953c47866bafc89feb66d8ef34baf17659a) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 9dad3d47e247618fedf0351b1e754fb441e66a99) --- drivers/cxl/core/pci.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c32cc62c501d..d1f487b3d809 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -707,11 +707,6 @@ static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, .resource = CXL_RESOURCE_NONE, }; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); if (component_reg_phys == CXL_RESOURCE_NONE) return -ENXIO; -- Gitee From c08ff96e72ddba4372d1967270c9fa106fdc5b00 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:24 +0100 Subject: [PATCH 05/42] NVIDIA: VR: SAUCE: cxl: Prepare memdev creation for type2 BugLink: https://bugs.launchpad.net/bugs/2153819 Current cxl core is relying on a CXL_DEVTYPE_CLASSMEM type device when creating a memdev leading to problems when obtaining cxl_memdev_state references from a CXL_DEVTYPE_DEVMEM type. Modify check for obtaining cxl_memdev_state adding CXL_DEVTYPE_DEVMEM support. Make devm_cxl_add_memdev accessible from an accel driver. Signed-off-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Dan Williams (cherry picked from https://lore.kernel.org/r/20260423180528.17166-5-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 1b0bf4179e1ec4d96fce92643649a31a683a256a) --- drivers/cxl/core/memdev.c | 15 +++++++++++-- drivers/cxl/cxlmem.h | 6 ------ drivers/cxl/mem.c | 45 +++++++++++++++++++++++++++++---------- include/cxl/cxl.h | 8 +++++++ 4 files changed, 55 insertions(+), 19 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 80e65690eb77..0587a7509a6f 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "trace.h" #include "core.h" @@ -579,9 +580,16 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +static const struct device_type cxl_accel_memdev_type = { + .name = "cxl_accel_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, +}; + bool is_cxl_memdev(const struct device *dev) { - return dev->type == &cxl_memdev_type; + return (dev->type == &cxl_memdev_type || + dev->type == &cxl_accel_memdev_type); } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); @@ -710,7 +718,10 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->parent = cxlds->dev; dev->bus = &cxl_bus_type; dev->devt = MKDEV(cxl_mem_major, cxlmd->id); - dev->type = &cxl_memdev_type; + if (cxlds->type == CXL_DEVTYPE_DEVMEM) + dev->type = &cxl_accel_memdev_type; + else + dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); INIT_WORK(&cxlmd->detach_work, detach_memdev); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 776c50d1db51..92cca400d113 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,10 +34,6 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) -struct cxl_memdev_attach { - int (*probe)(struct cxl_memdev *cxlmd); -}; - /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -103,8 
+99,6 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, - const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index fcffe24dcb42..ff858318091f 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -65,6 +65,26 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static void cxl_memdev_poison_enable(struct cxl_memdev_state *mds, + struct cxl_memdev *cxlmd, + struct dentry *dentry) +{ + /* + * Avoid poison debugfs for DEVMEM aka accelerators as they rely on + * cxl_memdev_state. + */ + if (!mds) + return; + + if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) + debugfs_create_file("inject_poison", 0200, dentry, cxlmd, + &cxl_poison_inject_fops); + + if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) + debugfs_create_file("clear_poison", 0200, dentry, cxlmd, + &cxl_poison_clear_fops); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -92,12 +112,7 @@ static int cxl_mem_probe(struct device *dev) dentry = cxl_debugfs_create_dir(dev_name(dev)); debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); - if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) - debugfs_create_file("inject_poison", 0200, dentry, cxlmd, - &cxl_poison_inject_fops); - if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) - debugfs_create_file("clear_poison", 0200, dentry, cxlmd, - &cxl_poison_clear_fops); + cxl_memdev_poison_enable(mds, cxlmd, dentry); rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); if (rc) @@ -206,16 +221,24 @@ 
static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +static bool cxl_poison_attr_visible(struct kobject *kobj, struct attribute *a) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); - if (a == &dev_attr_trigger_poison_list.attr) - if (!test_bit(CXL_POISON_ENABLED_LIST, - mds->poison.enabled_cmds)) - return 0; + if (!mds || + !test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return false; + + return true; +} + +static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_trigger_poison_list.attr && + !cxl_poison_attr_visible(kobj, a)) + return 0; return a->mode; } diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index fa7269154620..10a9b8fa2f6b 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -149,6 +149,10 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_dev_state - The driver device state * @@ -223,4 +227,8 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ sizeof(drv_struct), mbox); \ }) + +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); #endif /* __CXL_CXL_H__ */ -- Gitee From 0cfd011e2b27f0f856a50c216c6ad499f196aac2 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:26 +0100 Subject: [PATCH 06/42] NVIDIA: VR: SAUCE: cxl: attach region to an accelerator/type2 memdev BugLink: https://bugs.launchpad.net/bugs/2153819 Support an accelerator driver to safely work with an autodiscovered region from a committed HDM 
decoder through: 1) an accelerator driver cxl_attach_region struct with attach and detach callbacks, 2) a specific function, cxl_memdev_attach_region(), holding the required locks for finding a region linked to the memdev endpoint, 3) invoking the attach callback while those locks are held, allowing the accelerator driver to work (ioremap and other internal stuff) with the related physical range, and 4) linking a detach callback to the endpoint device removal where the accelerator driver can stop using the region range. This covers the cases of a potential removal of the cxl_acpi module or an accelerator memdev unbinding from the cxl_mem driver through sysfs. Signed-off-by: Alejandro Lucero (backported from https://lore.kernel.org/r/20260423180528.17166-7-alejandro.lucero-palau@amd.com) [kobak: Check cxl_memdev_attach_region() errors and propagate failure so SFC probe does not continue after CXL core tears down the attached region. Set probe_data->cxl before attaching so the attach callback can use it, guard attach attempts before a valid endpoint exists, explicitly unwind attach/autoremove side effects if devres action registration fails, preserve DEVMEM target type for autodiscovered regions, and route delete / construct-failure cleanup through endpoint-owned devres actions.] [kobak: Keep no-detach DEVMEM unregister under the endpoint-device guard so attach cannot install endpoint devres actions for a region being freed.] [kobak: Avoid devres-registration failure cleanup under cxl_rwsem.region read lock: keep endpoint->dev locked, drop the region/DPA read guards before unregister_region(), and use devm_remove_action() so failed detach-action registration does not run cxl_endpoint_region_autoremove() under the read lock.] Signed-off-by: Koba Ko Acked-by: Matthew R.
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 8e713680899db48ba544181a839d00c8df19d459) --- drivers/cxl/core/region.c | 184 ++++++++++++++++++++++++++++++++++++-- drivers/cxl/cxl.h | 4 + include/cxl/cxl.h | 17 ++++ 3 files changed, 197 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e50dc716d4e8..8cf6f0c89e4d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2559,6 +2559,41 @@ static void unregister_region(void *_cxlr) put_device(&cxlr->dev); } +static void cxl_endpoint_region_autoremove(void *_cxlr); + +static void cxl_region_release_action(struct cxl_region *cxlr) +{ + struct cxl_port *port = cxlrd_to_port(cxlr->cxlrd); + + if (cxlr->type != CXL_DECODER_DEVMEM) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return; + } + + if (cxlr->params.nr_targets) { + struct cxl_endpoint_decoder *cxled = cxlr->params.targets[0]; + struct cxl_port *endpoint = cxled_to_port(cxled); + + guard(device)(&endpoint->dev); + if (cxlr->detach) { + void (*detach)(void *data) = cxlr->detach; + void *detach_data = cxlr->detach_data; + + cxlr->detach = NULL; + cxlr->detach_data = NULL; + devm_release_action(&endpoint->dev, detach, detach_data); + devm_release_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + } else { + unregister_region(cxlr); + } + return; + } + + unregister_region(cxlr); +} + static struct lock_class_key cxl_region_key; static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) @@ -2711,9 +2746,16 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); - if (rc) - return ERR_PTR(rc); + /* + * For accelerators/type2, region release linked to endpoint device. + * See handling of cxl_endpoint_region_autoremove() below by + * cxl_memdev_attach_region(). 
+ */ + if (type == CXL_DECODER_HOSTONLYMEM) { + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + } dev_dbg(port->uport_dev, "%s: created %s\n", dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); @@ -2764,7 +2806,6 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, memregion_free(rc); return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, target_type); } @@ -2836,14 +2877,13 @@ static ssize_t delete_region_store(struct device *dev, const char *buf, size_t len) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); - struct cxl_port *port = to_cxl_port(dev->parent); struct cxl_region *cxlr; cxlr = cxl_find_region_by_name(cxlrd, buf); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); put_device(&cxlr->dev); return len; @@ -3709,7 +3749,6 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, { struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; @@ -3730,7 +3769,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = __construct_region(cxlr, ctx); if (rc) { - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); return ERR_PTR(rc); } @@ -4043,6 +4082,135 @@ static int cxl_region_can_probe(struct cxl_region *cxlr) return 0; } +static int first_mapped_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (cxled->cxld.region) + return 1; + + return 0; +} + +/* + * As this is running in endpoint port remove context it does not race cxl_root + * destruction since 
port topologies are always removed depth first. + */ +static void cxl_endpoint_region_autoremove(void *_cxlr) +{ + unregister_region(_cxlr); +} + +/** + * cxl_memdev_attach_region - bind region to accelerator memdev + * + * @cxlmd: a pointer to cxl_memdev to use + * @attach: a pointer to region attach struct with callbacks for + * safely working with a region range by the caller + * + * Returns 0 or error. + */ +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, + struct cxl_attach_region *attach) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + int rc; + + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + if (!endpoint) + return -ENXIO; + + { + /* hold endpoint lock to setup autoremove of the region */ + guard(device)(&endpoint->dev); + if (!endpoint->dev.driver) + return -ENXIO; + + { + guard(rwsem_read)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + + /* + * TODO auto-instantiate a region, for now assume this will + * find an auto-region. + */ + struct device *dev __free(put_device) = + device_find_child(&endpoint->dev, NULL, + first_mapped_decoder); + + if (!dev) { + dev_dbg(cxlmd->cxlds->dev, + "no region found for memdev %s\n", + dev_name(&cxlmd->dev)); + return -ENXIO; + } + + cxled = to_cxl_endpoint_decoder(dev); + cxlr = cxled->cxld.region; + + if (cxlr->params.state < CXL_CONFIG_COMMIT) { + dev_dbg(cxlmd->cxlds->dev, + "region %s not committed for memdev %s\n", + dev_name(&cxlr->dev), dev_name(&cxlmd->dev)); + return -ENXIO; + } + + if (cxlr->params.nr_targets > 1) { + dev_dbg(cxlmd->cxlds->dev, + "Only attach to local non-interleaved region\n"); + return -ENXIO; + } + + attach->region = (struct range) { + .start = cxlr->params.res->start, + .end = cxlr->params.res->end, + }; + + /* + * With endpoint locked leave the caller to safely work + * with the region range. 
+ */ + rc = attach->attach(attach->data); + if (rc) + return rc; + + /* Only teardown regions that pass validation, ignore the rest */ + rc = devm_add_action(&endpoint->dev, + cxl_endpoint_region_autoremove, cxlr); + if (rc) { + attach->detach(attach->data); + goto err_unregister; + } + + /* Link type2 driver callback for stopping use of the region range. */ + rc = devm_add_action_or_reset(&endpoint->dev, + attach->detach, attach->data); + if (rc) { + devm_remove_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + goto err_unregister; + } + + cxlr->detach = attach->detach; + cxlr->detach_data = attach->data; + + return 0; + } +err_unregister: + unregister_region(cxlr); + return rc; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_attach_region, "CXL"); + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 1297594beaec..32702da9edfd 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -455,6 +455,8 @@ struct cxl_region_params { * @hpa_range: Address range occupied by the region * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type + * @detach: accelerator detach callback for device-memory regions + * @detach_data: accelerator detach callback data * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge * @flags: Region state flags @@ -470,6 +472,8 @@ struct cxl_region { struct range hpa_range; enum cxl_partition_mode mode; enum cxl_decoder_type type; + void (*detach)(void *data); + void *detach_data; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; unsigned long flags; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 10a9b8fa2f6b..22d9435b351f 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -7,6 +7,7 @@ #include #include +#include #include /** @@ -153,6 +154,20 @@ struct cxl_memdev_attach { int 
(*probe)(struct cxl_memdev *cxlmd); }; +/** + * struct cxl_attach_region - accelerator region handling + * @attach: invoked at cxl_memdev_attach_region() with endpoint device locked. + * @detach: invoked at endpoint release. + * @data: pointer referencing accelerator data for attach and detach calls. + * @region: initialised with autodiscovered region values linked to memdev. + */ +struct cxl_attach_region { + int (*attach)(void *); + void (*detach)(void *); + void *data; + struct range region; +}; + /** * struct cxl_dev_state - The driver device state * @@ -231,4 +246,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); +struct cxl_region; +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); #endif /* __CXL_CXL_H__ */ -- Gitee From 8835092edb38c6f37e4fcab77765b9fd195843ab Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:27 +0100 Subject: [PATCH 07/42] NVIDIA: VR: SAUCE: cxl: Avoid dax creation for accelerators BugLink: https://bugs.launchpad.net/bugs/2153819 By definition a type2 cxl device will use the host managed memory for specific functionality, therefore it should not be available to other uses like DAX. Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (cherry picked from https://lore.kernel.org/r/20260423180528.17166-8-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit c8df664ce1c8567f4c89c58d7e074ca56d691379) --- drivers/cxl/core/region.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 8cf6f0c89e4d..f9e9e650253f 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -4221,6 +4221,13 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* + * HDM-D[B] (device-memory) regions have accelerator specific usage. + * Skip device-dax registration. + */ + if (cxlr->type == CXL_DECODER_DEVMEM) + return 0; + /* * From this point on any path that changes the region's state away from * CXL_CONFIG_COMMIT is also responsible for releasing the driver. -- Gitee From 811b706512697db2836e820aa95912a36dc08679 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:55 +0000 Subject: [PATCH 08/42] NVIDIA: VR: SAUCE: cxl/region: Skip decoder reset on detach for autodiscovered regions BugLink: https://bugs.launchpad.net/bugs/2153819 __cxl_decoder_detach() currently resets decoder programming whenever a region is detached if cxl_config_state is beyond CXL_CONFIG_ACTIVE. For autodiscovered regions, this can incorrectly tear down decoder state that may be relied upon by other consumers or by subsequent ownership decisions. Skip cxl_region_decode_reset() during detach when CXL_REGION_F_AUTO is set. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-4-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-4-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit ba18d87a25e07df2fb0909849cbd437cb75dfd8e) --- drivers/cxl/core/region.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index f9e9e650253f..6cd8e5a11e7c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2277,7 +2277,9 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled->part = -1; if (p->state > CXL_CONFIG_ACTIVE) { - cxl_region_decode_reset(cxlr, p->interleave_ways); + if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } -- Gitee From af173386a2fab8d4e500fc4d486942e94f820393 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:58 +0000 Subject: [PATCH 09/42] NVIDIA: VR: SAUCE: cxl/region: Add helper to check Soft Reserved containment by CXL regions BugLink: https://bugs.launchpad.net/bugs/2153819 Add a helper to determine whether a given Soft Reserved memory range is fully contained within the committed CXL region. This helper provides a primitive for policy decisions in subsequent patches such as co-ordination with dax_hmem to determine whether CXL has fully claimed ownership of Soft Reserved memory ranges. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-7-Smita.KoralahalliChannabasappa@amd.com (backported from https://lore.kernel.org/r/20260210064501.157591-7-Smita.KoralahalliChannabasappa@amd.com) [kobak: Added the Soft Reserved declaration to the existing Type2 include/cxl/cxl.h header instead of recreating that header.] Signed-off-by: Koba Ko Acked-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 11b8aaac68e515d53f89c2c356f86b3d2cb21d43) --- drivers/cxl/core/region.c | 7 +++++++ include/cxl/cxl.h | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 6cd8e5a11e7c..54ba2b898378 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -4059,6 +4060,12 @@ bool cxl_region_contains_resource(const struct resource *res) } EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem"); +bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return cxl_region_contains_resource(res); +} +EXPORT_SYMBOL_GPL(cxl_region_contains_soft_reserve); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 22d9435b351f..3fbd9eac137e 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -248,4 +248,13 @@ struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); struct cxl_region; int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); + +#ifdef CONFIG_CXL_REGION +bool cxl_region_contains_soft_reserve(struct resource *res); +#else +static inline bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return false; +} +#endif #endif /* __CXL_CXL_H__ */ -- Gitee From e66d7a593a5a64386b05afb2bb5852d0668cc46d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 28 Oct 2025 10:47:53 +0100 Subject: [PATCH 10/42] NVIDIA: VR: SAUCE: cxl/region: Support multi-level interleaving with smaller granularities for lower levels BugLink: https://bugs.launchpad.net/bugs/2153819 The CXL specification supports multi-level interleaving "as long as all the levels use different, but consecutive, HPA bits to 
select the target and no Interleave Set has more than 8 devices" (from 3.2). Currently the kernel expects that a decoder's "interleave granularity is a multiple of @parent_port granularity". That is, the granularity of a lower level is bigger than that of the parent and uses the outer HPA bits as selector. It works e.g. for the following 8-way config: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 256 granularity * Selector: HPA[8:9] * sub-link (CXL Host bridge config of the HDM): * 2-way * 1024 granularity * Selector: HPA[10] Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way config could look like this: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 512 granularity * Selector: HPA[9:10] * sub-link (CXL Host bridge config of the HDM): * 2-way * 256 granularity * Selector: HPA[8] The enumeration of decoders for this configuration then fails with the following error: cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200] cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff cxl_port endpoint12: failed to attach decoder12.0 to region0: -6 Note that this happens only if firmware is setting up the decoders (CXL_REGION_F_AUTO). For userspace region assembly the granularities are chosen to increase from root down to the lower levels. That is, outer HPA bits are always used for lower interleaving levels. Rework the implementation to also support multi-level interleaving with smaller granularities for lower levels. Determine the interleave set of autodetected decoders. Check that it is a subset of the root interleave. The HPA selector bits are extracted for all decoders of the set and checked to ensure that there is no overlap and that the bits are consecutive. All decoders can now be programmed to use any bit range within the region's target selector.
Signed-off-by: Robert Richter (backported from https://lore.kernel.org/all/20251028094754.72816-1-rrichter@amd.com/) [kobak: resolved conflicts with cxlr->cxlrd and spa_maps_hpa()] Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 8354f8e0fa3a272f4bc0f6f51ba59528fa8c0724) --- drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 93 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 54ba2b898378..1256661a4208 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1424,57 +1424,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) return 0; } +static inline u64 get_selector(u64 ways, u64 gran) +{ + if (!is_power_of_2(ways)) + ways /= 3; + + if (!is_power_of_2(ways) || !is_power_of_2(gran)) + return 0; + + return (ways - 1) * gran; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = cxlr->cxlrd; - int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_ep *ep = cxl_ep_load(port, cxlmd); struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; - struct cxl_switch_decoder *cxlsd; + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev); struct cxl_port *iter = port; - u16 eig, peig; - u8 eiw, peiw; + int ig, iw = cxl_rr->nr_targets, rc, pos = cxled->pos; + int distance, parent_distance; + u64 selector, cxlr_sel; + u16 eig; + u8 eiw; /* * While root level decoders support x3, x6, x12, switch level * decoders only support powers of 2 up to x16. 
*/ - if (!is_power_of_2(cxl_rr->nr_targets)) { + if (!is_power_of_2(iw)) { dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - cxl_rr->nr_targets); + dev_name(port->uport_dev), dev_name(&port->dev), iw); return -EINVAL; } - cxlsd = to_cxl_switch_decoder(&cxld->dev); - if (cxl_rr->nr_targets_set) { - int i, distance = 1; - struct cxl_region_ref *cxl_rr_iter; + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } - /* - * The "distance" between peer downstream ports represents which - * endpoint positions in the region interleave a given port can - * host. - * - * For example, at the root of a hierarchy the distance is - * always 1 as every index targets a different host-bridge. At - * each subsequent switch level those ports map every Nth region - * position where N is the width of the switch == distance. - */ - do { - cxl_rr_iter = cxl_rr_load(iter, cxlr); - distance *= cxl_rr_iter->nr_targets; - iter = to_cxl_port(iter->dev.parent); - } while (!is_cxl_root(iter)); - distance *= cxlrd->cxlsd.cxld.interleave_ways; + /* + * Calculate the effective granularity and ways to determine + * HPA bits used as target selectors of the interleave set. + * Use this to check if the root decoder and all subsequent + * HDM decoders only use bits from that range as selectors. + * + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. + */ + + /* Start with the root decoders selector and distance. 
*/ + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways, + cxlrd->cxlsd.cxld.interleave_granularity); + distance = cxlrd->cxlsd.cxld.interleave_ways; + if (!is_power_of_2(distance)) + distance /= 3; + + for (iter = parent_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr); + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder; + u64 cxld_sel; + + if (cxld_iter->interleave_ways == 1) + continue; + + cxld_sel = get_selector(cxld_iter->interleave_ways, + cxld_iter->interleave_granularity); + + if (cxld_sel & selector) { + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxld_sel, selector); + return -ENXIO; + } - for (i = 0; i < cxl_rr->nr_targets_set; i++) + selector |= cxld_sel; + distance *= cxl_rr_iter->nr_targets; + } + + parent_distance = distance; + distance *= iw; + + /* The combined selector bits must fit the region selector. */ + cxlr_sel = get_selector(p->interleave_ways, + p->interleave_granularity); + + if ((cxlr_sel & selector) != selector) { + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n", + dev_name(port->uport_dev), + dev_name(&port->dev), cxlr_sel, selector); + return -ENXIO; + } + + /* Calculate remaining selector bits available for use. */ + selector = cxlr_sel & ~selector; + + if (cxl_rr->nr_targets_set) { + for (int i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, distance); @@ -1485,87 +1547,40 @@ static int cxl_port_setup_targets(struct cxl_port *port, goto add_target; } - if (is_cxl_root(parent_port)) { + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + ig = cxld->interleave_granularity; + else /* + * Set the interleave granularity with each interleave + * level to a multiple of its parent port interleave + * ways.
Beginning with the granularity of the root + * decoder set to the region granularity (starting + * with the inner selector bits of the HPA), the + * granularity is increased with each level. Calculate + * this using the parent distance and region + * granularity. + * * Root decoder IG is always set to value in CFMWS which * may be different than this region's IG. We can use the * region's IG here since interleave_granularity_store() * does not allow interleaved host-bridges with * root IG != region IG. */ - parent_ig = p->interleave_granularity; - parent_iw = cxlrd->cxlsd.cxld.interleave_ways; - /* - * For purposes of address bit routing, use power-of-2 math for - * switch ports. - */ - if (!is_power_of_2(parent_iw)) - parent_iw /= 3; - } else { - struct cxl_region_ref *parent_rr; - struct cxl_decoder *parent_cxld; - - parent_rr = cxl_rr_load(parent_port, cxlr); - parent_cxld = parent_rr->decoder; - parent_ig = parent_cxld->interleave_granularity; - parent_iw = parent_cxld->interleave_ways; - } - - rc = granularity_to_eig(parent_ig, &peig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_ig); - return rc; - } - - rc = ways_to_eiw(parent_iw, &peiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_iw); - return rc; - } + ig = p->interleave_granularity * parent_distance; - iw = cxl_rr->nr_targets; rc = ways_to_eiw(iw, &eiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), iw); - return rc; - } - - /* - * Interleave granularity is a multiple of @parent_port granularity. - * Multiplier is the parent port interleave ways. 
- */ - rc = granularity_to_eig(parent_ig * parent_iw, &eig); - if (rc) { - dev_dbg(&cxlr->dev, - "%s: invalid granularity calculation (%d * %d)\n", - dev_name(&parent_port->dev), parent_ig, parent_iw); - return rc; - } - - rc = eig_to_granularity(eig, &ig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - 256 << eig); - return rc; - } + if (!rc) + rc = granularity_to_eig(ig, &eig); - if (iw > 8 || iw > cxlsd->nr_targets) { - dev_dbg(&cxlr->dev, - "%s:%s:%s: ways: %d overflows targets: %d\n", + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n", dev_name(port->uport_dev), dev_name(&port->dev), - dev_name(&cxld->dev), iw, cxlsd->nr_targets); + iw, ig, selector); return -ENXIO; } if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - (iw > 1 && cxld->interleave_granularity != ig) || !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, -- Gitee From a965477ba47ea865a11e1a6a3816320150682155 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:16 +0000 Subject: [PATCH 11/42] NVIDIA: VR: SAUCE: cxl: Move HDM decoder and register map definitions to include/cxl/cxl.h BugLink: https://bugs.launchpad.net/bugs/2153819 BugLink: https://bugs.launchpad.net/bugs/2143032 Move CXL HDM decoder register defines, register map structs (cxl_reg_map, cxl_component_reg_map, cxl_device_reg_map, cxl_pmu_reg_map, cxl_register_map), cxl_hdm_decoder_count(), enum cxl_regloc_type, and cxl_find_regblock()/cxl_setup_regs() declarations from internal CXL headers to include/cxl/pci.h. This makes them accessible to code outside the CXL subsystem, in particular the PCI core CXL state save/restore support added in a subsequent patch. No functional change. 
Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit b5e166cae47a8356338c607c99d98007b83d3324 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Also move CXL_CM_CAP_CAP_ID_RAS, CXL_CM_CAP_CAP_ID_HDM, and CXL_CM_CAP_CAP_HDM_VERSION into public include/cxl/cxl.h to keep the public CXL header layout consistent.] Signed-off-by: Koba Ko Acked-by: Matt Ochs --- drivers/cxl/cxl.h | 83 ++--------------------------------------------- include/cxl/cxl.h | 58 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 81 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 32702da9edfd..c7bffa399581 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -24,72 +24,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; * (port-driver, region-driver, nvdimm object-drivers... etc). 
*/ -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -#define CXL_CM_CAP_CAP_ID_RAS 0x2 -#define CXL_CM_CAP_CAP_ID_HDM 0x5 -#define CXL_CM_CAP_CAP_HDM_VERSION 1 - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) 
-#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - -/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ -#define CXL_DECODER_MIN_GRANULARITY 256 -#define CXL_DECODER_MAX_ENCODED_IG 6 - -static inline int cxl_hdm_decoder_count(u32 cap_hdr) -{ - int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - - switch (val) { - case 0: - return 1; - case 1 ... 8: - return val * 2; - case 9 ... 12: - return (val - 4) * 4; - default: - return -ENXIO; - } -} - /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ static inline int eig_to_granularity(u16 eig, unsigned int *granularity) { @@ -223,13 +157,9 @@ int cxl_map_device_regs(const struct cxl_register_map *map, int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); #define CXL_INSTANCES_COUNT -1 -enum cxl_regloc_type; int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); -int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); -int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); @@ -250,7 +180,6 @@ int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_DECODER_F_LOCK BIT(4) #define CXL_DECODER_F_ENABLE BIT(5) #define CXL_DECODER_F_NORMALIZED_ADDRESSING BIT(6) -#define CXL_DECODER_F_RESET_MASK (CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK) enum cxl_decoder_type { CXL_DECODER_DEVMEM = 2, @@ -296,14 +225,12 @@ struct cxl_decoder { }; /* - * Track whether this decoder is free for userspace provisioning, reserved for - * region autodiscovery, whether it is started connecting (awaiting other - * peers), or has completed auto assembly. 
+ * Track whether this decoder is reserved for region autodiscovery, or + * free for userspace provisioning. */ enum cxl_decoder_state { CXL_DECODER_STATE_MANUAL, CXL_DECODER_STATE_AUTO, - CXL_DECODER_STATE_AUTO_STAGED, }; /** @@ -727,7 +654,6 @@ DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev)) DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) -DEFINE_FREE(put_cxl_dax_region, struct cxl_dax_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); void cxl_bus_rescan(void); @@ -859,7 +785,6 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_endpoint_decoder *cxled); struct cxl_dax_region *to_cxl_dax_region(struct device *dev); u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa); -bool cxl_region_contains_resource(const struct resource *res); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -882,10 +807,6 @@ static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, { return 0; } -static inline bool cxl_region_contains_resource(const struct resource *res) -{ - return false; -} #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 3fbd9eac137e..1c496c1e846c 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -5,6 +5,7 @@ #ifndef __CXL_CXL_H__ #define __CXL_CXL_H__ +#include #include #include #include @@ -71,6 +72,63 @@ struct cxl_regs { ); }; +#define CXL_CM_CAP_CAP_ID_RAS 0x2 +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + 
+/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) + +/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ +#define 
CXL_DECODER_MIN_GRANULARITY 256 +#define CXL_DECODER_MAX_ENCODED_IG 6 + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return !val ? 1 : val <= 8 ? val * 2 : val <= 12 ? (val - 4) * 4 : -ENXIO; +} + struct cxl_reg_map { bool valid; int id; -- Gitee From 972b867960b941b4b62de31eb7aecf2600eb0b6b Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:00 +0530 Subject: [PATCH 12/42] NVIDIA: VR: SAUCE: cxl: Move component/HDM register defines to uapi/cxl/cxl_regs.h BugLink: https://bugs.launchpad.net/bugs/2152222 VFIO and other code outside the CXL core needs the same offset/mask constants the core uses for the component register block and HDM decoders. Pull them into a new include/uapi/cxl/cxl_regs.h (GPL-2.0 WITH Linux-syscall-note) and include it from include/cxl/cxl.h. Use uapi-friendly __GENMASK helpers for masks and _BITUL() for single-bit flags because UAPI headers cannot depend on kernel-internal BIT(). Section comments in the new file reference CXL spec r4.0 numbering. For UAPI change, replaced the SZ_64K with actual size as the macro will not be available for userspace programs. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 52ead24ed8ade6b664e65fbc5514147a6022263e from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Remove defines from include/cxl/cxl.h instead of drivers/cxl/cxl.h as they were already moved there by Srirangan's SAUCE commit, Add #include needed by __GENMASK() in uapi header] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- include/cxl/cxl.h | 43 +-------------------------- include/uapi/cxl/cxl_regs.h | 58 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 42 deletions(-) create mode 100644 include/uapi/cxl/cxl_regs.h diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 1c496c1e846c..cf7f37e67644 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * enum cxl_devtype - delineate type-2 from a generic type-3 device @@ -76,48 +77,6 @@ struct cxl_regs { #define CXL_CM_CAP_CAP_ID_HDM 0x5 #define CXL_CM_CAP_CAP_HDM_VERSION 1 -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) 
(0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - /* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ #define CXL_DECODER_MIN_GRANULARITY 256 #define CXL_DECODER_MAX_ENCODED_IG 6 diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h new file mode 100644 index 000000000000..c821ef7ec2bb --- /dev/null +++ b/include/uapi/cxl/cxl_regs.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * CXL Standard defines + * + * Hardware register offsets and bit-field masks for the CXL Component + * Register block, as defined by the CXL Specification r4.0. 
+ */ + +#ifndef _UAPI_CXL_REGS_H_ +#define _UAPI_CXL_REGS_H_ + +#include /* __BITS_PER_LONG; needed by __GENMASK() */ +#include /* _BITUL(), _BITULL() */ +#include /* __GENMASK() */ + +/* CXL 4.0 8.2.3 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE 0x00010000 + +/* CXL 4.0 8.2.4 CXL.cache and CXL.mem Registers*/ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK __GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK __GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK __GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK __GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK __GENMASK(31, 20) + +/* CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE _BITUL(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK _BITUL(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT _BITUL(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED _BITUL(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR _BITUL(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY _BITUL(12) +#define 
CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +#endif /* _UAPI_CXL_REGS_H_ */ -- Gitee From ff26896ef5019bdf6992d2c4c18d6ba4e0d09fe7 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:08:59 +0530 Subject: [PATCH 13/42] NVIDIA: VR: SAUCE: cxl: Declare cxl_probe_component_regs in public header BugLink: https://bugs.launchpad.net/bugs/2152222 vfio-cxl lives outside drivers/cxl/ but still needs to locate the component register block and fill cxl_component_reg_map. BOS already has cxl_find_regblock() in include/cxl/pci.h, but cxl_probe_component_regs() was still private to drivers/cxl/cxl.h. Declare cxl_probe_component_regs() in include/cxl/pci.h next to the existing register-block helpers so VFIO CXL can use the parsed component register map. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit e02c1b7ac02a7 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Move cxl_probe_component_regs() to include/cxl/pci.h instead of include/cxl/cxl.h to align with existing Srirangan/Alejandro convention; skip cxl_find_regblock() move as it is already in include/cxl/pci.h; add struct cxl_component_reg_map forward declaration] [kobak: Kept the target's private drivers/cxl/cxl.h declarations while adding the public include/cxl/pci.h header expected by VFIO CXL.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- include/cxl/cxl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index cf7f37e67644..ce0cec6c76a1 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -274,4 +274,8 @@ static inline bool cxl_region_contains_soft_reserve(struct resource *res) return false; } #endif +struct cxl_component_reg_map; +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map); + #endif /* __CXL_CXL_H__ */ -- Gitee From 8642569045a402d8610694cde2c1ebeb39728ae0 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:02 +0530 Subject: [PATCH 14/42] NVIDIA: VR: SAUCE: cxl: Record BIR and BAR offset in cxl_register_map BugLink: https://bugs.launchpad.net/bugs/2152222 The Register Locator DVSEC (CXL 4.0 8.1.9) describes register blocks by BAR index (BIR) and offset within the BAR. CXL core currently only stores the resolved HPA (resource + offset) in struct cxl_register_map, so callers that need to use pci_iomap() or report the BAR to userspace must reverse-engineer the BAR from the HPA. Add bar_index and bar_offset to struct cxl_register_map and fill them in cxl_decode_regblock() when the regblock is BAR-backed (BIR 0-5). Add cxl_regblock_get_bar_info() so callers (e.g. vfio-cxl) can get BAR index and offset directly and use pci_iomap() instead of ioremap(HPA). Return -EINVAL if the map is not BAR-backed. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 947749bd1b8d0308311553dbb7ed3db38be55907 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Add cxl_regblock_get_bar_info() declaration to include/cxl/pci.h unconditionally instead of include/cxl/cxl.h with CONFIG_CXL_BUS guards, consistent with existing convention, Add BIR range validation (reject BIR >= PCI_STD_NUM_BARS) plus a bar_index bounds check in cxl_regblock_get_bar_info()] [kobak: Added the target-local private drivers/cxl/cxl.h cxl_regblock_get_bar_info() prototype; struct cxl_register_map carries bar_index/bar_offset in include/cxl/cxl.h.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/cxl/core/regs.c | 49 +++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 ++ include/cxl/cxl.h | 12 ++++++++++ 3 files changed, 63 insertions(+) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 93710cf4f0a6..e59dea079216 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -276,6 +276,19 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, u64 offset = ((u64)reg_hi << 32) | (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); + /* + * The BIR field is 3 bits wide (CXL spec); values 6 and 7 are + * reserved. PCI only defines BAR 0-5, and pci_resource_*() on a + * higher index reads past the resource array. Reject those here + * so callers do not get garbage. 
+ */ + if (bar >= PCI_STD_NUM_BARS) { + dev_warn(&pdev->dev, + "Reserved BIR %d in Register Locator entry (type %d)\n", + bar, reg_type); + return false; + } + if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, @@ -286,9 +299,44 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; + map->bar_index = bar; + map->bar_offset = offset; return true; } +/** + * cxl_regblock_get_bar_info() - Get BAR index and offset for a BAR-backed + * regblock + * @map: Register map from cxl_find_regblock() or cxl_find_regblock_instance() + * @bar_index: Output BAR index (0-5). Optional, may be NULL. + * @bar_offset: Output offset within the BAR. Optional, may be NULL. + * + * When the register block was found via the Register Locator DVSEC and + * lives in a PCI BAR (BIR 0-5), this returns the BAR index and the offset + * within that BAR. + * + * Return: 0 if the regblock is BAR-backed (bar_index <= 5), -EINVAL otherwise. + */ +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset) +{ + if (!map || map->bar_index == 0xff) + return -EINVAL; + /* + * Guard callers against stale or out-of-range bar_index. Only BAR + * indices 0..5 are valid PCI BARs; anything else means the map was + * not BAR-backed or was filled from a reserved BIR. + */ + if (map->bar_index >= PCI_STD_NUM_BARS) + return -EINVAL; + if (bar_index) + *bar_index = map->bar_index; + if (bar_offset) + *bar_offset = map->bar_offset; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_regblock_get_bar_info, "CXL"); + /* * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index * Use CXL_INSTANCES_COUNT for @index if counting instances. 
@@ -307,6 +355,7 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty *map = (struct cxl_register_map) { .host = &pdev->dev, + .bar_index = 0xFF, .resource = CXL_RESOURCE_NONE, }; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c7bffa399581..d95bfdd8aee1 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -160,6 +160,8 @@ int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, + u8 *bar_index, resource_size_t *bar_offset); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index ce0cec6c76a1..fbeccabe94b3 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -117,9 +117,16 @@ struct cxl_pmu_reg_map { * @resource: physical resource base of the register block * @max_size: maximum mapping size to perform register search * @reg_type: see enum cxl_regloc_type + * @bar_index: PCI BAR index (0-5) when regblock is BAR-backed; 0xFF otherwise + * @bar_offset: offset within the BAR; only valid when bar_index <= 5 * @component_map: cxl_reg_map for component registers * @device_map: cxl_reg_maps for device registers * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + * + * When the register block is described by the Register Locator DVSEC with + * a BAR Indicator (BIR 0-5), bar_index and bar_offset are set so callers can + * use pci_iomap(pdev, bar_index, size) and base + bar_offset instead of + * ioremap(resource). 
*/ struct cxl_register_map { struct device *host; @@ -127,6 +134,8 @@ struct cxl_register_map { resource_size_t resource; resource_size_t max_size; u8 reg_type; + u8 bar_index; + resource_size_t bar_offset; union { struct cxl_component_reg_map component_map; struct cxl_device_reg_map device_map; @@ -278,4 +287,7 @@ struct cxl_component_reg_map; void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset); + #endif /* __CXL_CXL_H__ */ -- Gitee From ce7d4a1d4c0875c8f602706fb16d0a913f569600 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:08:58 +0530 Subject: [PATCH 15/42] NVIDIA: VR: SAUCE: cxl: Add cxl_get_hdm_info() for HDM decoder metadata BugLink: https://bugs.launchpad.net/bugs/2152222 cxl_probe_component_regs() finds the HDM decoder block during device probe and caches its location, but does not record the decoder count and does not expose the result outside drivers/cxl/. vfio-cxl needs the decoder count and the byte offset and size of the HDM block without re-running the probe sequence. Record decoder_cnt in rmap->count when parsing the HDM capability in cxl_probe_component_regs(), extend struct cxl_reg_map with a count member, and add cxl_get_hdm_info() to return offset, size, and count from the cached map. Export under the CXL namespace; stub to -EOPNOTSUPP when CONFIG_CXL_BUS is off. 
Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit fd317b86093e9 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [kobak: Added the target-local private drivers/cxl/cxl.h cxl_get_hdm_info() prototype because drivers/cxl/core/pci.c includes the private CXL header in addition to the public include/cxl/cxl.h declaration.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/cxl/core/pci.c | 29 +++++++++++++++++++++++++++++ drivers/cxl/core/regs.c | 1 + drivers/cxl/cxl.h | 2 ++ include/cxl/cxl.h | 15 +++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index d1f487b3d809..2490ea74bfa3 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -448,6 +448,35 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL"); +/** + * cxl_get_hdm_info - Get HDM decoder register block location and count + * @cxlds: CXL device state (must have component regs enumerated via + * cxl_probe_component_regs()) + * @count: number of HDM decoders in the block (from HDM Capability bits [3:0]) + * @offset: byte offset of HDM decoder block within the component register BAR + * @size: size in bytes of the HDM decoder block + * + * Return: 0 on success. -ENODEV if the HDM decoder block is not present. 
+ */ +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ + struct cxl_reg_map *hdm = &cxlds->reg_map.component_map.hdm_decoder; + + if (WARN_ON(!count || !offset || !size)) + return -EINVAL; + + if (!hdm->valid) + return -ENODEV; + + *count = hdm->count; + *offset = hdm->offset; + *size = hdm->size; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hdm_info, "CXL"); + #define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff #define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 #define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index e59dea079216..c73a05742be0 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -84,6 +84,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, decoder_cnt = cxl_hdm_decoder_count(hdr); length = 0x20 * decoder_cnt + 0x10; rmap = &map->hdm_decoder; + rmap->count = decoder_cnt; break; } case CXL_CM_CAP_CAP_ID_RAS: diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index d95bfdd8aee1..605af66d50dd 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -724,6 +724,8 @@ int cxl_port_setup_regs(struct cxl_port *port, resource_size_t component_reg_phys); struct cxl_dev_state; +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, struct cxl_endpoint_dvsec_info *info); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index fbeccabe94b3..ddefc43561fd 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -93,6 +93,7 @@ struct cxl_reg_map { int id; unsigned long offset; unsigned long size; + u8 count; }; struct cxl_component_reg_map { @@ -290,4 +291,18 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, resource_size_t *bar_offset); +#ifdef CONFIG_CXL_BUS + +int cxl_get_hdm_info(struct 
cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); + +#else + +static inline +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ return -EOPNOTSUPP; } + +#endif /* CONFIG_CXL_BUS */ + #endif /* __CXL_CXL_H__ */ -- Gitee From 17d04599dbd347b1da944a37d3114200ec5d353e Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:01 +0530 Subject: [PATCH 16/42] NVIDIA: VR: SAUCE: cxl: Split cxl_await_range_active() from media-ready wait BugLink: https://bugs.launchpad.net/bugs/2152222 Before accessing CXL device memory after reset/power-on, the driver must ensure media is ready. Not every CXL device implements the CXL Memory Device register group (many Type-2 devices do not). cxl_await_media_ready() reads cxlds->regs.memdev. Access to the memory device registers on a Type-2 device may result in kernel panic. Split the HDM DVSEC range-active poll out of cxl_await_media_ready() into a new function, cxl_await_range_active(). Type-2 devices often lack the CXLMDEV status register, so they need the range check without the memdev read. cxl_await_media_ready() now calls cxl_await_range_active() for the DVSEC poll, then reads the memory device status as before. Co-developed-by: Zhi Wang Reviewed-by: Dave Jiang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 023bae337329a533c86481d829ea1bbc9ea7aa21 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Add cxl_await_range_active() declaration to include/cxl/pci.h unconditionally instead of include/cxl/cxl.h with CONFIG_CXL_BUS guards, consistent with existing convention] [kobak: Folded the private drivers/cxl/cxl.h cxl_await_range_active() prototype into this helper commit because drivers/cxl/core/pci.c includes the private CXL header.] 
Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/cxl/core/pci.c | 35 ++++++++++++++++++++++++++++++----- drivers/cxl/cxl.h | 1 + include/cxl/cxl.h | 9 +++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 2490ea74bfa3..9493bcdbf34a 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -141,16 +141,24 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) return 0; } -/* - * Wait up to @media_ready_timeout for the device to report memory - * active. +/** + * cxl_await_range_active - Wait for all HDM DVSEC memory ranges to be active + * @cxlds: CXL device state (DVSEC and HDM count must be valid) + * + * For each HDM decoder range reported in the CXL DVSEC capability, waits for + * the range to report MEM INFO VALID (up to 1s per range), then MEM ACTIVE + * (up to media_ready_timeout seconds per range, default 60s). Used by + * cxl_await_media_ready() and by callers that only need range readiness + * without checking the memory device status register. + * + * Return: 0 if all ranges become valid and active, -ETIMEDOUT if a timeout + * occurs, or a negative errno from config read on failure. */ -int cxl_await_media_ready(struct cxl_dev_state *cxlds) +int cxl_await_range_active(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); int d = cxlds->cxl_dvsec; int rc, i, hdm_count; - u64 md_status; u16 cap; rc = pci_read_config_word(pdev, @@ -171,6 +179,23 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) return rc; } + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_range_active, "CXL"); + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. 
+ */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + u64 md_status; + int rc; + + rc = cxl_await_range_active(cxlds); + if (rc) + return rc; + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); if (!CXLMDEV_READY(md_status)) return -EIO; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 605af66d50dd..535786860049 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -724,6 +724,7 @@ int cxl_port_setup_regs(struct cxl_port *port, resource_size_t component_reg_phys); struct cxl_dev_state; +int cxl_await_range_active(struct cxl_dev_state *cxlds); int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, resource_size_t *offset, resource_size_t *size); int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index ddefc43561fd..87fb52e9c47f 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -305,4 +305,13 @@ int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, #endif /* CONFIG_CXL_BUS */ +/* f951acc: split from media-ready wait */ +struct cxl_dev_state; +int cxl_await_range_active(struct cxl_dev_state *cxlds); + +/* a6a063d: exported reset helpers for VFIO */ +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); + #endif /* __CXL_CXL_H__ */ -- Gitee From 68e74e52d5918cc3d6ac8f0290b46cd11ae7b7d6 Mon Sep 17 00:00:00 2001 From: Vishal Aslot Date: Tue, 14 Oct 2025 19:40:06 -0700 Subject: [PATCH 17/42] NVIDIA: VR: SAUCE: cxl: Allow zero sized HDM decoders BugLink: https://bugs.launchpad.net/bugs/2138266 CXL spec permits committing zero sized decoders. Linux currently considers them as an error. Zero-sized decoders are helpful when the BIOS is committing them. Often BIOS will also lock them to prevent them being changed due to the TSP requirement. For example, if the type 3 device is part of a TCB. 
The host bridge, switch, and end-point decoders can all be committed with zero-size. If they are locked along the VH, it is often to prevent hotplugging of a new device that could not be attested post boot and cannot be included in TCB. The caller leaves the decoder allocated but does not add it. It simply continues to the next decoder. Signed-off-by: Vishal Aslot (backported from https://lore.kernel.org/all/20251015024019.1189713-1-vaslot@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg (cherry picked from commit aa8224a0ffff58f91499084437badac56740dcc7 noble:linux-nvidia-6.17) Signed-off-by: Jacob Martin --- drivers/cxl/core/hdm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 0c80b76a5f9b..5a2c83705a8d 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -1031,13 +1031,14 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, return -ENXIO; } + port->commit_end = cxld->id; + if (size == 0) { - dev_warn(&port->dev, + dev_dbg(&port->dev, "decoder%d.%d: Committed with zero size\n", port->id, cxld->id); - return -ENXIO; + return -ENOSPC; } - port->commit_end = cxld->id; } else { if (cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -1193,6 +1194,8 @@ static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, rc = init_hdm_decoder(port, cxld, hdm, i, &dpa_base, info); if (rc) { + if (rc == -ENOSPC) + continue; dev_warn(&port->dev, "Failed to initialize decoder%d.%d\n", port->id, i); -- Gitee From a05062dad34cc0912c8bbc9c15b97132fbf7768e Mon Sep 17 00:00:00 2001 From: Koba Ko Date: Tue, 25 Nov 2025 13:07:35 +0000 Subject: [PATCH 18/42] NVIDIA: VR: SAUCE: cxl/region: Validate partition index before array access BugLink: https://bugs.launchpad.net/bugs/2138266 Check partition index bounds before accessing 
cxlds->part[] to prevent out-of-bounds when part is -1 or invalid. Fixes: 5ec67596e368 ("cxl/region: Drop goto pattern of construct_region()") Signed-off-by: Koba Ko Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg (cherry picked from commit d769d573d8adfcaa6c588b7f079b05962716316a noble:linux-nvidia-6.17) Signed-off-by: Jacob Martin --- drivers/cxl/core/region.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 1256661a4208..48c7cb61944c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3771,6 +3771,14 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; + if (part < 0 || part >= cxlds->nr_partitions) { + dev_err(cxlmd->dev.parent, + "%s:%s: invalid partition index %d (max %u)\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + part, cxlds->nr_partitions); + return ERR_PTR(-ENXIO); + } + do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, atomic_read(&cxlrd->region_id), -- Gitee From 71343c3fb23d87e9073ae0485551fc181444246f Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:18 +0000 Subject: [PATCH 19/42] NVIDIA: VR: SAUCE: cxl: Add memory offlining and cache flush helpers BugLink: https://bugs.launchpad.net/bugs/2153819 BugLink: https://bugs.launchpad.net/bugs/2143032 Add infrastructure for quiescing the CXL data path before reset: - Memory offlining: check if CXL-backed memory is online and offline it via offline_and_remove_memory() before reset, per CXL spec requirement to quiesce all CXL.mem transactions before issuing CXL Reset. - CPU cache flush: invalidate cache lines before reset as a safety measure after memory offline.
Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 98bfbf9c3f88013ffbff4b08a1da0043606d0269 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Use a real System RAM walker callback so resource walks never invoke a NULL function pointer.] Signed-off-by: Koba Ko Acked-by: Matt Ochs --- drivers/cxl/core/pci.c | 120 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 9493bcdbf34a..49b1baee8c06 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -980,3 +982,121 @@ int cxl_port_get_possible_dports(struct cxl_port *port) return ctx.count; } + +/* + * CXL Reset support - core-provided reset logic for CXL devices. + * + * These functions implement the CXL reset sequence. + */ + +/* + * If CXL memory backed by this decoder is online as System RAM, offline + * and remove it per CXL spec requirements before issuing CXL Reset. + * Returns 0 if memory was not online or was successfully offlined. 
+ */ +static int cxl_is_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static int __maybe_unused cxl_offline_memory(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct cxl_region_params *p; + int rc; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr) + return 0; + + p = &cxlr->params; + if (!p->res) + return 0; + + if (walk_iomem_res_desc(IORES_DESC_NONE, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + p->res->start, p->res->end, NULL, + cxl_is_system_ram) <= 0) + return 0; + + dev_info(dev, "Offlining CXL memory [%pr] for reset\n", p->res); + +#ifdef CONFIG_MEMORY_HOTREMOVE + rc = offline_and_remove_memory(p->res->start, resource_size(p->res)); + if (rc) { + dev_err(dev, + "Failed to offline CXL memory [%pr]: %d\n", + p->res, rc); + return rc; + } +#else + dev_err(dev, "Memory hotremove not supported, cannot offline CXL memory\n"); + rc = -EOPNOTSUPP; + return rc; +#endif + + return 0; +} + +static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + struct device *dev; + + if (!cxlmd || !cxlmd->cxlds) + return -ENODEV; + + dev = cxlmd->cxlds->dev; + endpoint = cxlmd->endpoint; + if (!endpoint) + return 0; + + return device_for_each_child(&endpoint->dev, NULL, + cxl_offline_memory); +} + +static int __maybe_unused cxl_decoder_flush_cache(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct resource *res; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr || !cxlr->params.res) + return 0; + + res = cxlr->params.res; + cpu_cache_invalidate_memregion(res->start, resource_size(res)); + return 0; +} + +static int __maybe_unused 
cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + + if (!cxlmd) + return 0; + + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return 0; + + if (!cpu_cache_has_invalidate_memregion()) + return 0; + + device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); + return 0; +} -- Gitee From 969f62e21feab528869b090d25f968a4ed63047e Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:19 +0000 Subject: [PATCH 20/42] NVIDIA: VR: SAUCE: cxl: Add multi-function sibling coordination for CXL reset BugLink: https://bugs.launchpad.net/bugs/2153819 BugLink: https://bugs.launchpad.net/bugs/2143032 Add sibling PCI function save/disable/restore coordination for CXL reset. Before reset, all CXL.cachemem sibling functions are locked, saved, and disabled; after reset they are restored. The Non-CXL Function Map DVSEC and per-function DVSEC capability register are consulted to skip non-CXL and CXL.io-only functions. A global mutex serializes concurrent resets to prevent deadlocks between sibling functions. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 9a08c0246be53f047ed4128455f708b7a4350261 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Propagate sibling collection allocation failures after pci_walk_bus() so reset aborts instead of proceeding with a partial sibling list.] 
Signed-off-by: Koba Ko Acked-by: Matt Ochs --- drivers/cxl/core/pci.c | 156 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 49b1baee8c06..9f8b335dfeb3 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -15,6 +15,9 @@ #include "core.h" #include "trace.h" +/* Initial sibling array capacity: covers max non-ARI functions per slot */ +#define CXL_RESET_SIBLINGS_INIT 8 + /** * DOC: cxl core pci * @@ -1100,3 +1103,156 @@ static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); return 0; } + +/* + * Serialize all CXL reset operations globally. + */ +static DEFINE_MUTEX(cxl_reset_mutex); + +struct cxl_reset_context { + struct pci_dev *target; + struct pci_dev **pci_functions; + int pci_func_count; + int pci_func_cap; +}; + +/* + * Check if a sibling function is non-CXL using the Non-CXL Function Map + * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including + * on any read failure). + */ +static bool cxl_is_non_cxl_function(struct pci_dev *pdev, + u16 func_map_dvsec, int fn) +{ + int reg, bit; + u32 map; + + if (pci_ari_enabled(pdev->bus)) { + reg = fn / 32; + bit = fn % 32; + } else { + reg = 0; + bit = fn; + } + + if (pci_read_config_dword(pdev, + func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4), + &map)) + return false; + + return map & BIT(bit); +} + +struct cxl_reset_walk_ctx { + struct cxl_reset_context *ctx; + u16 func_map_dvsec; + int error; + bool ari; +}; + +static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) +{ + struct cxl_reset_walk_ctx *wctx = data; + struct cxl_reset_context *ctx = wctx->ctx; + struct pci_dev *pdev = ctx->target; + u16 dvsec, cap; + int fn; + + if (func == pdev) + return 0; + + if (!wctx->ari && + PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn)) + return 0; + + fn = wctx->ari ? 
func->devfn : PCI_FUNC(func->devfn); + if (wctx->func_map_dvsec && + cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn)) + return 0; + + /* Only coordinate with siblings that have CXL.cachemem */ + dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return 0; + if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return 0; + if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE | + PCI_DVSEC_CXL_MEM_CAPABLE))) + return 0; + + /* Grow sibling array; double capacity for ARI devices when running out of space */ + if (ctx->pci_func_count >= ctx->pci_func_cap) { + struct pci_dev **new; + int new_cap = ctx->pci_func_cap ? ctx->pci_func_cap * 2 + : CXL_RESET_SIBLINGS_INIT; + + new = krealloc(ctx->pci_functions, + new_cap * sizeof(*new), GFP_KERNEL); + if (!new) { + wctx->error = -ENOMEM; + return 1; + } + ctx->pci_functions = new; + ctx->pci_func_cap = new_cap; + } + + pci_dev_get(func); + ctx->pci_functions[ctx->pci_func_count++] = func; + return 0; +} + +static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) + pci_dev_put(ctx->pci_functions[i]); + kfree(ctx->pci_functions); + ctx->pci_functions = NULL; + ctx->pci_func_count = 0; + ctx->pci_func_cap = 0; +} + +static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +{ + struct pci_dev *pdev = ctx->target; + struct cxl_reset_walk_ctx wctx; + int i; + + ctx->pci_func_count = 0; + ctx->pci_functions = NULL; + ctx->pci_func_cap = 0; + + wctx.ctx = ctx; + wctx.ari = pci_ari_enabled(pdev->bus); + wctx.error = 0; + wctx.func_map_dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP); + + /* Collect CXL.cachemem siblings under pci_bus_sem */ + pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx); + if (wctx.error) { + cxl_pci_functions_reset_release(ctx); + return wctx.error; + } + + /* Lock and save/disable 
siblings outside pci_bus_sem */ + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_lock(ctx->pci_functions[i]); + pci_dev_save_and_disable(ctx->pci_functions[i]); + } + + return 0; +} + +static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_restore(ctx->pci_functions[i]); + pci_dev_unlock(ctx->pci_functions[i]); + } + cxl_pci_functions_reset_release(ctx); +} -- Gitee From 33ae6191799006e7c89a017f475d49e8c5b92e8b Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:20 +0000 Subject: [PATCH 21/42] NVIDIA: VR: SAUCE: cxl: Add CXL DVSEC reset sequence and flow orchestration BugLink: https://bugs.launchpad.net/bugs/2153819 BugLink: https://bugs.launchpad.net/bugs/2143032 cxl_dev_reset() implements the hardware reset sequence: optionally enable memory clear, initiate reset via CTRL2, wait for completion, and re-enable caching. cxl_do_reset() orchestrates the full reset flow: 1. CXL pre-reset: mem offlining and cache flush (when memdev present) 2. PCI save/disable: pci_dev_save_and_disable() automatically saves CXL DVSEC and HDM decoder state via PCI core hooks 3. Sibling coordination: save/disable CXL.cachemem sibling functions 4. Execute CXL DVSEC reset 5. Sibling restore: always runs to re-enable sibling functions 6. PCI restore: pci_dev_restore() automatically restores CXL state The CXL-specific DVSEC and HDM save/restore is handled by the PCI core's CXL save/restore infrastructure (drivers/pci/cxl.c). Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. 
Ochs Signed-off-by: Brad Figg (backported from commit 92fb80732a4ea34b76cbe51b15e95ff04f49cb89 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Treat error-valued cxlmd->endpoint as no endpoint to avoid dereferencing ERR_PTR before endpoint attach.] [koba: Check sibling collection failure before starting the CXL reset so allocation failure restores the target and aborts.] [koba: Limit the memdev device lock to endpoint-dependent memory preparation and cache flush, restore memory quiesce before PCI disable, and track sibling reset preparation so reset_done cleanup only runs after successful sibling prepare.] [koba: Guard reset_done() against NULL/ERR_PTR memdev endpoints before decoder reset detection.] Signed-off-by: Koba Ko Acked-by: Matt Ochs --- drivers/cxl/core/pci.c | 196 ++++++++++++++++++++++++++++++++++++++++- drivers/cxl/cxl.h | 1 + drivers/cxl/pci.c | 10 +-- 3 files changed, 197 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 9f8b335dfeb3..50cfe768af0b 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1058,7 +1058,7 @@ static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd) dev = cxlmd->cxlds->dev; endpoint = cxlmd->endpoint; - if (!endpoint) + if (!endpoint || IS_ERR(endpoint)) return 0; return device_for_each_child(&endpoint->dev, NULL, @@ -1202,7 +1202,7 @@ static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) return 0; } -static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) +static void cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) { int i; @@ -1214,7 +1214,7 @@ static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_cont ctx->pci_func_cap = 0; } -static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +static int cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) { struct pci_dev *pdev = ctx->target; struct cxl_reset_walk_ctx 
wctx; @@ -1246,7 +1246,7 @@ static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_conte return 0; } -static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) { int i; @@ -1256,3 +1256,191 @@ static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context } cxl_pci_functions_reset_release(ctx); } + +/* + * CXL device reset execution + */ +static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) +{ + static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; + u16 cap, ctrl2, status2; + u32 timeout_ms; + int rc, idx; + + if (!pci_wait_for_pending_transaction(pdev)) + pci_err(pdev, "timed out waiting for pending transactions\n"); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap); + if (rc) + return rc; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + /* + * Disable caching and initiate cache writeback+invalidation if the + * device supports it. Poll for completion. + * Per CXL r3.2 section 9.6, software may use the cache size from + * DVSEC CXL Capability2 to compute a suitable timeout; we use a + * default of 10ms. 
+ */ + if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) { + u32 wbi_poll_us = 100; + s32 wbi_remaining_us = 10000; + + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CACHE_WBI; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + do { + usleep_range(wbi_poll_us, wbi_poll_us + 1); + wbi_remaining_us -= wbi_poll_us; + rc = pci_read_config_word(pdev, + dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + } while (!(status2 & PCI_DVSEC_CXL_CACHE_INV) && + wbi_remaining_us > 0); + + if (!(status2 & PCI_DVSEC_CXL_CACHE_INV)) { + pci_err(pdev, "CXL cache WB+I timed out\n"); + return -ETIMEDOUT; + } + } else if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) { + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + if (cap & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + idx = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, cap); + if (idx >= ARRAY_SIZE(reset_timeout_ms)) + idx = ARRAY_SIZE(reset_timeout_ms) - 1; + timeout_ms = reset_timeout_ms[idx]; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CXL_RST; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + msleep(timeout_ms); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + + if (status2 & PCI_DVSEC_CXL_RST_ERR) { + pci_err(pdev, "CXL reset error\n"); + return -EIO; + } + + if (!(status2 & PCI_DVSEC_CXL_RST_DONE)) { + pci_err(pdev, "CXL reset timeout\n"); + return 
-ETIMEDOUT; + } + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 &= ~PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + return 0; +} + +static int match_memdev_by_parent(struct device *dev, const void *parent) +{ + return is_cxl_memdev(dev) && dev->parent == parent; +} + +static int __cxl_do_reset(struct pci_dev *pdev, struct cxl_memdev *cxlmd, + int dvsec) +{ + struct cxl_reset_context ctx = { .target = pdev }; + bool siblings_prepared = false; + int rc; + + mutex_lock(&cxl_reset_mutex); + pci_dev_lock(pdev); + + if (cxlmd) { + guard(device)(&cxlmd->dev); + + rc = cxl_reset_prepare_memdev(cxlmd); + if (rc) + goto out_unlock; + + cxl_reset_flush_cpu_caches(cxlmd); + } + + pci_dev_save_and_disable(pdev); + + rc = cxl_pci_functions_reset_prepare(&ctx); + if (!rc) { + siblings_prepared = true; + rc = cxl_dev_reset(pdev, dvsec); + } + + if (siblings_prepared) + cxl_pci_functions_reset_done(&ctx); + + pci_dev_restore(pdev); + +out_unlock: + pci_dev_unlock(pdev); + mutex_unlock(&cxl_reset_mutex); + + return rc; +} + +static int cxl_do_reset(struct pci_dev *pdev) +{ + int dvsec; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return -ENODEV; + + struct device *memdev __free(put_device) = + bus_find_device(&cxl_bus_type, NULL, &pdev->dev, + match_memdev_by_parent); + if (!memdev) + return __cxl_do_reset(pdev, NULL, dvsec); + + struct cxl_memdev *cxlmd = to_cxl_memdev(memdev); + + return __cxl_do_reset(pdev, cxlmd, dvsec); +} diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 535786860049..83e9ed0db04b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -182,6 +182,7 @@ int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_DECODER_F_LOCK BIT(4) #define CXL_DECODER_F_ENABLE BIT(5) #define CXL_DECODER_F_NORMALIZED_ADDRESSING BIT(6) 
+#define CXL_DECODER_F_RESET_MASK (CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK) enum cxl_decoder_type { CXL_DECODER_DEVMEM = 2, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index bace662dc988..deafa5bae2c7 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -977,6 +977,7 @@ static void cxl_reset_done(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct cxl_port *endpoint; struct device *dev = &pdev->dev; /* @@ -986,14 +987,11 @@ static void cxl_reset_done(struct pci_dev *pdev) * that no longer exists. */ guard(device)(&cxlmd->dev); - if (!cxlmd->dev.driver) + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) return; - if (cxlmd->endpoint && - cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { - device_for_each_child(&cxlmd->endpoint->dev, NULL, - cxl_endpoint_decoder_clear_reset_flags); - + if (cxl_endpoint_decoder_reset_detected(endpoint)) { dev_crit(dev, "SBR happened without memory regions removal.\n"); dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); -- Gitee From 85b53e92b839a599ed135bd27d7ca58799c55ee8 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:21 +0000 Subject: [PATCH 22/42] NVIDIA: VR: SAUCE: cxl: Add cxl_reset sysfs interface for PCI devices BugLink: https://bugs.launchpad.net/bugs/2153819 BugLink: https://bugs.launchpad.net/bugs/2143032 Add a "cxl_reset" sysfs attribute to PCI devices that support CXL Reset (CXL r3.2 section 8.1.3.1). The attribute is visible only on devices with both CXL.cache and CXL.mem capabilities and the CXL Reset Capable bit set in the DVSEC. Writing "1" to the attribute triggers the full CXL reset flow via cxl_do_reset(). The interface is decoupled from memdev creation: when a CXL memdev exists, memory offlining and cache flush are performed; otherwise reset proceeds without the memory management. 
The sysfs attribute is managed entirely by the CXL module using sysfs_create_group() / sysfs_remove_group() rather than the PCI core's static attribute groups. This avoids cross-module symbol dependencies between the PCI core (always built-in) and CXL_BUS (potentially modular). At module init, a subsys_interface is registered on pci_bus_type; it adds the attribute to existing PCI devices and handles hot-plug/unplug. kernfs_drain() makes sure that any in-flight store() completes before sysfs_remove_group() returns, preventing use-after-free during module unload. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 6e96f7e341a4eb1b9631e40b43d120b2b9e2c6e2 nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko Acked-by: Matt Ochs --- drivers/cxl/core/core.h | 2 + drivers/cxl/core/pci.c | 113 ++++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 3 ++ 3 files changed, 118 insertions(+) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 82ca3a476708..7b3bdcee6416 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -138,6 +138,8 @@ extern struct cxl_rwsem cxl_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); enum cxl_poison_trace_type { CXL_POISON_TRACE_LIST, diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 50cfe768af0b..6b860cadfe2e 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1444,3 +1444,116 @@ static int cxl_do_reset(struct pci_dev *pdev) return __cxl_do_reset(pdev, cxlmd, dvsec); } + +/* + * CXL reset sysfs attribute management. + * + * The cxl_reset attribute is added to PCI devices that advertise CXL Reset + * capability.
Managed entirely by the CXL module via subsys_interface on + * pci_bus_type, avoiding cross-module symbol dependencies between the PCI + * core (built-in) and CXL (potentially modular). + * + * subsys_interface handles existing devices at register time and hot-plug + * add/remove automatically. On unregister, remove_dev runs for all tracked + * devices under bus core serialization. + */ + +static bool pci_cxl_reset_capable(struct pci_dev *pdev) +{ + int dvsec; + u16 cap; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return false; + + if (pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return false; + + if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE) || + !(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) + return false; + + return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); +} + +static ssize_t cxl_reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc; + + if (!sysfs_streq(buf, "1")) + return -EINVAL; + + rc = cxl_do_reset(pdev); + return rc ? 
rc : count; +} +static DEVICE_ATTR_WO(cxl_reset); + +static umode_t cxl_reset_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return a->mode; +} + +static struct attribute *cxl_reset_attrs[] = { + &dev_attr_cxl_reset.attr, + NULL, +}; + +static const struct attribute_group cxl_reset_attr_group = { + .attrs = cxl_reset_attrs, + .is_visible = cxl_reset_attr_is_visible, +}; + +static int cxl_reset_add_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return sysfs_create_group(&dev->kobj, &cxl_reset_attr_group); +} + +static void cxl_reset_remove_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return; + + sysfs_remove_group(&dev->kobj, &cxl_reset_attr_group); +} + +static struct subsys_interface cxl_reset_interface = { + .name = "cxl_reset", + .subsys = &pci_bus_type, + .add_dev = cxl_reset_add_dev, + .remove_dev = cxl_reset_remove_dev, +}; + +void cxl_reset_sysfs_init(void) +{ + int rc; + + rc = subsys_interface_register(&cxl_reset_interface); + if (rc) + pr_warn("CXL: failed to register cxl_reset interface (%d)\n", + rc); +} + +void cxl_reset_sysfs_exit(void) +{ + subsys_interface_unregister(&cxl_reset_interface); +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5aacd7054f1..f95f0bdd7b90 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2530,6 +2530,8 @@ static __init int cxl_core_init(void) if (rc) goto err_ras; + cxl_reset_sysfs_init(); + return 0; err_ras: @@ -2545,6 +2547,7 @@ static __init int cxl_core_init(void) static void cxl_core_exit(void) { + cxl_reset_sysfs_exit(); cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); -- Gitee From 62395f32078864374a0c324726b1dcb71387f242 
Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 08:44:02 +0530 Subject: [PATCH 23/42] NVIDIA: VR: SAUCE: cxl: Export the CXL reset helpers for VFIO users BugLink: https://bugs.launchpad.net/bugs/2152222 Export CXL reset helper entry points for VFIO CXL users so vfio-pci can coordinate CXL reset and memory/cache state safely. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from commit 2d40efbb4f42 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [kobak: Kept the BOS CXL core tail and placed the exported reset helpers after cxl_port_get_possible_dports().] [kobak: Adapted to the BOS cxl_pci_functions_reset_prepare() error-return flow and added the target-local CXL reset helper prototypes required by public and private CXL headers.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/cxl/core/pci.c | 57 ++++++++++++++++++++++++++++++++--- drivers/cxl/cxl.h | 5 +++ include/linux/pci.h | 3 ++ include/uapi/linux/pci_regs.h | 13 ++++++++ 4 files changed, 74 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 6b860cadfe2e..c4f4d5e161e0 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1260,7 +1260,7 @@ static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) /* * CXL device reset execution */ -static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en) { static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; u16 cap, ctrl2, status2; @@ -1330,7 +1330,17 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) if (rc) return rc; - ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + /* + * Explicitly set or clear RST_MEM_CLR_EN rather than only + * setting it. 
A previous reset may have left the bit set in + * hardware; if mem_clr_en is false we must clear it so that a + * guest-triggered reset does not unexpectedly scrub DPA. + */ + if (mem_clr_en) + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + else + ctrl2 &= ~PCI_DVSEC_CXL_RST_MEM_CLR_EN; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); if (rc) @@ -1379,6 +1389,44 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset, "CXL"); + +/** + * cxl_dev_reset_locked() - cxl_dev_reset() under cxl_reset_mutex with sibling + * CXL.cachemem function save/restore. + * @pdev: Target CXL function + * @dvsec: CXL DVSEC capability offset (pci_find_dvsec_capability()) + * @mem_clr_en: Pass-through to cxl_dev_reset() (Mem_Clr_Enable in CTRL2) + * + * Return: 0 on success, negative errno from cxl_dev_reset() or sibling + * coordination failure. + */ +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en) +{ + struct cxl_reset_context ctx = { .target = pdev }; + bool siblings_prepared = false; + int rc; + + mutex_lock(&cxl_reset_mutex); + pci_dev_lock(pdev); + + pci_dev_save_and_disable(pdev); + rc = cxl_pci_functions_reset_prepare(&ctx); + if (!rc) { + siblings_prepared = true; + rc = cxl_dev_reset(pdev, dvsec, mem_clr_en); + } + + if (siblings_prepared) + cxl_pci_functions_reset_done(&ctx); + + pci_dev_restore(pdev); + pci_dev_unlock(pdev); + mutex_unlock(&cxl_reset_mutex); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset_locked, "CXL"); static int match_memdev_by_parent(struct device *dev, const void *parent) { @@ -1410,7 +1458,7 @@ static int __cxl_do_reset(struct pci_dev *pdev, struct cxl_memdev *cxlmd, rc = cxl_pci_functions_reset_prepare(&ctx); if (!rc) { siblings_prepared = true; - rc = cxl_dev_reset(pdev, dvsec); + rc = cxl_dev_reset(pdev, dvsec, true); } if (siblings_prepared) @@ -1458,7 +1506,7 @@ static int cxl_do_reset(struct pci_dev *pdev) * devices under bus core serialization. 
*/ -static bool pci_cxl_reset_capable(struct pci_dev *pdev) +bool pci_cxl_reset_capable(struct pci_dev *pdev) { int dvsec; u16 cap; @@ -1477,6 +1525,7 @@ static bool pci_cxl_reset_capable(struct pci_dev *pdev) return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); } +EXPORT_SYMBOL_NS_GPL(pci_cxl_reset_capable, "CXL"); static ssize_t cxl_reset_store(struct device *dev, struct device_attribute *attr, diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 83e9ed0db04b..7973d1519cc2 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -162,6 +162,11 @@ int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, resource_size_t *bar_offset); +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4f1308244c82..82b86c7bdf6e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2049,6 +2049,9 @@ int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) +void pci_dev_save_and_disable(struct pci_dev *dev); +void pci_dev_restore(struct pci_dev *dev); + /* * PCI domain support. 
Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 14f634ab9350..a7ac017baa1c 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1353,6 +1353,19 @@ #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) + +#define PCI_DVSEC_CXL_CTRL_RWL 0x5FED +#define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) +#define PCI_DVSEC_CXL_INIT_CACHE_WBI _BITUL(1) +#define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) +#define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) +#define PCI_DVSEC_CXL_STATUS2 0x12 +#define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) +#define PCI_DVSEC_CXL_RST_DONE _BITUL(1) +#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) +#define PCI_DVSEC_CXL_LOCK 0x14 +#define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) -- Gitee From 48e64c2bff7c5c1cb8aa6f7931f9d4131cafd9ca Mon Sep 17 00:00:00 2001 From: Vishal Aslot Date: Tue, 14 Oct 2025 19:40:05 -0700 Subject: [PATCH 24/42] NVIDIA: VR: SAUCE: cxl_test: enable zero sized decoders under hb0 BugLink: https://bugs.launchpad.net/bugs/2138266 The cxl core in linux updated to supported committed decoders of zero size, because this is allowed by the CXL spec. This patch updates cxl_test to enable decoders 1 and 2 in the host-bridge 0 port, in a switch uport under hb0, and the endpoints ports with size zero simulating committed zero sized decoders. Signed-off-by: Vishal Aslot (backported from https://lore.kernel.org/all/20251015024019.1189713-1-vaslot@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg (cherry picked from commit a40b0390737baececc5d0b6ee4fb174516373ac9 noble:linux-nvidia-6.17) Signed-off-by: Jacob Martin --- tools/testing/cxl/test/cxl.c | 96 +++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 418669927fb0..6acdf48d2bd3 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1041,6 +1041,45 @@ static void default_mock_decoder(struct cxl_decoder *cxld) WARN_ON_ONCE(!cxld_registry_new(cxld)); } +static void size_zero_mock_decoder_ep(struct cxl_decoder *cxld, u64 base) +{ + struct cxl_endpoint_decoder *cxled; + + cxled = to_cxl_endpoint_decoder(&cxld->dev); + cxld->hpa_range = (struct range){ + .start = base, + .end = base - 1, /* Size 0 */ + }; + + cxld->interleave_ways = 2; + cxld->interleave_granularity = 4096; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->flags = CXL_DECODER_F_ENABLE; + cxled->state = CXL_DECODER_STATE_AUTO; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; +} + +static void size_zero_mock_decoder_sw(struct device *dev, u64 base, int i) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + + cxlsd = to_cxl_switch_decoder(dev); + cxld = &cxlsd->cxld; + cxld->flags = CXL_DECODER_F_ENABLE; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + if (i == 0) + cxld->interleave_ways = 2; + else + cxld->interleave_ways = 1; + cxld->interleave_granularity = 4096; + cxld->hpa_range = (struct range) { + .start = base, + .end = base - 1, /* Size 0 */ + }; +} + static int first_decoder(struct device *dev, const void *data) { struct cxl_decoder *cxld; @@ -1053,6 +1092,30 @@ static int first_decoder(struct device *dev, const void *data) return 0; }
+static int second_decoder(struct device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 1) + return 1; + return 0; +} + +static int third_decoder(struct device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 2) + return 1; + return 0; +} + /* * Initialize a decoder during HDM enumeration. * @@ -1080,7 +1143,7 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_dport *dport; struct device *dev; bool hb0 = false; - u64 base; + u64 base = window->base_hpa; int i; if (is_endpoint_decoder(&cxld->dev)) { @@ -1122,6 +1185,20 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) return false; } + /* + * Decoders 1 and 2 of the endpoint under host bridge 0 should be enabled as zero-sized. + * It would be even better to make sure that the parent switch uport decoder was + * also enabled before enabling the size zero decoders but there is no harm in doing it + * anyway.
+ */ + if (hb0 && (cxld->id == 1 || cxld->id == 2)) { + port = to_cxl_port(cxld->dev.parent); + size_zero_mock_decoder_ep(cxld, base); + /* Commit the zero-sized decoder */ + port->commit_end = cxld->id; + return false; + } + /* * The first decoder on the first 2 devices on the first switch * attached to host-bridge0 mock a fake / static RAM region. All @@ -1142,7 +1219,6 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) return false; } - base = window->base_hpa; if (extended_linear_cache) base += mock_auto_region_size; cxld->hpa_range = (struct range) { @@ -1214,6 +1290,22 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld_registry_update(cxld); put_device(dev); + + /* Enable the next two decoders also and make them zero sized */ + dev = device_find_child(&iter->dev, NULL, second_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 1; + put_device(dev); + } + dev = device_find_child(&iter->dev, NULL, third_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 2; + put_device(dev); + } } return false; -- Gitee From 426b74e4cd7ed7030e393056a3609f6f34b9d8eb Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:03 +0530 Subject: [PATCH 25/42] NVIDIA: VR: SAUCE: vfio: UAPI for CXL-capable PCI device assignment BugLink: https://bugs.launchpad.net/bugs/2152222 Vendor GPUs and accelerators can expose CXL.mem (HDM-D or HDM-DB) without using PCI class code 0x0502. VMMs need a stable way to learn DPA sizing, firmware commit state, and where the extra VFIO regions live. Add VFIO_DEVICE_FLAGS_CXL (bit 9) and VFIO_DEVICE_INFO_CAP_CXL (cap ID 6).
The capability struct carries: hdm_regs_bar_index PCI BAR containing the component register block hdm_regs_offset byte offset within that BAR to the CXL.mem area (comp_reg_offset + CXL_CM_OFFSET) dpa_region_index VFIO region index for the DPA window comp_regs_region_index VFIO region index for the emulated COMP_REGS HDM decoder count and the HDM block offset within COMP_REGS are intentionally absent; both are derivable from the CXL Capability Array at COMP_REGS offset 0. Locate cap ID 0x5 (HDM) and read bits[31:20] of its entry for the byte offset. Then read bits[3:0] of the HDM Decoder Capability register for the count: count = (field == 0) ? 1 : field * 2. Two flags accompany the capability: VFIO_CXL_CAP_FIRMWARE_COMMITTED A decoder covering @dpa_size bytes was programmed and committed by platform firmware before device open. The VMM can use the DPA region immediately without re-committing. VFIO_CXL_CAP_CACHE_CAPABLE The device is HDM-DB (CXL.mem + CXL.cache). HDM-DB requires a Write-Back Invalidation sequence before FLR to flush dirty cache lines; HDM-D (CXL.mem only) does not. QEMU uses this flag to schedule WBI and to report Back-Invalidation capability accurately in the virtual CXL topology. Mirrors the Cache_Capable bit from the CXL DVSEC Capability register. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (cherry-picked from commit c0f4d247a0e714f22dcd9fe6679132edf1b6a846 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- include/uapi/linux/vfio.h | 86 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5de618a3a5ee..fd1f007b76f7 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -215,6 +215,16 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +/* + * Vendor-specific CXL device with CXL.mem capability (HDM-D or HDM-DB + * decoder, PCI class code != PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 + * accelerators and non-class-code Type-3 variants. When set, + * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The + * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL + * describing HDM decoders, region indices, decoder layout, and CXL-specific + * options. + */ +#define VFIO_DEVICE_FLAGS_CXL (1 << 9) /* Device supports CXL */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -257,6 +267,70 @@ struct vfio_device_info_cap_pci_atomic_comp { __u32 reserved; }; +/* + * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability + * + * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL + * is set. Describes Host Managed Device Memory (HDM) layout and CXL + * memory options so that userspace (e.g. QEMU) can expose the CXL region + * and component registers correctly to the guest. + * + * The HDM decoder count and HDM decoder block offset within the COMP_REGS + * region are derivable from the COMP_REGS region itself. 
+ * + * To find the HDM decoder block offset (hdm_decoder_offset), traverse the CXL + * Capability Array starting at COMP_REGS region offset 0: + * - Dword 0 bits[31:24] (CXL_CM_CAP_HDR_ARRAY_SIZE_MASK): number of + * capability entries. + * - Each subsequent dword at offset (cap * 4): bits[15:0] = cap ID + * (CXL_CM_CAP_HDR_ID_MASK), bits[31:20] = byte offset from COMP_REGS + * start to that capability's register block (CXL_CM_CAP_PTR_MASK). + * - Locate the entry with cap ID == CXL_CM_CAP_CAP_ID_HDM (0x5); the + * extracted bits[31:20] value is directly the byte offset + * hdm_decoder_offset (no further scaling required). + * + * To find the HDM decoder count, pread the HDM Decoder Capability register + * at hdm_decoder_offset + CXL_HDM_DECODER_CAP_OFFSET within the + * COMP_REGS region; bits[3:0] (CXL_HDM_DECODER_COUNT_MASK) encode the count + * using the formula: count = (field == 0) ? 1 : field * 2. + */ +#define VFIO_DEVICE_INFO_CAP_CXL 6 +struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; + __u8 hdm_regs_bar_index; /* PCI BAR containing HDM registers */ + __u8 reserved[3]; + __u32 flags; +/* Decoder was committed by host firmware/BIOS */ +#define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) +/* + * Device implements an HDM-DB decoder (CXL.cache + CXL.mem). Reflects + * the Cache_Capable bit (bit 0) in the CXL DVSEC Capability register. + * + * When clear: HDM-D decoder (CXL.mem only, no CXL.cache). FLR does not + * require a Write-Back Invalidation (WBI) sequence; the device holds no + * coherent copies of host memory. + * + * When set: HDM-DB decoder (CXL 3.0+). The kernel driver does not + * perform Write-Back Invalidation (WBI) automatically. The VMM must + * issue a WBI sequence before asserting FLR to flush dirty device cache + * lines and prevent coherency violations, and should advertise + * Back-Invalidation support in the virtual CXL topology. 
+ */ +#define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + /* + * Byte offset within the BAR to the CXL.mem register area start + * (= comp_reg_offset + CXL_CM_OFFSET). This is where the CXL + * Capability Array Header lives. + */ + __u64 hdm_regs_offset; + /* + * Region indices for the two CXL VFIO device regions. + * Avoids forcing userspace to scan all regions by type/subtype. + */ + __u32 dpa_region_index; /* VFIO_REGION_SUBTYPE_CXL */ + __u32 comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */ +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -370,6 +444,18 @@ struct vfio_region_info_cap_type { */ #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* 1e98 vendor PCI sub-types (CXL Consortium) */ +/* + * CXL memory region. Use with region type + * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE). + * DPA memory region (fault+zap mmap) + */ +#define VFIO_REGION_SUBTYPE_CXL (1) +/* + * HDM decoder register emulation region (read/write only, no mmap). + */ +#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS (2) + /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) -- Gitee From 8c8b42b7a0dc1e90864ff4dfcf342eb4e2ebc544 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:04 +0530 Subject: [PATCH 26/42] NVIDIA: VR: SAUCE: vfio/pci: Add CXL state to vfio_pci_core_device BugLink: https://bugs.launchpad.net/bugs/2152222 Add struct vfio_pci_cxl_state and hang a pointer to it off vfio_pci_core_device. vdev->cxl stays NULL for non-CXL devices, so existing vfio-pci-core paths just pay a NULL check. The new struct embeds struct cxl_dev_state by value (CXL core uses container_of() against this field) and stores pointers to the cxl_memdev, root decoder, and endpoint decoder that the CXL core owns. cxl_region is not introduced here; it is added later when region management lands. 
The series builds the CXL Type-2 passthrough path inside vfio-pci-core rather than in a separate variant driver. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 87b80cc08c264 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve context mismatch in vfio_pci_core.h; add #include to vfio_cxl_priv.h for cxl_find_regblock/cxl_probe_component_regs declarations] [kobak: Preserved existing VFIO PCI DMABUF forward declarations while adding the CXL state forward declaration.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 29 ++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 2 ++ 2 files changed, 31 insertions(+) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_priv.h diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h new file mode 100644 index 000000000000..0ea1d8ddbd49 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common infrastructure for CXL Type-2 device variant drivers + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#ifndef __LINUX_VFIO_CXL_PRIV_H +#define __LINUX_VFIO_CXL_PRIV_H + +#include +#include +#include + +/* CXL device state embedded in vfio_pci_core_device */ +struct vfio_pci_cxl_state { + struct cxl_dev_state cxlds; + struct cxl_memdev *cxlmd; + struct cxl_root_decoder *cxlrd; + struct cxl_endpoint_decoder *cxled; + resource_size_t hdm_reg_offset; + size_t hdm_reg_size; + resource_size_t comp_reg_offset; + size_t comp_reg_size; + u8 hdm_count; + u8 comp_reg_bar; +}; + +#endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..6e8885f79d26 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -29,6 +29,7 @@ struct vfio_pci_core_device; struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_attachment; +struct vfio_pci_cxl_state; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -137,6 +138,7 @@ struct vfio_pci_core_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct vfio_pci_cxl_state *cxl; struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; -- Gitee From a589f18573da82a106229cbcc889a56454fbc2c5 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:05 +0530 Subject: [PATCH 27/42] NVIDIA: VR: SAUCE: vfio/pci: Add CONFIG_VFIO_CXL_CORE and stub CXL hooks BugLink: https://bugs.launchpad.net/bugs/2152222 Introduce the Kconfig option CONFIG_VFIO_CXL_CORE and the necessary build rules to compile CXL.mem passthrough infrastructure for vendor-specific CXL devices into the vfio-pci-core module. The new option depends on VFIO_PCI_CORE, CXL_BUS and CXL_MEM. Wire up the detection and cleanup entry-point stubs in vfio_pci_core_register_device() and vfio_pci_core_unregister_device() so that subsequent patches can fill in the CXL-specific logic without touching the vfio-pci-core flow again. 
The vfio_cxl_core.c file added here is an empty skeleton; the actual CXL detection and initialisation code is introduced in the following patch to keep this build-system patch reviewable on its own. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 336a1448463a from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve context mismatches in Kconfig, Makefile, and vfio_pci_priv.h due to missing upstream xe/dmabuf support in NV-Kernels base] [kobak: Preserved existing VFIO PCI DMABUF declarations while adding VFIO CXL stubs.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/Kconfig | 2 ++ drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/cxl/Kconfig | 9 ++++++ drivers/vfio/pci/cxl/vfio_cxl_core.c | 41 ++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 4 +++ drivers/vfio/pci/vfio_pci_priv.h | 14 ++++++++++ 6 files changed, 71 insertions(+) create mode 100644 drivers/vfio/pci/cxl/Kconfig create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_core.c diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 296bf01e185e..c3f54fc8ccc0 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -70,6 +70,8 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/cxl/Kconfig" + source "drivers/vfio/pci/qat/Kconfig" source "drivers/vfio/pci/xe/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 6138f1bf241d..941293c20c27 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o 
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/Kconfig b/drivers/vfio/pci/cxl/Kconfig new file mode 100644 index 000000000000..fad53300fecf --- /dev/null +++ b/drivers/vfio/pci/cxl/Kconfig @@ -0,0 +1,9 @@ +config VFIO_CXL_CORE + bool "VFIO CXL core" + depends on VFIO_PCI_CORE && CXL_BUS && CXL_MEM + help + Extends vfio-pci-core with CXL.mem passthrough for vendor-specific + CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or HDM-DB + decoders without the standard CXL memory expander class code + (PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 accelerators and + non-class-code Type-3 variants (e.g. compressed memory devices). diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c new file mode 100644 index 000000000000..d12afec82ecd --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO CXL Core - CXL.mem passthrough for vendor-specific CXL devices + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * This module extends vfio-pci-core to pass through CXL.mem regions for + * vendor-specific CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or + * HDM-DB decoders but do not report the standard CXL memory expander class + * code (PCI_CLASS_MEMORY_CXL, 0x0502). This covers both CXL Type-2 + * accelerators (with CXL.cache) and non-class-code Type-3 variants (e.g. + * compressed memory devices) which cannot be paravirtualized by the host + * CXL subsystem and require direct DPA region access from the guest. 
+ */ + +#include +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/** + * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific + * CXL.mem device + * @vdev: VFIO PCI device + * + * Called from vfio_pci_core_register_device(). Detects CXL DVSEC capability + * and initializes CXL features. On failure vdev->cxl remains NULL and the + * device operates as a standard PCI device. + */ +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) +{ +} + +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) +{ +} + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ad52abc46c04..b59929636484 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2187,6 +2187,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (ret) goto out_vf; + vfio_pci_cxl_detect_and_init(vdev); + vfio_pci_probe_power_state(vdev); /* @@ -2230,6 +2232,8 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vf_uninit(vdev); vfio_pci_vga_uninit(vdev); + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index fca9d0dfac90..4342374107fa 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -137,4 +137,18 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, } #endif +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); + +#else + +static inline void +vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } + +#endif /* CONFIG_VFIO_CXL_CORE */ + #endif -- Gitee From 
0a079d266743b327c8615193a0c3bd14472a5cc1 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:07 +0530 Subject: [PATCH 28/42] NVIDIA: VR: SAUCE: vfio/pci: Export config access helpers BugLink: https://bugs.launchpad.net/bugs/2152222 Promote vfio_raw_config_write() and vfio_raw_config_read() to non-static so that the CXL DVSEC write handler in the next patch can call them. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (cherry-picked from commit 07d7141447024a424c16206383e15bcddf8dfb9f from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/vfio_pci_config.c | 12 ++++++------ drivers/vfio/pci/vfio_pci_priv.h | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a10ed733f0e3..ac2d7f5fa40e 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -270,9 +270,9 @@ static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -283,9 +283,9 @@ static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int 
ret; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 4342374107fa..c19269362d06 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -41,6 +41,14 @@ ssize_t vfio_pci_config_rw_single(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); + +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -- Gitee From 7f3d19b5593423f77f87b7d441798102adfff876 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:06 +0530 Subject: [PATCH 29/42] NVIDIA: VR: SAUCE: vfio/cxl: Detect CXL DVSEC and probe HDM block BugLink: https://bugs.launchpad.net/bugs/2152222 Detect a vendor-specific CXL device at vfio-pci bind time and probe its HDM decoder register block. vfio_cxl_create_device_state() allocates per-device state via devm, reads the DVSEC length from PCI_DVSEC_HEADER1, and records MEM_CAPABLE and CACHE_CAPABLE from the CXL DVSEC. vfio_cxl_setup_regs() locates the component register block, claims and maps that BAR window, calls cxl_probe_component_regs() to find the HDM block, then unmaps and releases the window on all paths. vfio_pci_cxl_detect_and_init() enables PCI memory decoding for the probe, chains these setup steps, disables the device again, and leaves vdev->cxl NULL on failure so the device falls back to plain vfio-pci. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (cherry-picked from commit 939ebb73d430f185c3e8ba55f26ba6888452b0d7 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Use pci_get_dsn() instead of pdev->dev.id for cxlds serial; expand comment explaining why] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 222 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 12 ++ 2 files changed, 234 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index d12afec82ecd..8b83f6619aa0 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,163 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +/* + * vfio_cxl_create_device_state - Allocate and validate CXL device state + * + * Returns a pointer to the allocated vfio_pci_cxl_state on success, or + * ERR_PTR on failure. The allocation uses devm; the caller must call + * devm_kfree(&pdev->dev, cxl) on any subsequent setup failure to release + * the resource before device unbind. Using devm_kfree() to undo a devm + * allocation early is explicitly supported by the devres API. + * + * The caller assigns vdev->cxl only after all setup steps succeed, preventing + * partially-initialised state from being visible through vdev->cxl on any + * failure path. + */ +static struct vfio_pci_cxl_state * +vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) +{ + struct vfio_pci_cxl_state *cxl; + u16 cap_word; + u32 hdr1; + + /* + * Freed automatically when pdev->dev is released. Use the PCI Device + * Serial Number capability for cxlds->serial; pdev->dev.id is the + * generic-device sibling counter (typically 0) and surfaces as a bogus + * serial in sysfs and CXL tracepoints. 
+ */ + cxl = devm_cxl_dev_state_create(&pdev->dev, + CXL_DEVTYPE_DEVMEM, + pci_get_dsn(pdev), dvsec, + struct vfio_pci_cxl_state, + cxlds, false); + if (!cxl) + return ERR_PTR(-ENOMEM); + + pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1); + cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1); + + pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET, + &cap_word); + + /* + * Only handle vendor devices (class != 0x0502) with Mem_Capable set. + * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI + * sequence is needed before FLR. + */ + if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) || + (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) { + devm_kfree(&pdev->dev, cxl); + return ERR_PTR(-ENODEV); + } + + cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word); + + return cxl; +} + +static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct cxl_register_map *map = &cxl->cxlds.reg_map; + resource_size_t offset, bar_offset, size; + struct pci_dev *pdev = vdev->pdev; + void __iomem *base; + int ret; + u8 count; + u8 bar; + + if (WARN_ON_ONCE(!pci_is_enabled(pdev))) + return -EINVAL; + + /* Find component register block via Register Locator DVSEC */ + ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map); + if (ret) + return ret; + + /* + * Request the region and map. This is a transient mapping + * used only to probe register capabilities; released immediately + * after cxl_probe_component_regs() returns. 
+ */ + if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe")) + return -EBUSY; + + base = ioremap(map->resource, map->max_size); + if (!base) { + ret = -ENOMEM; + goto failed_release; + } + + /* Probe component register capabilities */ + cxl_probe_component_regs(&pdev->dev, base, &map->component_map); + + /* Check if HDM decoder was found */ + if (!map->component_map.hdm_decoder.valid) { + ret = -ENODEV; + goto failed_unmap; + } + + pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n", + map->component_map.hdm_decoder.offset, + map->component_map.hdm_decoder.size); + + /* Get HDM register info */ + ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size); + if (ret) + goto failed_unmap; + + if (!count || !size) { + ret = -ENODEV; + goto failed_unmap; + } + + cxl->hdm_count = count; + /* + * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + cap_ptr + * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before + * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset). + * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem + * register area start, which is where comp_reg_virt[0] is anchored. + * The physical BAR address for hdm_iobase is recovered by adding + * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs(). + */ + cxl->hdm_reg_offset = offset - CXL_CM_OFFSET; + cxl->hdm_reg_size = size; + + ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset); + if (ret) + goto failed_unmap; + + cxl->comp_reg_bar = bar; + cxl->comp_reg_offset = bar_offset; + cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + iounmap(base); + release_mem_region(map->resource, map->max_size); + + return 0; + +failed_unmap: + iounmap(base); +failed_release: + release_mem_region(map->resource, map->max_size); + + return ret; +} + +/* + * Free CXL state early on probe failure.
devm_kfree() on a live devres + * allocation removes it from the list immediately, so the normal devres + * teardown at unbind time won't double-free it. + */ +static void vfio_cxl_dev_state_free(struct pci_dev *pdev, + struct vfio_pci_cxl_state *cxl) +{ + devm_kfree(&pdev->dev, cxl); +} + /** * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific * CXL.mem device @@ -32,10 +189,75 @@ */ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { + struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_cxl_state *cxl; + u16 dvsec; + int ret; + + if (!pcie_is_cxl(pdev)) + return; + + dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + /* + * CXL DVSEC found: any failure from here is a hard probe error on + * a confirmed CXL-capable device, not a silent non-CXL fallback. + * Warn the operator so misconfiguration is visible. + */ + cxl = vfio_cxl_create_device_state(pdev, dvsec); + if (IS_ERR(cxl)) { + if (PTR_ERR(cxl) != -ENODEV) + pci_warn(pdev, + "vfio-cxl: CXL device state allocation failed: %ld\n", + PTR_ERR(cxl)); + return; + } + + /* + * Required for ioremap of the component register block and + * calls to cxl_probe_component_regs(). + */ + ret = pci_enable_device_mem(pdev); + if (ret) { + pci_warn(pdev, + "vfio-cxl: pci_enable_device_mem failed: %d\n", ret); + goto free_cxl; + } + + ret = vfio_cxl_setup_regs(vdev, cxl); + if (ret) { + pci_warn(pdev, + "vfio-cxl: HDM register probing failed: %d\n", ret); + pci_disable_device(pdev); + goto free_cxl; + } + + pci_disable_device(pdev); + + /* + * Register probing succeeded. Assign vdev->cxl now so that + * all subsequent helpers can access state via vdev->cxl. + * All failure paths below clear vdev->cxl before calling + * vfio_cxl_dev_state_free(). 
+ */ + vdev->cxl = cxl; + + return; + +free_cxl: + vfio_cxl_dev_state_free(pdev, cxl); } void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl) + return; } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 0ea1d8ddbd49..bb03f9363d98 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -22,8 +22,20 @@ struct vfio_pci_cxl_state { size_t hdm_reg_size; resource_size_t comp_reg_offset; size_t comp_reg_size; + u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; + bool cache_capable; }; +/* + * CXL DVSEC for CXL Devices - register offsets within the DVSEC + * (CXL 4.0 8.1.3). + * Offsets are relative to the DVSEC capability base (cxl->dvsec). + */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0xa +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ +#define CXL_DVSEC_CACHE_CAPABLE BIT(0) + #endif /* __LINUX_VFIO_CXL_PRIV_H */ -- Gitee From 585bbae8c6448b37f7dd48d3e78805c21092b31e Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:08 +0530 Subject: [PATCH 30/42] NVIDIA: VR: SAUCE: vfio/cxl: Introduce HDM decoder register emulation framework BugLink: https://bugs.launchpad.net/bugs/2152222 Add HDM decoder register emulation for CXL devices assigned to a guest. New file vfio_cxl_emu.c allocates comp_reg_virt[] covering the full component register block (CXL_COMPONENT_REG_BLOCK_SIZE), snapshots it from MMIO after probe, and registers a VFIO device region (VFIO_REGION_SUBTYPE_CXL_COMP_REGS) with read/write ops but no mmap, so every access hits the emulated buffer and write dispatchers. vfio_cxl_setup_virt_regs() is called from the tail of vfio_cxl_setup_regs(); vfio_cxl_clean_virt_regs() runs on cleanup. HDM decoder register defines come from include/uapi/cxl/cxl_regs.h. Bits with no hardware equivalent stay in vfio_cxl_priv.h. 
hdm_decoder_n_ctrl_write() allows the guest to clear the LOCK bit. A firmware-committed decoder arrives with LOCK=1; the guest driver must clear it before reprogramming BASE and SIZE with the VM's GPA. Such a write clears the bit in the shadow while preserving all other fields. Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 4ab495542be13ab04ac277ff8731d4233d661b97 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve Makefile context mismatch due to missing upstream dmabuf support in NV-Kernels base, Add CTRL LOCK enforcement in BASE_LO/SIZE_LO writes, BI bit masking for non-cache-capable devices, pass max_size to vfio_cxl_setup_virt_regs() for bounds check, add vfio_pci_cxl_cleanup() in registration error path] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/cxl/vfio_cxl_core.c | 5 + drivers/vfio/pci/cxl/vfio_cxl_emu.c | 462 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 48 +++ drivers/vfio/pci/vfio_pci_core.c | 9 + include/uapi/cxl/cxl_regs.h | 5 + 6 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_emu.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 941293c20c27..c735b4c665c7 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += 
vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 8b83f6619aa0..72033fda7954 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -154,8 +154,11 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, cxl->comp_reg_offset = bar_offset; cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + ret = vfio_cxl_setup_virt_regs(vdev, cxl, base, map->max_size); iounmap(base); release_mem_region(map->resource, map->max_size); + if (ret) + return ret; return 0; @@ -258,6 +261,8 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) if (!cxl) return; + + vfio_cxl_clean_virt_regs(cxl); } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c new file mode 100644 index 000000000000..5626ab51a053 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/* + * comp_reg_virt[] shadow layout: + * Covers the full CXL.mem register area (starting at CXL_CM_OFFSET + * within the component register block). Index 0 is the CXL Capability + * Array Header; the HDM decoder block starts at index + * hdm_reg_offset / sizeof(__le32). 
+ * + * Register layout within the HDM block (CXL spec 4.0 8.2.4.20 CXL HDM Decoder + * Capability Structure): + * 0x00: HDM Decoder Capability + * 0x04: HDM Decoder Global Control + * 0x08: (reserved) + * 0x0c: (reserved) + * For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20: + * +0x00: BASE_LO + * +0x04: BASE_HI + * +0x08: SIZE_LO + * +0x0c: SIZE_HI + * +0x10: CTRL + * +0x14: TARGET_LIST_LO + * +0x18: TARGET_LIST_HI + * +0x1c: (reserved) + */ + +static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +{ + /* + * hdm_off is a byte offset within the HDM decoder block. + * comp_reg_virt covers the CXL.mem register area starting at + * CXL_CM_OFFSET within the component register block. + * hdm_reg_offset is CXL.mem-relative, so adding hdm_reg_offset + * gives the correct index into comp_reg_virt[]. + */ + return &cxl->comp_reg_virt[(cxl->hdm_reg_offset + hdm_off) / + sizeof(__le32)]; +} + +static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + /* Discard writes on reserved registers. */ + return size; +} + +static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 new_val = le32_to_cpu(*val32); + u32 dec_idx, ctrl_off, ctrl; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* + * Honor the CTRL LOCK bit the same way BASE_HI/SIZE_HI do: once the + * guest sets LOCK, BASE_LO/SIZE_LO must remain frozen in shadow. + */ + dec_idx = ((u32)offset - CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) / + CXL_HDM_DECODER_BLOCK_STRIDE; + ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + dec_idx * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + ctrl = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, ctrl_off)); + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* Bits [27:0] are reserved. 
*/ + new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 size) +{ + u32 hdm_gcap; + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bits [31:2] are reserved. */ + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK; + + /* Poison On Decode Error Enable (bit 0) is RO=0 if not supported. */ + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + if (!(hdm_gcap & CXL_HDM_DECODER_POISON_ON_DECODE_ERR)) + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT; + + *hdm_reg_ptr(vdev->cxl, CXL_HDM_DECODER_CTRL_OFFSET) = + cpu_to_le32(new_val); + + return size; +} + +/** + * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register. + * @vdev: VFIO PCI core device + * @val32: New register value supplied by userspace (little-endian) + * @offset: Byte offset within the HDM block for this decoder's CTRL register + * @size: Access size in bytes; must equal CXL_REG_SIZE_DWORD + * + * The COMMIT bit (bit 9) is the key: setting it requests the hardware to + * lock the decoder. The emulated COMMITTED bit (bit 10) mirrors COMMIT + * immediately to allow QEMU's notify_change to detect the transition and + * map/unmap the DPA MemoryRegion in the guest address space. + * + * Note: the actual hardware HDM decoder programming (writing the real + * BASE/SIZE with host physical addresses) happens in the QEMU notify_change + * callback BEFORE this write reaches the hardware. This ordering is + * correct because vfio_region_write() calls notify_change() first. + * + * Return: @size on success, %-EINVAL if @size is not %CXL_REG_SIZE_DWORD.
+ */ +static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 hdm_gcap; + u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK; + u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK; + u32 new_val = le32_to_cpu(*val32); + u32 cur_val; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset)); + if (cur_val & CXL_HDM_DECODER0_CTRL_LOCK) { + if (new_val & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* LOCK_0 only: preserve all other bits, clear LOCK */ + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32( + cur_val & ~CXL_HDM_DECODER0_CTRL_LOCK); + return size; + } + + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO; + rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED; + + if (!(hdm_gcap & CXL_HDM_DECODER_UIO_CAPABLE)) + rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED; + + /* + * BI (bit 13) is RsvdP for devices without CXL.cache. HDM-D decoders + * on a CXL.mem-only device must not see BI set in shadow. + */ + if (!vdev->cxl->cache_capable) + rev_mask |= CXL_HDM_DECODER_CTRL_BI_RESERVED; + + new_val &= ~rev_mask; + cur_val &= ro_mask; + new_val = (new_val & ~ro_mask) | cur_val; + + /* + * Mirror COMMIT to COMMITTED immediately in the emulated state. + */ + if (new_val & CXL_HDM_DECODER0_CTRL_COMMIT) + new_val |= CXL_HDM_DECODER0_CTRL_COMMITTED; + else + new_val &= ~CXL_HDM_DECODER0_CTRL_COMMITTED; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +/* + * Dispatch table for COMP_REGS region writes. Indexed by byte offset within + * the HDM decoder block. Returns the appropriate write handler. 
+ * + * Layout: + * 0x00 HDM Decoder Capability (RO) + * 0x04 HDM Global Control (RW with reserved masking) + * 0x08-0x0f (reserved) (ignored) + * Per decoder N, base = 0x10 + N*0x20: + * base+0x00 BASE_LO (RW, [27:0] reserved) + * base+0x04 BASE_HI (RW) + * base+0x08 SIZE_LO (RW, [27:0] reserved) + * base+0x0c SIZE_HI (RW) + * base+0x10 CTRL (RW, complex rules) + * base+0x14 TARGET_LIST_LO (ignored for Type-2) + * base+0x18 TARGET_LIST_HI (ignored for Type-2) + * base+0x1c (reserved) (ignored) + */ +static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev, + u32 off, const __le32 *val32, u32 size) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 dec_base, dec_off; + + /* HDM Decoder Capability (0x00): RO */ + if (off == CXL_HDM_DECODER_CAP_OFFSET) + return size; + + /* HDM Global Control (0x04) */ + if (off == CXL_HDM_DECODER_CTRL_OFFSET) + return hdm_decoder_global_ctrl_write(vdev, val32, size); + + /* + * Offsets 0x08-0x0f are reserved per CXL 4.0 Table 8-115. + * Per-decoder registers start at 0x10, stride 0x20 + */ + if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) + return size; /* reserved gap */ + + dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET; + /* + * Reject accesses beyond the last implemented HDM decoder. + * Without this check an out-of-bounds offset would silently + * corrupt comp_reg_virt[] memory past the end of the allocation. 
+ */ + if ((off - dec_base) / CXL_HDM_DECODER_BLOCK_STRIDE >= cxl->hdm_count) + return size; + + dec_off = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE; + + switch (dec_off) { + case CXL_HDM_DECODER_N_BASE_LOW_OFFSET: /* BASE_LO */ + case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET: /* SIZE_LO */ + return hdm_decoder_n_lo_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */ + case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */ + { + /* Full 32-bit write, no reserved bits; frozen when COMMIT_LOCK set */ + u32 ctrl_off = off - dec_off + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + *hdm_reg_ptr(cxl, off) = *val32; + return size; + } + case CXL_HDM_DECODER_N_CTRL_OFFSET: /* CTRL */ + return hdm_decoder_n_ctrl_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET: + case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET: + case CXL_HDM_DECODER_N_REV_OFFSET: + return virt_hdm_rev_reg_write(vdev, val32, off, size); + default: + return size; + } +} + +/* + * vfio_cxl_comp_regs_rw - regops rw handler for + * VFIO_REGION_SUBTYPE_CXL_COMP_REGS. + * + * Reads return the emulated HDM state (comp_reg_virt[]). + * Writes go through comp_regs_dispatch_write() for bit-field enforcement. + * Only 4-byte aligned 4-byte accesses are supported (hardware requirement). 
+ */ +static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t done = 0; + + if (!count) + return 0; + + /* Clamp to total region size: cap array prefix + HDM block */ + if (pos >= cxl->hdm_reg_offset + cxl->hdm_reg_size) + return -EINVAL; + count = min(count, + (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + + while (done < count) { + u32 sz = count - done; + u32 off = pos + done; + __le32 v; + + /* Enforce exactly 4-byte, 4-byte-aligned accesses */ + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) + return done ? (ssize_t)done : -EINVAL; + + if (iswrite) { + if (off < cxl->hdm_reg_offset) { + /* Cap array area is read-only; discard writes */ + done += sizeof(v); + continue; + } + if (copy_from_user(&v, buf + done, sizeof(v))) + return done ? (ssize_t)done : -EFAULT; + comp_regs_dispatch_write(vdev, + off - cxl->hdm_reg_offset, + &v, sizeof(v)); + } else { + /* Read from extended buffer - covers cap array and HDM */ + v = cxl->comp_reg_virt[off / sizeof(__le32)]; + if (copy_to_user(buf + done, &v, sizeof(v))) + return done ? (ssize_t)done : -EFAULT; + } + done += sizeof(v); + } + + *ppos += done; + return done; +} + +static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + /* comp_reg_virt is freed in vfio_cxl_clean_virt_regs() */ +} + +static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = { + .rw = vfio_cxl_comp_regs_rw, + .release = vfio_cxl_comp_regs_release, +}; + +/* + * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state. + * + * Allocates comp_reg_virt as a __le32 array covering the CXL.mem register + * area through the end of the HDM block (hdm_reg_offset + hdm_reg_size + * bytes). Initial values are read via the BAR ioremap set up by the caller.
+ * + * DVSEC state is accessed via vdev->vconfig (see the following patch). + */ +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base, + resource_size_t max_size) +{ + size_t total_size, nregs, i; + + if (WARN_ON(!cxl->hdm_reg_size)) + return -EINVAL; + + total_size = cxl->hdm_reg_offset + cxl->hdm_reg_size; + + /* + * The caller's map covers [comp_reg_offset, comp_reg_offset+max_size) + * inside the BAR; the HDM block ends at CXL_CM_OFFSET + total_size + * relative to that map. Reject HDM blocks that walk past the + * advertised map size; pci_resource_len() would happily allow a stale + * BAR-wide window and the subsequent readl()s would run off the + * ioremap range. + */ + if (CXL_CM_OFFSET + total_size > max_size) + return -ENODEV; + + nregs = total_size / sizeof(__le32); + cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL); + if (!cxl->comp_reg_virt) + return -ENOMEM; + + /* + * Snapshot the CXL.mem register area from the caller's mapping. + * cap_base maps the component register block from comp_reg_offset. + * The CXL.mem registers start at CXL_CM_OFFSET (= 0x1000) within that + * block; reading from cap_base + CXL_CM_OFFSET ensures comp_reg_virt[0] + * holds the CXL Capability Array Header required by guest drivers. + */ + for (i = 0; i < nregs; i++) + cxl->comp_reg_virt[i] = + cpu_to_le32(readl(cap_base + CXL_CM_OFFSET + + i * sizeof(__le32))); + + /* + * Establish persistent mapping; kept alive until + * vfio_cxl_clean_virt_regs(). + */ + cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, + cxl->comp_reg_bar) + + cxl->comp_reg_offset + CXL_CM_OFFSET + + cxl->hdm_reg_offset, + cxl->hdm_reg_size); + if (!cxl->hdm_iobase) { + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; + return -ENOMEM; + } + + return 0; +} + +/* + * Called with memory_lock write side held (from vfio_cxl_reactivate_region). 
+ * Uses the pre-established hdm_iobase, no ioremap() under the lock, + * which would deadlock on PREEMPT_RT where ioremap() can sleep. + */ +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl) +{ + size_t i, nregs; + u32 n; + + if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase) + return; + + nregs = cxl->hdm_reg_size / sizeof(__le32); + + for (i = 0; i < nregs; i++) + *hdm_reg_ptr(cxl, i * sizeof(__le32)) = + cpu_to_le32(readl(cxl->hdm_iobase + + i * sizeof(__le32))); + + /* + * For firmware-committed decoders, clear COMMIT_LOCK (bit 8) and zero + * BASE in comp_reg_virt[] so QEMU can write the correct guest GPA via + * setup_locked_hdm() before guest DPA access begins. + * + * Check the COMMITTED bit (bit 10) directly from the freshly-snapshotted + * ctrl register rather than relying on cxl->precommitted. At probe time + * this function is called before cxl->precommitted is set (it is set + * after vfio_cxl_read_committed_decoder_size() succeeds), so using + * cxl->precommitted here would silently skip the LOCK clearing and leave + * the hardware HPA in comp_reg_virt[]. 
+ */ + for (n = 0; n < cxl->hdm_count; n++) { + u32 ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 base_lo_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_LOW_OFFSET; + u32 base_hi_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_HIGH_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) { + *hdm_reg_ptr(cxl, ctrl_off) = + cpu_to_le32(ctrl & + ~CXL_HDM_DECODER0_CTRL_LOCK); + *hdm_reg_ptr(cxl, base_lo_off) = 0; + *hdm_reg_ptr(cxl, base_hi_off) = 0; + } + } +} + +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) +{ + if (cxl->hdm_iobase) { + iounmap(cxl->hdm_iobase); + cxl->hdm_iobase = NULL; + } + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; +} diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index bb03f9363d98..8017d92e483f 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -22,12 +22,53 @@ struct vfio_pci_cxl_state { size_t hdm_reg_size; resource_size_t comp_reg_offset; size_t comp_reg_size; + __le32 *comp_reg_virt; + void __iomem *hdm_iobase; u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; bool cache_capable; }; +/* Register access sizes */ +#define CXL_REG_SIZE_WORD 2 +#define CXL_REG_SIZE_DWORD 4 + +/* HDM Decoder - register offsets (CXL 4.0 Table 8-115) */ +#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET 0x10 +#define CXL_HDM_DECODER_BLOCK_STRIDE 0x20 +#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET 0x0 +#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET 0x4 +#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET 0x8 +#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET 0xc +#define CXL_HDM_DECODER_N_CTRL_OFFSET 0x10 +#define 
CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET 0x14 +#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18 +#define CXL_HDM_DECODER_N_REV_OFFSET 0x1c + +/* + * HDM Decoder N Control emulation masks. + * + * Single-bit hardware definitions are in uapi/cxl/cxl_regs.h as + * CXL_HDM_DECODER0_CTRL_* (bits 0-14) and CXL_HDM_DECODER_*_CAP. + * The masks below express emulation policy for a CXL.mem device. + */ +#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK (BIT(10) | BIT(11)) +#define CXL_HDM_DECODER_CTRL_RESERVED_MASK (BIT(15) | GENMASK(31, 28)) +#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO BIT(12) +#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED (GENMASK(19, 16) | GENMASK(23, 20)) +#define CXL_HDM_DECODER_CTRL_UIO_RESERVED (BIT(14) | GENMASK(27, 24)) +/* + * bit 13 (BI) is RsvdP for devices without CXL.cache (Cache_Capable=0). + * HDM-D (CXL.mem only) decoders must not have BI set by the guest. + */ +#define CXL_HDM_DECODER_CTRL_BI_RESERVED BIT(13) +#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK GENMASK(27, 0) + +#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2) +#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0) + /* * CXL DVSEC for CXL Devices - register offsets within the DVSEC * (CXL 4.0 8.1.3). 
@@ -38,4 +79,11 @@ struct vfio_pci_cxl_state { /* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ #define CXL_DVSEC_CACHE_CAPABLE BIT(0) +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base, + resource_size_t max_size); +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); + #endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index b59929636484..2ec3e9c47eff 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2213,6 +2213,15 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return 0; out_power: + /* + * vfio_pci_cxl_detect_and_init() may have assigned vdev->cxl and + * allocated comp_reg_virt[] / hdm_iobase / region state above. The + * normal teardown via vfio_pci_core_unregister_device() will not run + * if registration failed, so release the CXL state here. No-op when + * vdev->cxl is NULL (non-CXL device or detect skipped). 
+ */ + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(dev); diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h index c821ef7ec2bb..aa9fea9b88f7 100644 --- a/include/uapi/cxl/cxl_regs.h +++ b/include/uapi/cxl/cxl_regs.h @@ -34,8 +34,13 @@ #define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) #define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) #define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_POISON_ON_DECODE_ERR _BITUL(10) #define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) #define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_UIO_CAPABLE _BITUL(13) +#define CXL_HDM_DECODER_UIO_COUNT_MASK __GENMASK(19, 16) +#define CXL_HDM_DECODER_MEMDATA_NXM _BITUL(20) +#define CXL_HDM_DECODER_COHERENCY_MODELS_MASK __GENMASK(22, 21) #define CXL_HDM_DECODER_CTRL_OFFSET 0x4 #define CXL_HDM_DECODER_ENABLE _BITUL(1) #define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -- Gitee From b8ecf537b56b9aa232fae032fb3aef5afe3010e1 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:09 +0530 Subject: [PATCH 31/42] NVIDIA: VR: SAUCE: vfio/cxl: Wait for HDM ranges and create memdev BugLink: https://bugs.launchpad.net/bugs/2152222 After HDM registers are mapped, call cxl_await_range_active() so we only proceed when DVSEC ranges report active, avoiding access to the memdev register group that Type-2 devices may lack. This wait is required before re-snapshotting component registers: firmware commits final HDM decoder values such as SIZE_HIGH only after MEM_ACTIVE. Once cxl_await_range_active() confirms that state, re-read component regs with vfio_cxl_reinit_comp_regs() so those committed values land in comp_reg_virt. Read committed decoder size from hardware, set capacity via cxl_set_capacity(), and devm_cxl_add_memdev(). 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (cherry-picked from commit 537d8a2414cf52e363f0d82ea34b98647e8af2f7 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Line offset adjustments only (cascading from 0011 changes)] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 56 ++++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_emu.c | 42 +++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 4 ++ 3 files changed, 102 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 72033fda7954..3279d2a89feb 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -170,6 +170,22 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, return ret; } +static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + int ret; + + ret = cxl_set_capacity(&cxl->cxlds, capacity); + if (ret) + return ret; + + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) + return PTR_ERR(cxl->cxlmd); + + return 0; +} + /* * Free CXL state early on probe failure. 
devm_kfree() on a live devres * allocation removes it from the list immediately, so the normal devres @@ -194,6 +210,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_cxl_state *cxl; + resource_size_t capacity = 0; u16 dvsec; int ret; @@ -239,8 +256,44 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto free_cxl; } + cxl->cxlds.media_ready = !cxl_await_range_active(&cxl->cxlds); + if (!cxl->cxlds.media_ready) { + pci_warn(pdev, "CXL media not ready\n"); + pci_disable_device(pdev); + goto regs_failed; + } + + /* + * Take the single authoritative HDM decoder snapshot now that + * MEM_ACTIVE is confirmed and BAR memory is still enabled. Using + * readl() per-dword ensures correct MMIO serialisation and captures + * the final firmware-written values for all fields including SIZE_HIGH, + * which firmware commits to the BAR at MEM_ACTIVE time. + */ + vfio_cxl_reinit_comp_regs(cxl); + pci_disable_device(pdev); + capacity = vfio_cxl_read_committed_decoder_size(vdev, cxl); + if (capacity == 0) { + /* + * TODO: Add handling for devices which do not have + * firmware pre-committed decoders + */ + pci_info(pdev, "Uncommitted region size must be configured via sysfs before bind\n"); + goto regs_failed; + } + + cxl->dpa_size = capacity; + + pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); + + ret = vfio_cxl_create_memdev(cxl, capacity); + if (ret) { + pci_warn(pdev, "Failed to create memdev\n"); + goto regs_failed; + } + /* * Register probing succeeded. Assign vdev->cxl now so that * all subsequent helpers can access state via vdev->cxl. 
@@ -251,6 +304,9 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) return; +regs_failed: + vfio_cxl_clean_virt_regs(cxl); + free_cxl: vfio_cxl_dev_state_free(pdev, cxl); } diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 5626ab51a053..5cd0dfdcb714 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -394,6 +394,48 @@ int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, return 0; } +/* + * vfio_cxl_read_committed_decoder_size - Extract committed DPA capacity from + * comp_reg_virt[]. + * + * Called from probe context after vfio_cxl_reinit_comp_regs() has taken the + * post-MEM_ACTIVE readl() snapshot and patched SIZE_HIGH/SIZE_LOW from DVSEC. + * comp_reg_virt[] is already correct at this point; no hardware access needed. + * + * Returns the committed DPA capacity in bytes, or 0 if the decoder is not + * committed. + */ +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct pci_dev *pdev = vdev->pdev; + resource_size_t capacity; + u32 ctrl, sz_hi, sz_lo; + + if (WARN_ON(!cxl || !cxl->comp_reg_virt)) + return 0; + + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_CTRL_OFFSET(0))); + sz_hi = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(0))); + sz_lo = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_LOW_OFFSET(0))); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) { + pci_dbg(pdev, + "vfio_cxl: decoder0 not committed: ctrl=0x%08x\n", + ctrl); + return 0; + } + + capacity = ((resource_size_t)sz_hi << 32) | (sz_lo & GENMASK(31, 28)); + + pci_dbg(pdev, + "vfio_cxl: decoder0 committed: sz_hi=0x%08x sz_lo=0x%08x capacity=0x%llx\n", + sz_hi, sz_lo, (unsigned long long)capacity); + + return capacity; +} + /* * Called with memory_lock write side held (from vfio_cxl_reactivate_region). 
* Uses the pre-established hdm_iobase, no ioremap() under the lock, diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 8017d92e483f..a9b244f1624c 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -23,6 +23,7 @@ struct vfio_pci_cxl_state { resource_size_t comp_reg_offset; size_t comp_reg_size; __le32 *comp_reg_virt; + size_t dpa_size; void __iomem *hdm_iobase; u16 dvsec_len; u8 hdm_count; @@ -85,5 +86,8 @@ int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, resource_size_t max_size); void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl); #endif /* __LINUX_VFIO_CXL_PRIV_H */ -- Gitee From 775411e92e25ac945623326982ef7be5747ed510 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:10 +0530 Subject: [PATCH 32/42] NVIDIA: VR: SAUCE: vfio/cxl: CXL region management support BugLink: https://bugs.launchpad.net/bugs/2152222 Region Management makes use of APIs provided by CXL_CORE as below: CREATE_REGION flow: 1. Validate request (size, decoder availability) 2. Allocate HPA via cxl_get_hpa_freespace() 3. Allocate DPA via cxl_request_dpa() 4. Create region via cxl_create_region() - commits HDM decoder 5. Get HPA range via cxl_get_region_range() DESTROY_REGION flow: 1. Detach decoder via cxl_decoder_detach() 2. Free DPA via cxl_dpa_free() 3. Release root decoder via cxl_put_root_decoder() Use DEFINE_FREE scope helpers so error paths unwind cleanly. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 799c46dc149544101e6293d3f600a4ba115a270f from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Add borrowed-reference comment for precommitted decoders, init region to NULL, do not unregister precommitted regions in teardown] [kobak: Restored BOS CXL helper providers/exports and vfio-pci-core CXL namespace import so the region-management backport builds against BOS CXL core.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/cxl/core/hdm.c | 97 +++++++++++ drivers/cxl/core/region.c | 251 +++++++++++++++++++++++++++ drivers/cxl/cxl.h | 1 + drivers/vfio/pci/cxl/vfio_cxl_core.c | 122 +++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 8 + drivers/vfio/pci/vfio_pci_core.c | 1 + include/cxl/cxl.h | 29 ++++ 7 files changed, 509 insertions(+) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 5a2c83705a8d..081b763ebd1e 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -565,6 +565,7 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) devm_cxl_dpa_release(cxled); return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dpa_free, "CXL"); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode) @@ -596,6 +597,64 @@ int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, return 0; } +static int find_free_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == (port->hdm_end + 1); +} + +static struct cxl_endpoint_decoder * +cxl_find_free_decoder(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct device *dev; + + 
guard(rwsem_read)(&cxl_rwsem.dpa); + dev = device_find_child(&endpoint->dev, NULL, find_free_decoder); + if (!dev) + return NULL; + + return to_cxl_endpoint_decoder(dev); +} + +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc) +{ + struct cxl_endpoint_decoder *cxled; + int rc; + + if (!IS_ALIGNED(alloc, SZ_256M)) + return ERR_PTR(-EINVAL); + + cxled = cxl_find_free_decoder(cxlmd); + if (!cxled) + return ERR_PTR(-ENODEV); + + rc = cxl_dpa_set_part(cxled, mode); + if (rc) + goto err_put; + + rc = cxl_dpa_alloc(cxled, alloc); + if (rc) + goto err_put; + + return cxled; + +err_put: + put_device(&cxled->cxld.dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(cxl_request_dpa, "CXL"); + static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -679,6 +738,44 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } +static int find_committed_endpoint_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == port->hdm_end; +} + +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct device *cxled_dev; + + if (!endpoint) + return NULL; + + guard(rwsem_read)(&cxl_rwsem.dpa); + cxled_dev = device_find_child(&endpoint->dev, NULL, + find_committed_endpoint_decoder); + if (!cxled_dev) + return NULL; + + cxled = to_cxl_endpoint_decoder(cxled_dev); + *cxlr = cxled->cxld.region; + + put_device(cxled_dev); + return cxled; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_committed_decoder, "CXL"); + static void 
cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) { u16 eig; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 48c7cb61944c..779dfa81a7ff 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -733,6 +733,141 @@ static int free_hpa(struct cxl_region *cxlr) return 0; } +struct cxlrd_max_context { + struct device * const *host_bridges; + int interleave_ways; + unsigned long flags; + resource_size_t max_hpa; + struct cxl_root_decoder *cxlrd; +}; + +static int find_max_hpa(struct device *dev, void *data) +{ + struct cxlrd_max_context *ctx = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_root_decoder *cxlrd; + struct resource *res, *prev; + struct cxl_decoder *cxld; + resource_size_t free = 0; + resource_size_t max; + int found = 0; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + cxlsd = &cxlrd->cxlsd; + cxld = &cxlsd->cxld; + + if ((cxld->flags & ctx->flags) != ctx->flags) { + dev_dbg(dev, "flags not matching: %08lx vs %08lx\n", + cxld->flags, ctx->flags); + return 0; + } + + for (int i = 0; i < ctx->interleave_ways; i++) { + for (int j = 0; j < ctx->interleave_ways; j++) { + if (ctx->host_bridges[i] == cxlsd->target[j]->dport_dev) { + found++; + break; + } + } + } + + if (found != ctx->interleave_ways) { + dev_dbg(dev, + "Not enough host bridges. 
Found %d for %d interleave ways requested\n", + found, ctx->interleave_ways); + return 0; + } + + lockdep_assert_held_read(&cxl_rwsem.region); + res = cxlrd->res->child; + + if (!res) + max = resource_size(cxlrd->res); + else + max = 0; + + for (prev = NULL; res; prev = res, res = res->sibling) { + if (!prev && res->start == cxlrd->res->start && + res->end == cxlrd->res->end) { + max = resource_size(cxlrd->res); + break; + } + if (prev && !resource_size(prev)) + continue; + + if (!prev && res->start > cxlrd->res->start) { + free = res->start - cxlrd->res->start; + max = max(free, max); + } + if (prev && res->start > prev->end + 1) { + free = res->start - prev->end + 1; + max = max(free, max); + } + } + + if (prev && prev->end + 1 < cxlrd->res->end + 1) { + free = cxlrd->res->end + 1 - prev->end + 1; + max = max(free, max); + } + + dev_dbg(&cxlrd->cxlsd.cxld.dev, "found %pa bytes of free space\n", &max); + if (max > ctx->max_hpa) { + if (ctx->cxlrd) + put_device(&ctx->cxlrd->cxlsd.cxld.dev); + get_device(&cxlrd->cxlsd.cxld.dev); + ctx->cxlrd = cxlrd; + ctx->max_hpa = max; + } + return 0; +} + +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max_avail_contig) +{ + struct cxlrd_max_context ctx = { + .flags = flags, + .interleave_ways = interleave_ways, + }; + struct cxl_port *root_port; + struct cxl_port *endpoint; + + endpoint = cxlmd->endpoint; + if (!endpoint) { + dev_dbg(&cxlmd->dev, "endpoint not linked to memdev\n"); + return ERR_PTR(-ENXIO); + } + + ctx.host_bridges = &endpoint->host_bridge; + + struct cxl_root *root __free(put_cxl_root) = find_cxl_root(endpoint); + if (!root) { + dev_dbg(&endpoint->dev, "endpoint is not related to a root port\n"); + return ERR_PTR(-ENXIO); + } + + root_port = &root->port; + scoped_guard(rwsem_read, &cxl_rwsem.region) + device_for_each_child(&root_port->dev, &ctx, find_max_hpa); + + if (!ctx.cxlrd) + return ERR_PTR(-ENOMEM); + + 
*max_avail_contig = ctx.max_hpa; + return ctx.cxlrd; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hpa_freespace, "CXL"); + +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd) +{ + put_device(&cxlrd->cxlsd.cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_put_root_decoder, "CXL"); + static ssize_t size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -2612,6 +2747,27 @@ static void cxl_region_release_action(struct cxl_region *cxlr) unregister_region(cxlr); } +void cxl_unregister_region(struct cxl_region *cxlr) +{ + cxl_region_release_action(cxlr); +} +EXPORT_SYMBOL_NS_GPL(cxl_unregister_region, "CXL"); + +int cxl_get_region_range(struct cxl_region *region, struct range *range) +{ + if (WARN_ON_ONCE(!region)) + return -ENODEV; + + if (!region->params.res) + return -ENOSPC; + + range->start = region->params.res->start; + range->end = region->params.res->end; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_region_range, "CXL"); + static struct lock_class_key cxl_region_key; static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) @@ -3802,6 +3958,101 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, return cxlr; } +DEFINE_FREE(cxl_region_release, struct cxl_region *, + if (!IS_ERR_OR_NULL(_T)) cxl_region_release_action(_T)) + +static struct cxl_region * +__construct_new_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, int ways) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled[0]); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p; + resource_size_t size = 0; + int rc, i, part = READ_ONCE(cxled[0]->part); + + if (part < 0 || part >= cxlds->nr_partitions) { + dev_err(cxlmd->dev.parent, + "%s:%s: invalid partition index %d (max %u)\n", + dev_name(&cxlmd->dev), dev_name(&cxled[0]->cxld.dev), + part, cxlds->nr_partitions); + return ERR_PTR(-ENXIO); + } + + struct cxl_region *cxlr 
__free(cxl_region_release) = + __create_region(cxlrd, cxlds->part[part].mode, + atomic_read(&cxlrd->region_id), + cxled[0]->cxld.target_type); + if (IS_ERR(cxlr)) + return cxlr; + + guard(rwsem_write)(&cxl_rwsem.region); + + p = &cxlr->params; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s unexpected region state\n", + dev_name(&cxlmd->dev), dev_name(&cxled[0]->cxld.dev), + __func__); + return ERR_PTR(-EBUSY); + } + + if (ways < 1) + return ERR_PTR(-EINVAL); + + p->interleave_ways = ways; + p->interleave_granularity = cxld->interleave_granularity; + + scoped_guard(rwsem_read, &cxl_rwsem.dpa) { + for (i = 0; i < ways; i++) { + if (!cxled[i]->dpa_res) + return ERR_PTR(-EINVAL); + size += resource_size(cxled[i]->dpa_res); + } + + rc = alloc_hpa(cxlr, size); + if (rc) + return ERR_PTR(rc); + + for (i = 0; i < ways; i++) { + rc = cxl_region_attach(cxlr, cxled[i], 0); + if (rc) + return ERR_PTR(rc); + } + } + + rc = cxl_region_decode_commit(cxlr); + if (rc) + return ERR_PTR(rc); + + p->state = CXL_CONFIG_COMMIT; + + return no_free_ptr(cxlr); +} + +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways) +{ + struct cxl_region *cxlr; + + mutex_lock(&cxlrd->range_lock); + cxlr = __construct_new_region(cxlrd, cxled, ways); + mutex_unlock(&cxlrd->range_lock); + if (IS_ERR(cxlr)) + return cxlr; + + if (device_attach(&cxlr->dev) <= 0) { + dev_err(&cxlr->dev, "failed to create region\n"); + cxl_region_release_action(cxlr); + return ERR_PTR(-ENODEV); + } + + return cxlr; +} +EXPORT_SYMBOL_NS_GPL(cxl_create_region, "CXL"); + static struct cxl_region * cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa_range) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 7973d1519cc2..08755b9404e6 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -13,6 +13,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops 
*cxl_security_ops; diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 3279d2a89feb..3e2cb2e67f8b 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,13 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +/* + * Scope-based cleanup wrappers for the CXL resource APIs + */ +DEFINE_FREE(cxl_put_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_put_root_decoder(_T)) +DEFINE_FREE(cxl_dpa_free, struct cxl_endpoint_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_dpa_free(_T)) +DEFINE_FREE(cxl_unregister_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) cxl_unregister_region(_T)) + /* * vfio_cxl_create_device_state - Allocate and validate CXL device state * @@ -170,6 +177,115 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, return ret; } +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size) +{ + resource_size_t max_size; + + struct cxl_root_decoder *cxlrd __free(cxl_put_root_decoder) = + cxl_get_hpa_freespace(cxl->cxlmd, 1, + CXL_DECODER_F_RAM | CXL_DECODER_F_TYPE2, + &max_size); + if (IS_ERR(cxlrd)) + return PTR_ERR(cxlrd); + + /* Insufficient HPA space; cxlrd freed automatically by __free() */ + if (max_size < size) + return -ENOSPC; + + struct cxl_endpoint_decoder *cxled __free(cxl_dpa_free) = + cxl_request_dpa(cxl->cxlmd, CXL_PARTMODE_RAM, size); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + + struct cxl_region *region __free(cxl_unregister_region) = + cxl_create_region(cxlrd, &cxled, 1); + if (IS_ERR(region)) + return PTR_ERR(region); + + /* All operations succeeded; transfer ownership to cxl state */ + cxl->cxlrd = no_free_ptr(cxlrd); + cxl->cxled = no_free_ptr(cxled); + cxl->region = no_free_ptr(region); + + return 0; +} + +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl) +{ + if (!cxl->region) + return; + + /* + * Precommitted regions are obtained via 
cxl_get_committed_decoder() as + * a borrowed reference owned by the cxl core; do not unregister or + * free the decoder objects from here. Only vfio_cxl_create_cxl_region() + * owns the region and decoders. + */ + if (!cxl->precommitted) { + cxl_unregister_region(cxl->region); + cxl_dpa_free(cxl->cxled); + cxl_put_root_decoder(cxl->cxlrd); + } + + cxl->region = NULL; + cxl->cxled = NULL; + cxl->cxlrd = NULL; +} + +static int vfio_cxl_create_region_helper(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + struct pci_dev *pdev = vdev->pdev; + struct range range; + int ret; + + if (cxl->precommitted) { + struct cxl_endpoint_decoder *cxled; + /* + * cxl_get_committed_decoder() does not write *region on every + * failure path (e.g. when cxlmd->endpoint is NULL or no decoder + * is committed). Initialise to NULL so the !cxl->region check + * below catches it regardless of stack-init mode. + */ + struct cxl_region *region = NULL; + + cxled = cxl_get_committed_decoder(cxl->cxlmd, &region); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + cxl->cxled = cxled; + cxl->region = region; + } else { + ret = vfio_cxl_create_cxl_region(cxl, capacity); + if (ret) + return ret; + } + + if (!cxl->region) { + pci_err(pdev, "Failed to create CXL region\n"); + ret = -ENODEV; + goto failed; + } + + ret = cxl_get_region_range(cxl->region, &range); + if (ret) + goto failed; + + cxl->region_hpa = range.start; + cxl->region_size = range_len(&range); + + pci_dbg(pdev, "CXL region: HPA 0x%llx size %lu MB\n", + cxl->region_hpa, cxl->region_size >> 20); + + return 0; + +failed: + vfio_cxl_destroy_cxl_region(cxl); + + return ret; +} + static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, resource_size_t capacity) { @@ -284,6 +400,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto regs_failed; } + cxl->precommitted = true; cxl->dpa_size = capacity; pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); @@ 
-294,6 +411,10 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto regs_failed; } + ret = vfio_cxl_create_region_helper(vdev, cxl, capacity); + if (ret) + goto regs_failed; + /* * Register probing succeeded. Assign vdev->cxl now so that * all subsequent helpers can access state via vdev->cxl. @@ -319,6 +440,7 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) return; vfio_cxl_clean_virt_regs(cxl); + vfio_cxl_destroy_cxl_region(cxl); } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index a9b244f1624c..e7d4f38d03be 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -18,6 +18,10 @@ struct vfio_pci_cxl_state { struct cxl_memdev *cxlmd; struct cxl_root_decoder *cxlrd; struct cxl_endpoint_decoder *cxled; + struct cxl_region *region; + resource_size_t region_hpa; + size_t region_size; + void *region_vaddr; resource_size_t hdm_reg_offset; size_t hdm_reg_size; resource_size_t comp_reg_offset; @@ -29,6 +33,7 @@ struct vfio_pci_cxl_state { u8 hdm_count; u8 comp_reg_bar; bool cache_capable; + bool precommitted; }; /* Register access sizes */ @@ -89,5 +94,8 @@ void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); resource_size_t vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, struct vfio_pci_cxl_state *cxl); +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size); +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); #endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 2ec3e9c47eff..07dfc1c84b3d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2608,3 +2608,4 @@ module_exit(vfio_pci_core_cleanup); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_IMPORT_NS("CXL"); diff --git a/include/cxl/cxl.h 
b/include/cxl/cxl.h index 87fb52e9c47f..b3287411460d 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -177,6 +177,18 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +/* + * cxl_decoder flags that define the type of memory / devices this decoder + * supports as well as configuration lock status. + */ +#define CXL_DECODER_F_RAM BIT(0) +#define CXL_DECODER_F_PMEM BIT(1) +#define CXL_DECODER_F_TYPE2 BIT(2) +#define CXL_DECODER_F_TYPE3 BIT(3) +#define CXL_DECODER_F_LOCK BIT(4) +#define CXL_DECODER_F_ENABLE BIT(5) +#define CXL_DECODER_F_MASK GENMASK(5, 0) + struct cxl_memdev_attach { int (*probe)(struct cxl_memdev *cxlmd); }; @@ -274,6 +286,23 @@ int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); struct cxl_region; +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr); +int cxl_get_region_range(struct cxl_region *region, struct range *range); +void cxl_unregister_region(struct cxl_region *cxlr); +struct cxl_port; +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max); +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd); +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways); int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); #ifdef CONFIG_CXL_REGION -- Gitee From 30f833a819cb761d321ab7b3de4ace6a5e45423a Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:11 +0530 Subject: [PATCH 33/42] NVIDIA: VR: SAUCE: vfio/cxl: DPA VFIO region with demand fault mmap and reset zap BugLink: 
https://bugs.launchpad.net/bugs/2152222 Wire the CXL DPA range up as a VFIO demand-paged region so QEMU can mmap guest device memory directly. Faults call vmf_insert_pfn() to insert one PFN at a time rather than mapping the full range upfront. CXL region lifecycle: - The CXL memory region is registered with VFIO layer during vfio_pci_open_device - mmap() establishes the VMA with vm_ops but inserts no PTEs - Each guest page fault calls vfio_cxl_region_page_fault() which inserts a single PFN under the memory_lock read side - On device reset, vfio_cxl_zap_region_locked() sets region_active=false and calls unmap_mapping_range() to invalidate all DPA PTEs atomically while holding memory_lock for writing - Faults racing with reset see region_active==false and return VM_FAULT_SIGBUS - vfio_cxl_reactivate_region() restores region_active after successful hardware reset Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so that FLR correctly invalidates DPA mappings and restores them on success. Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit f5e419121227 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve context mismatches in vfio_pci_core.c and vfio_pci_priv.h due to missing upstream dmabuf support in NV-Kernels base, Add vdev back-pointer in cxl_state, hold memory_lock read-side in fault/rw paths, advance *ppos in region rw, add vfio_direct_config_read export and use it instead of vfio_raw_config_read in DVSEC fallback] [kobak: Preserved existing VFIO PCI DMABUF reset movement while adding CXL DPA zap/reactivation.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 232 ++++++++++++++++++++++++++- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 2 +- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 14 +- drivers/vfio/pci/vfio_pci_core.c | 11 ++ drivers/vfio/pci/vfio_pci_priv.h | 6 + 5 files changed, 262 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 3e2cb2e67f8b..04f6f5cb47f6 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -419,8 +419,10 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) * Register probing succeeded. Assign vdev->cxl now so that * all subsequent helpers can access state via vdev->cxl. * All failure paths below clear vdev->cxl before calling - * vfio_cxl_dev_state_free(). + * vfio_cxl_dev_state_free(). cxl->vdev is the back-pointer used + * by vm_fault and other helpers that only have the cxl state in hand. */ + cxl->vdev = vdev; vdev->cxl = cxl; return; @@ -443,4 +445,232 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) vfio_cxl_destroy_cxl_region(cxl); } +static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf) +{ + struct vfio_pci_region *region = vmf->vma->vm_private_data; + struct vfio_pci_cxl_state *cxl = region->data; + struct vfio_pci_core_device *vdev = cxl->vdev; + unsigned long pgoff; + unsigned long pfn; + vm_fault_t ret; + + /* + * Hold memory_lock read side across the region_active check and the + * vmf_insert_pfn so the reset path cannot run unmap_mapping_range + * between the two and leave a stale PTE pointing at the pre-reset HPA. + * vfio_cxl_prepare_reset holds the write side while it clears + * region_active and zaps existing PTEs. 
+ */ + down_read(&vdev->memory_lock); + + if (!cxl->region_active) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + pgoff = vmf->pgoff & + ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (pgoff >= (cxl->region_size >> PAGE_SHIFT)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + pfn = PHYS_PFN(cxl->region_hpa) + pgoff; + ret = vmf_insert_pfn(vmf->vma, vmf->address, pfn); + +out: + up_read(&vdev->memory_lock); + return ret; +} + +static const struct vm_operations_struct vfio_cxl_region_vm_ops = { + .fault = vfio_cxl_region_vm_fault, +}; + +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u64 req_len, pgoff, end; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) && + (vma->vm_flags & VM_READ)) + return -EPERM; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) && + (vma->vm_flags & VM_WRITE)) + return -EPERM; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) + return -EOVERFLOW; + + if (end > cxl->region_size) + return -EINVAL; + + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); + + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); + + vma->vm_ops = &vfio_cxl_region_vm_ops; + vma->vm_private_data = region; + + return 0; +} + +/* + * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs. + * + * Must be called with vdev->memory_lock held for writing. Sets + * region_active=false before zapping so any subsequent I/O to the region + * sees the inactive state and returns an error rather than accessing + * stale mappings. 
+ */ +void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + + WRITE_ONCE(cxl->region_active, false); +} + +/* + * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset. + * + * Must be called with vdev->memory_lock held for writing. Re-reads the + * HDM decoder state from hardware (FLR cleared it) and sets region_active + * so that subsequent I/O to the region is permitted again. + */ +void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + /* + * Re-initialise the emulated HDM comp_reg_virt[] from hardware. + * After FLR the decoder registers read as zero; mirror that in + * the emulated state so QEMU sees a clean slate. + */ + vfio_cxl_reinit_comp_regs(cxl); + + /* + * Only re-enable the DPA mmap if the hardware has actually + * re-committed decoder 0 after FLR. Read the COMMITTED bit from the + * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR + * hardware state, not stale pre-reset state. + * + * If COMMITTED is 0 (slow firmware re-commit path), leave + * region_active=false. Guest faults will return VM_FAULT_SIGBUS + * until the decoder is re-committed and the region is re-enabled. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + /* + * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset + * (now CXL.mem-relative) plus the within-HDM-block offset. 
+ */ + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } +} + +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_cxl_state *cxl = core_dev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret; + + if (!count || pos >= cxl->region_size) + return 0; + + /* + * Hold memory_lock read side across the region_active check and the + * user copy. vfio_cxl_prepare_reset() holds the write side while it + * clears region_active and unmaps the inode range; without the read + * side here, the copy could still touch cxl->region_vaddr after the + * reset has begun. Guard against access after a failed reset + * (region_active=false) or a release race (region_vaddr=NULL): either + * means the memremap'd window is no longer valid; touching it would + * produce a Synchronous External Abort. + */ + down_read(&core_dev->memory_lock); + + if (!cxl->region_active || !cxl->region_vaddr) { + ret = -EIO; + goto out; + } + + count = min(count, (size_t)(cxl->region_size - pos)); + + if (iswrite) { + if (copy_from_user(cxl->region_vaddr + pos, buf, count)) { + ret = -EFAULT; + goto out; + } + } else { + if (copy_to_user(buf, cxl->region_vaddr + pos, count)) { + ret = -EFAULT; + goto out; + } + } + + /* + * vfio_pci_rw() returns the region rw result verbatim and relies on + * the handler to advance *ppos. Without this, successive read/write + * syscalls on the DPA region keep operating at the same offset + * instead of advancing. 
+ */ + *ppos += count; + ret = count; + +out: + up_read(&core_dev->memory_lock); + return ret; +} + +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_cxl_state *cxl = region->data; + + /* + * Deactivate the region before removing user mappings so that any + * fault handler racing the release returns VM_FAULT_SIGBUS rather + * than inserting a PFN into an unmapped region. + */ + WRITE_ONCE(cxl->region_active, false); + + if (cxl->region_vaddr) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + } +} + +static const struct vfio_pci_regops vfio_cxl_regops = { + .rw = vfio_cxl_region_rw, + .mmap = vfio_cxl_region_mmap, + .release = vfio_cxl_region_release, +}; + MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 5cd0dfdcb714..1b65260c80ce 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -33,7 +33,7 @@ * +0x1c: (reserved) */ -static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) { /* * hdm_off is a byte offset within the HDM decoder block. diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index e7d4f38d03be..088ea882a656 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -12,9 +12,18 @@ #include #include -/* CXL device state embedded in vfio_pci_core_device */ +struct vfio_pci_core_device; + +/* + * CXL device state embedded in vfio_pci_core_device. + * + * cxlds must be the first field: devm_cxl_dev_state_create() asserts + * offsetof(cxlds) == 0 so CXL core's container_of() lookups land back + * on this struct. 
+ */ struct vfio_pci_cxl_state { struct cxl_dev_state cxlds; + struct vfio_pci_core_device *vdev; struct cxl_memdev *cxlmd; struct cxl_root_decoder *cxlrd; struct cxl_endpoint_decoder *cxled; @@ -34,6 +43,7 @@ struct vfio_pci_cxl_state { u8 comp_reg_bar; bool cache_capable; bool precommitted; + bool region_active; }; /* Register access sizes */ @@ -98,4 +108,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, resource_size_t size); void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off); + #endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 07dfc1c84b3d..e4544b078aa3 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1231,6 +1231,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); + /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ + vfio_cxl_zap_region_locked(vdev); + /* * This function can be invoked while the power state is non-D0. If * pci_try_reset_function() has been called while the power state is @@ -1246,6 +1249,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, ret = pci_try_reset_function(vdev->pdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, false); + + /* + * Re-enable DPA region if reset succeeded; fault handler will + * re-insert PFNs on next access without requiring a new mmap. 
+ */ + if (!ret) + vfio_cxl_reactivate_region(vdev); + up_write(&vdev->memory_lock); return ret; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index c19269362d06..85a262ff6cad 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -149,6 +149,8 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); +void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); +void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); #else @@ -156,6 +158,10 @@ static inline void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } static inline void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } #endif /* CONFIG_VFIO_CXL_CORE */ -- Gitee From acf1dfef3bf1689294f627c79b001d72318ee393 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:12 +0530 Subject: [PATCH 34/42] NVIDIA: VR: SAUCE: vfio/cxl: Virtualize CXL DVSEC config writes BugLink: https://bugs.launchpad.net/bugs/2152222 CXL devices expose DVSEC registers in PCI configuration space. Several of them affect device behavior (CXL.io/CXL.mem/CXL.cache enables, lock state, range bases) and must be virtualised so the guest cannot disturb host-owned policy. Add CXL-aware read and write handlers that operate on vdev->vconfig: - DVSEC reads come back from the vconfig shadow that vfio_config_init() already populates via vfio_ecap_init(). - DVSEC writes go through per-register handlers (cxl_dvsec_*_write) which apply the spec-defined reserved-bit and lock-bit masking before updating the shadow. 
- The handlers are wired in via vdev->dvsec_readfn / dvsec_writefn, which the global ecap_perms[PCI_EXT_CAP_ID_DVSEC] dispatcher routes to when the device is a CXL device. Non-CXL devices with a DVSEC capability fall through to direct hardware access. Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 3ff6c19fc5176d2468e159d6d390b6d235b416e2 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve context mismatches in Makefile and vfio_pci_core.h due to missing upstream dmabuf/p2pdma forward declarations in NV-Kernels base, Carry Disable_Caching into Cache WBI hardware write, use vfio_direct_config_read fallback, add byte-aligned read/write routing for DVSEC registers, handle partial-byte W1C writes for STATUS/STATUS2, add PM_INIT_COMPLETION RW1CS handling] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/cxl/vfio_cxl_config.c | 344 +++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_core.c | 4 +- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 43 +++- drivers/vfio/pci/vfio_pci_config.c | 50 +++- drivers/vfio/pci/vfio_pci_priv.h | 7 + include/linux/vfio_pci_core.h | 7 + include/uapi/cxl/cxl_regs.h | 98 +++++++ 8 files changed, 541 insertions(+), 14 deletions(-) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_config.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index c735b4c665c7..5fe6fe78b0cf 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o cxl/vfio_cxl_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c new file mode 100644 index 000000000000..35d35e2ded9b --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CXL DVSEC configuration space emulation for vfio-pci. + * + * Integrates into the existing vfio-pci-core ecap_perms[] framework using + * vdev->vconfig as the sole shadow buffer for DVSEC registers. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +static inline u16 _cxlds_get_dvsec(struct vfio_pci_cxl_state *cxl) +{ + return (u16)cxl->cxlds.cxl_dvsec; +} + +/* Helpers to access vdev->vconfig at a DVSEC-relative offset */ +static inline u16 dvsec_virt_read16(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return get_unaligned_le16(vdev->vconfig + dvsec + off); +} + +static inline void dvsec_virt_write16(struct vfio_pci_core_device *vdev, + u16 off, u16 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + put_unaligned_le16(val, vdev->vconfig + dvsec + off); +} + +static inline u32 dvsec_virt_read32(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return get_unaligned_le32(vdev->vconfig + dvsec + off); +} + +static inline void dvsec_virt_write32(struct vfio_pci_core_device *vdev, + u16 off, u32 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + put_unaligned_le32(val, vdev->vconfig + dvsec + off); +} + +static u32 dvsec_virt_merge_write32(struct vfio_pci_core_device *vdev, + u16 off, u16 byte_in_reg, + int count, __le32 val) +{ + u32 cur = dvsec_virt_read32(vdev, off); + u32 data = le32_to_cpu(val); + u32 mask; + + if (byte_in_reg + count > sizeof(u32)) + return cur; + + if (count == sizeof(u32)) + return data; + + mask = (1U << (count * 8)) - 1; + mask <<= byte_in_reg * 8; + + return (cur & ~mask) | ((data << (byte_in_reg * 8)) & mask); +} + +/* Individual DVSEC register write handlers */ + +static void cxl_dvsec_control_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 lock = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL_RESERVED_MASK; + + if (lock & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; /* register is locked after first write */ + + if (!(cap3 & CXL_DVSEC_CAP3_P2P_MEM_CAPABLE)) + 
rev_mask |= CXL_CTRL_P2P_REV_MASK; + + new_val &= ~rev_mask; + new_val |= CXL_DVSEC_CTRL_IO_ENABLE; /* IO_Enable always returns 1 */ + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, new_val); +} + +static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS_OFFSET); + + /* + * VIRAL_STATUS (bit 14) is the only writable bit; all others are + * reserved and always zero. + */ + new_val = cur_val & ~(new_val & CXL_DVSEC_STATUS_VIRAL_STATUS); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); +} + +static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_CONTROL2_OFFSET; + u16 cap2 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY2_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL2_RESERVED_MASK; + + if (!(cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY)) + rev_mask |= CXL_CTRL2_VOLATILE_HDM_REV_MASK; + if (!(cap2 & CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE)) + rev_mask |= CXL_CTRL2_MODIFIED_COMP_REV_MASK; + + new_val &= ~rev_mask; + + /* + * Cache WBI: forward to hardware. cxl_dev_reset() programs + * Disable_Caching first and then initiates Cache WBI with that bit + * still set; replicate that ordering for VMM-delegated WBI by carrying + * the just-written Disable_Caching value into the same hardware write. + * new_val is the post-merge 16-bit shadow value, so it already reflects + * a prior shadow-only Disable_Caching=1 followed by a WBI-only write. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI) { + u16 hw_val = CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI; + + if (new_val & CXL_DVSEC_CTRL2_DISABLE_CACHING) + hw_val |= CXL_DVSEC_CTRL2_DISABLE_CACHING; + pci_write_config_word(pdev, abs_off, hw_val); + } + + /* + * CXL Reset: not yet supported - do not forward to HW. 
+ * TODO: invoke CXL protocol reset via cxl subsystem + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) + pci_warn(pdev, "vfio-cxl: CXL reset requested but not yet supported\n"); + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, + new_val & ~CXL_CTRL2_HW_BITS_MASK); +} + +static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; + + /* RW1CS: write 1 to clear, but only if the capability is supported */ + if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); + /* STATUS2 is not mirrored in vconfig - reads go to hardware */ +} + +static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + + /* Once the LOCK bit is set it can only be cleared by conventional reset */ + if (cur_val & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; + + new_val &= ~CXL_LOCK_RESERVED_MASK; + dvsec_virt_write16(vdev, CXL_DVSEC_LOCK_OFFSET, new_val); +} + +static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, + u16 dvsec_off, u32 new_val) +{ + new_val &= ~CXL_BASE_LO_RESERVED_MASK; + dvsec_virt_write32(vdev, dvsec_off, new_val); +} + +/** + * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to read + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Output buffer for the read value (little-endian) + * + * Called via vfio_pci_dvsec_dispatch_read() for CXL devices. 
Returns shadow + * vconfig values for virtualized DVSEC registers (CONTROL, STATUS, CONTROL2, + * LOCK) so that userspace reads reflect emulated state rather than raw + * hardware. All other DVSEC bytes pass through to vfio_raw_config_read(). + * + * Return: @count on success, or negative error code from the fallback read. + */ +static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = cxl ? _cxlds_get_dvsec(cxl) : 0; /* NULL-checked below */ + u16 dvsec_off; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_read(vdev, pos, count, perm, offset, val); + + dvsec_off = (u16)pos - dvsec; + + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + case CXL_DVSEC_STATUS_OFFSET: + case CXL_DVSEC_CONTROL2_OFFSET: + case CXL_DVSEC_LOCK_OFFSET: + /* Return shadow vconfig value for virtualized registers */ + memcpy(val, vdev->vconfig + pos, count); + return count; + default: + return vfio_raw_config_read(vdev, pos, count, + perm, offset, val); + } +} + +/** + * vfio_cxl_dvsec_writefn - ecap_perms write handler for PCI_EXT_CAP_ID_DVSEC. + * + * Installed once into ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn by + * vfio_pci_init_perm_bits() when CONFIG_VFIO_CXL_CORE=y. Applies to every + * device opened under vfio-pci; the vdev->cxl NULL check distinguishes CXL + * devices from non-CXL devices that happen to expose a DVSEC capability. + * + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to write + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Value to write (little-endian) + * + * Return: @count on success; non-CXL devices continue to + * vfio_raw_config_write() which also returns @count or negative error. 
+ */ +static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = cxl ? _cxlds_get_dvsec(cxl) : 0; /* NULL-checked below */ + u16 abs_off = (u16)pos; + u16 dvsec_off, dword_start, byte_in_dword; + u16 wval16; + u32 wval32; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + pci_dbg(vdev->pdev, + "vfio_cxl: DVSEC write: abs=0x%04x dvsec_off=0x%04x count=%d raw_val=0x%08x\n", + abs_off, abs_off - dvsec, count, le32_to_cpu(val)); + + dvsec_off = abs_off - dvsec; + + dword_start = dvsec_off & ~3u; + byte_in_dword = dvsec_off - dword_start; + + switch (dword_start) { + case CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET: + case CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET: + wval32 = dvsec_virt_merge_write32(vdev, dword_start, byte_in_dword, count, val); + dvsec_virt_write32(vdev, dword_start, wval32); + return count; + case CXL_DVSEC_RANGE1_BASE_LOW_OFFSET: + case CXL_DVSEC_RANGE2_BASE_LOW_OFFSET: + wval32 = dvsec_virt_merge_write32(vdev, dword_start, byte_in_dword, count, val); + cxl_range_base_lo_write(vdev, dword_start, wval32); + return count; + } + + /* Route to the appropriate per-register handler */ + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control_write(vdev, wval16); + break; + case CXL_DVSEC_STATUS_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status_write(vdev, wval16); + break; + case CXL_DVSEC_CONTROL2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control2_write(vdev, wval16); + break; + case CXL_DVSEC_STATUS2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status2_write(vdev, wval16); + break; + case CXL_DVSEC_LOCK_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_lock_write(vdev, wval16); + break; + default: + /* RO registers: header, capability, range sizes - discard */ + break; + } 
+ + return count; +} + +/** + * vfio_cxl_setup_dvsec_perms - Install per-device CXL DVSEC read/write hooks. + * @vdev: VFIO PCI core device + * + * Called once per device open after vfio_config_init() has seeded vdev->vconfig + * from hardware. Installs vfio_cxl_dvsec_readfn and vfio_cxl_dvsec_writefn + * as per-device DVSEC handlers so that the global ecap_perms[DVSEC] dispatcher + * routes reads and writes through CXL-aware emulation. + * + * Forces CXL.io IO_ENABLE in the CONTROL vconfig shadow at init time so the + * initial guest read returns the correct value before the first write. + */ +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) +{ + u16 ctrl = dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL_OFFSET); + + vdev->dvsec_readfn = vfio_cxl_dvsec_readfn; + vdev->dvsec_writefn = vfio_cxl_dvsec_writefn; + + /* Force IO_ENABLE; cxl_dvsec_control_write() maintains this invariant. */ + ctrl |= CXL_DVSEC_CTRL_IO_ENABLE; + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, ctrl); +} +EXPORT_SYMBOL_GPL(vfio_cxl_setup_dvsec_perms); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 04f6f5cb47f6..4d35ab137027 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -73,13 +73,13 @@ vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI * sequence is needed before FLR. 
*/ - if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) || + if (!FIELD_GET(CXL_DVSEC_CAP_MEM_CAPABLE, cap_word) || (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) { devm_kfree(&pdev->dev, cxl); return ERR_PTR(-ENODEV); } - cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word); + cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word); return cxl; } diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 088ea882a656..baf2d2dffa74 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -86,14 +86,43 @@ struct vfio_pci_cxl_state { #define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0) /* - * CXL DVSEC for CXL Devices - register offsets within the DVSEC - * (CXL 4.0 8.1.3). - * Offsets are relative to the DVSEC capability base (cxl->dvsec). + * DVSEC register offsets and per-bit hardware definitions are in + * <uapi/cxl/cxl_regs.h> as CXL_DVSEC_*. The masks below encode + * emulation policy: which bits to ignore, which to preserve separately + * from their raw hardware state. */ -#define CXL_DVSEC_CAPABILITY_OFFSET 0xa -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ -#define CXL_DVSEC_CACHE_CAPABLE BIT(0) +/* DVSEC Control (0x0C): bits 13 (RsvdP) and 15 (RsvdP) are always discarded */ +#define CXL_CTRL_RESERVED_MASK (BIT(13) | BIT(15)) +/* bit 12 (P2P_Mem_Enable) treated as reserved if Cap3.P2P_Mem_Capable=0 */ +#define CXL_CTRL_P2P_REV_MASK CXL_DVSEC_CTRL_P2P_MEM_ENABLE + +/* DVSEC Status (0x0E): bits 13:0 and 15 are RsvdZ */ +#define CXL_STATUS_RESERVED_MASK (GENMASK(13, 0) | BIT(15)) + +/* + * DVSEC Control2 (0x10) emulation masks. + * + * CXL_CTRL2_HW_BITS_MASK: bits 1 (Initiate_Cache_WBI) and 2 + * (Initiate_CXL_Reset) always read 0 from hardware — they are write-only + * action triggers per CXL 4.0 §8.1.3.8 Table 8-8. 
Forward these to the + * device to trigger the hardware action; clear them from vconfig shadow so + * that subsequent guest reads return 0 as hardware requires. + * + * NOTE: bit 0 (Disable_Caching) and bit 3 (CXL_Reset_Mem_Clr_Enable) are + * ordinary RW fields - they must be preserved in vconfig, not forwarded. + */ +#define CXL_CTRL2_RESERVED_MASK GENMASK(15, 6) +#define CXL_CTRL2_HW_BITS_MASK (BIT(1) | BIT(2)) +/* bit 4 is RsvdP if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_CTRL2_VOLATILE_HDM_REV_MASK CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM +/* bit 5 is RsvdP if Cap2.Mod_Completion_Capable=0 */ +#define CXL_CTRL2_MODIFIED_COMP_REV_MASK CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE + +/* DVSEC Lock (0x14): bits 15:1 are RsvdP */ +#define CXL_LOCK_RESERVED_MASK GENMASK(15, 1) + +/* DVSEC Range Base Low: bits 27:0 are reserved per Tables 8-15/8-19 */ +#define CXL_BASE_LO_RESERVED_MASK CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, struct vfio_pci_cxl_state *cxl, diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index ac2d7f5fa40e..d50a8749ae07 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -245,9 +245,9 @@ static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, } /* Allow direct read from hardware, except for capability next pointer */ -static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int ret; @@ -1085,6 +1085,47 @@ static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) return 0; } +/* + * vfio_pci_dvsec_dispatch_read - per-device DVSEC read dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn at module init.
+ * Calls vdev->dvsec_readfn when a shadow-read handler has been registered + * (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices); otherwise + * fall back to vfio_direct_config_read so non-CXL DVSEC devices keep the + * extended-cap header mangling that the default ecap readfn applies. + */ +static int vfio_pci_dvsec_dispatch_read(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + if (vdev->dvsec_readfn) + return vdev->dvsec_readfn(vdev, pos, count, perm, offset, val); + return vfio_direct_config_read(vdev, pos, count, perm, offset, val); +} + +/* + * vfio_pci_dvsec_dispatch_write - per-device DVSEC write dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn at module init. + * Calls vdev->dvsec_writefn when a handler has been registered for this + * device (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), + * otherwise proceed to vfio_raw_config_write so that non-CXL devices + * with a DVSEC capability continue to pass writes to hardware. + * + * This indirection allows per-device DVSEC handlers to be registered + * without touching the global ecap_perms[] table. 
+ */ +static int vfio_pci_dvsec_dispatch_write(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + if (vdev->dvsec_writefn) + return vdev->dvsec_writefn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_write(vdev, pos, count, perm, offset, val); +} + /* * Initialize the shared permission tables */ @@ -1121,7 +1162,8 @@ int __init vfio_pci_init_perm_bits(void) ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; - ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn = vfio_pci_dvsec_dispatch_read; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_pci_dvsec_dispatch_write; if (ret) vfio_pci_uninit_perm_bits(); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 85a262ff6cad..350dea708994 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -49,6 +49,10 @@ int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val); +int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -151,6 +155,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); #else @@ -162,6 +167,8 @@ static inline void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } static inline void 
vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } #endif /* CONFIG_VFIO_CXL_CORE */ diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 6e8885f79d26..6158a12b9b27 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -30,6 +30,7 @@ struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_attachment; struct vfio_pci_cxl_state; +struct perm_bits; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -139,6 +140,12 @@ struct vfio_pci_core_device { struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; struct vfio_pci_cxl_state *cxl; + int (*dvsec_readfn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + int (*dvsec_writefn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h index aa9fea9b88f7..6a6507ebf319 100644 --- a/include/uapi/cxl/cxl_regs.h +++ b/include/uapi/cxl/cxl_regs.h @@ -60,4 +60,102 @@ #define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) #define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) +/* + * CXL r4.0 8.1.3: DVSEC for CXL Devices + * + * Register offsets are relative to the DVSEC capability base address, + * as discovered via PCI_EXT_CAP_ID_DVSEC with DVSEC ID 0x0. + * All registers in this section are 16-bit wide. 
+ */ + +/* DVSEC register offsets */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0x0a +#define CXL_DVSEC_CONTROL_OFFSET 0x0c +#define CXL_DVSEC_STATUS_OFFSET 0x0e +#define CXL_DVSEC_CONTROL2_OFFSET 0x10 +#define CXL_DVSEC_STATUS2_OFFSET 0x12 +#define CXL_DVSEC_LOCK_OFFSET 0x14 +#define CXL_DVSEC_CAPABILITY2_OFFSET 0x16 +#define CXL_DVSEC_RANGE1_SIZE_HIGH_OFFSET 0x18 +#define CXL_DVSEC_RANGE1_SIZE_LOW_OFFSET 0x1c +#define CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET 0x20 +#define CXL_DVSEC_RANGE1_BASE_LOW_OFFSET 0x24 +#define CXL_DVSEC_RANGE2_SIZE_HIGH_OFFSET 0x28 +#define CXL_DVSEC_RANGE2_SIZE_LOW_OFFSET 0x2c +#define CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET 0x30 +#define CXL_DVSEC_RANGE2_BASE_LOW_OFFSET 0x34 +#define CXL_DVSEC_CAPABILITY3_OFFSET 0x38 + +/* DVSEC Range Base Low registers: bits [27:0] are reserved */ +#define CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK __GENMASK(27, 0) + +/* CXL r4.0 8.1.3.1 Table 8-5 DVSEC CXL Capability (offset 0x0A) */ +#define CXL_DVSEC_CAP_CACHE_CAPABLE _BITUL(0) +#define CXL_DVSEC_CAP_IO_CAPABLE _BITUL(1) +#define CXL_DVSEC_CAP_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_CAP_MEM_HW_INIT_MODE _BITUL(3) +#define CXL_DVSEC_CAP_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP_CACHE_WBI_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP_CXL_RESET_CAPABLE _BITUL(7) +#define CXL_DVSEC_CAP_CXL_RESET_TIMEOUT_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CAP_CXL_RESET_MEM_CLR_CAPABLE _BITUL(11) +#define CXL_DVSEC_CAP_TSP_CAPABLE _BITUL(12) +#define CXL_DVSEC_CAP_MLD_CAPABLE _BITUL(13) +#define CXL_DVSEC_CAP_VIRAL_CAPABLE _BITUL(14) +#define CXL_DVSEC_CAP_PM_INIT_REPORTING_CAPABLE _BITUL(15) + +/* CXL r4.0 8.1.3.2 Table 8-6 DVSEC CXL Control (offset 0x0C) */ +#define CXL_DVSEC_CTRL_CACHE_ENABLE _BITUL(0) +#define CXL_DVSEC_CTRL_IO_ENABLE _BITUL(1) +#define CXL_DVSEC_CTRL_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_CTRL_CACHE_SF_COVERAGE_MASK __GENMASK(7, 3) +#define CXL_DVSEC_CTRL_CACHE_SF_GRANULARITY_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CTRL_CACHE_CLEAN_EVICTION _BITUL(11) +#define 
CXL_DVSEC_CTRL_P2P_MEM_ENABLE _BITUL(12) +/* bit 13: RsvdP */ +#define CXL_DVSEC_CTRL_VIRAL_ENABLE _BITUL(14) +/* bit 15: RsvdP */ + +/* CXL r4.0 8.1.3.3 Table 8-7 DVSEC CXL Status (offset 0x0E) */ +/* bits 13:0 = RsvdZ */ +#define CXL_DVSEC_STATUS_VIRAL_STATUS _BITUL(14) +/* bit 15 = RsvdZ */ + +/* CXL r4.0 8.1.3.4 Table 8-8 DVSEC CXL Control2 (offset 0x10) */ +#define CXL_DVSEC_CTRL2_DISABLE_CACHING _BITUL(0) +#define CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI _BITUL(1) +#define CXL_DVSEC_CTRL2_INITIATE_CXL_RESET _BITUL(2) +#define CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE _BITUL(3) +#define CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM _BITUL(4) +#define CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE _BITUL(5) +/* bits 15:6 = RsvdP */ + +/* CXL r4.0 8.1.3.5 Table 8-9 DVSEC CXL Status2 (offset 0x12) */ +#define CXL_DVSEC_STATUS2_CACHE_INVALID _BITUL(0) +#define CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE _BITUL(1) +#define CXL_DVSEC_STATUS2_CXL_RESET_ERROR _BITUL(2) +/* RW1CS; RsvdZ if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR _BITUL(3) +/* bits 14:4 = RsvdZ */ +#define CXL_DVSEC_STATUS2_PM_INIT_COMPLETION _BITUL(15) + +/* CXL r4.0 8.1.3.6 Table 8-10 DVSEC CXL Lock (offset 0x14) */ +#define CXL_DVSEC_LOCK_CONFIG_LOCK _BITUL(0) +/* bits 15:1 = RsvdP */ + +/* CXL r4.0 8.1.3.7 Table 8-11 DVSEC CXL Capability2 (offset 0x16) */ +#define CXL_DVSEC_CAP2_CACHE_SIZE_UNIT_MASK __GENMASK(3, 0) +#define CXL_DVSEC_CAP2_FALLBACK_CAPABILITY_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP2_NO_CLEAN_WRITEBACK _BITUL(7) +#define CXL_DVSEC_CAP2_CACHE_SIZE_MASK __GENMASK(15, 8) + +/* CXL r4.0 8.1.3.14 Table 8-20 DVSEC CXL Capability3 (offset 0x38) */ +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_COLD_RESET _BITUL(0) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_WARM_RESET _BITUL(1) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_HOT_RESET _BITUL(2) +#define CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY _BITUL(3) +#define
CXL_DVSEC_CAP3_P2P_MEM_CAPABLE _BITUL(4) +/* bits 15:5 = RsvdP */ + #endif /* _UAPI_CXL_REGS_H_ */ -- Gitee From 9bf11f3406d75f2f1c263b01568f1d91cdf11fd4 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:13 +0530 Subject: [PATCH 35/42] NVIDIA: VR: SAUCE: vfio/cxl: Register regions with VFIO layer BugLink: https://bugs.launchpad.net/bugs/2152222 Register the DPA and component register region with VFIO layer. Region indices for both these regions are cached for quick lookup. vfio_cxl_register_cxl_region() - memremap(WB) the region HPA (treat CXL.mem as RAM, not MMIO) - Register VFIO_REGION_SUBTYPE_CXL - Records dpa_region_idx. vfio_cxl_register_comp_regs_region() - Registers VFIO_REGION_SUBTYPE_CXL_COMP_REGS with size hdm_reg_offset + hdm_reg_size - Records comp_reg_region_idx. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 6e2d9e5f273d9088c00fc3a766a224dd8abc7903 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Check HDM COMMITTED bit before activating DPA region on precommitted decoders, add pm_runtime/memory-enabled gate in fault and rw paths, split vfio_cxl_zap_dpa() from prepare_reset(), add DPA zap in vfio_pci_zap_and_down_write_memory_lock(), add hot-reset CXL prepare/finish passes] [kobak: Withheld DPA mmap advertisement on BOS until CPU-readable backing for CXL DPA PFNMAP can be proven; DPA fd read/write remains advertised.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 118 ++++++++++++++++++++++++++- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 34 ++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 2 + drivers/vfio/pci/vfio_pci.c | 23 ++++++ drivers/vfio/pci/vfio_pci_priv.h | 11 +++ 5 files changed, 187 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 4d35ab137027..e3da65e2385a 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -80,6 +80,8 @@ vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) } cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word); + cxl->dpa_region_idx = -1; + cxl->comp_reg_region_idx = -1; return cxl; } @@ -537,14 +539,19 @@ static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, */ void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { + struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = vdev->cxl; lockdep_assert_held_write(&vdev->memory_lock); - if (!cxl) + if (!cxl || cxl->dpa_region_idx < 0) return; WRITE_ONCE(cxl->region_active, false); + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); } /* @@ -652,6 +659,7 @@ static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { + struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = region->data; /* @@ -661,6 +669,16 @@ static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, */ WRITE_ONCE(cxl->region_active, false); + /* + * Remove all user mappings of the DPA region while the device is + * still alive. 
+ */ + if (cxl->dpa_region_idx >= 0) + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); + if (cxl->region_vaddr) { memunmap(cxl->region_vaddr); cxl->region_vaddr = NULL; @@ -673,4 +691,102 @@ static const struct vfio_pci_regops vfio_cxl_regops = { .release = vfio_cxl_region_release, }; +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags; + int ret; + + if (!cxl) + return -ENODEV; + + if (!cxl->region || cxl->region_vaddr) + return -ENODEV; + + /* + * CXL device memory is RAM, not MMIO. Use memremap() rather than + * ioremap_cache() so the correct memory-mapping API is used. + * The WB attribute matches the cache-coherent nature of CXL.mem. + */ + cxl->region_vaddr = memremap(cxl->region_hpa, cxl->region_size, + MEMREMAP_WB); + if (!cxl->region_vaddr) + return -ENOMEM; + + /* + * BOS/backport policy: do not advertise DPA mmap until the CXL DPA + * backing is proven safe for userspace CPU mappings. Keep fd + * read/write available via the memremap() kernel mapping. + */ + flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL, + &vfio_cxl_regops, + cxl->region_size, flags, + cxl); + if (ret) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + return ret; + } + + /* + * Cache the vdev->region[] index before activating the region. + * vfio_pci_core_register_dev_region() placed the new entry at + * vdev->region[num_regions - 1] and incremented num_regions. + * vfio_cxl_zap_region_locked() uses this to avoid scanning + * vdev->region[] on every FLR. + */ + cxl->dpa_region_idx = vdev->num_regions - 1; + + vfio_cxl_reinit_comp_regs(cxl); + + /* + * Only activate the DPA region when the HDM decoder is currently + * committed. 
vfio_pci_core_enable() runs pci_try_reset_function() + * before regions are registered; that FLR clears the decoder + * COMMITTED bit and firmware may not have re-committed it yet. + * Mirror vfio_cxl_finish_reset(): if COMMITTED is not set here, the + * region stays inactive and guest DPA access returns + * VM_FAULT_SIGBUS / -EIO until a subsequent reset re-runs + * finish_reset with the decoder committed. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region); + +/** + * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region() + * @vdev: VFIO PCI device + * + * Marks the DPA region inactive and resets dpa_region_idx. + * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd). + * The caller must call vfio_cxl_destroy_cxl_region() separately to release + * those objects. + */ +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + + cxl->dpa_region_idx = -1; +} +EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region); + MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 1b65260c80ce..8f1eefec7c44 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -502,3 +502,37 @@ void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) kfree(cxl->comp_reg_virt); cxl->comp_reg_virt = NULL; } + +/* + * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region. + * + * Exposes the emulated HDM decoder register state as a VFIO device region + * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS. 
QEMU attaches a + * notify_change callback to this region to intercept HDM COMMIT writes + * and map the DPA MemoryRegion at the appropriate GPA. + * + * The region is read+write only (no mmap) to ensure all accesses pass + * through comp_regs_dispatch_write() for proper bit-field enforcement. + */ +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + int ret; + + if (!cxl || !cxl->comp_reg_virt) + return -ENODEV; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL_COMP_REGS, + &vfio_cxl_comp_regs_ops, + cxl->hdm_reg_offset + + cxl->hdm_reg_size, flags, cxl); + if (!ret) + cxl->comp_reg_region_idx = vdev->num_regions - 1; + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index baf2d2dffa74..ac8ea3893c7a 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -38,6 +38,8 @@ struct vfio_pci_cxl_state { __le32 *comp_reg_virt; size_t dpa_size; void __iomem *hdm_iobase; + int dpa_region_idx; + int comp_reg_region_idx; u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 0c771064c0b8..22cf9ea831f9 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -120,6 +120,29 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) } } + if (vdev->cxl) { + /* + * pci_config_map and vconfig are valid now (allocated by + * vfio_config_init() inside vfio_pci_core_enable() above). 
+ */ + vfio_cxl_setup_dvsec_perms(vdev); + + ret = vfio_cxl_register_cxl_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to setup CXL region\n"); + vfio_pci_core_disable(vdev); + return ret; + } + + ret = vfio_cxl_register_comp_regs_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to register COMP_REGS region\n"); + vfio_cxl_unregister_cxl_region(vdev); + vfio_pci_core_disable(vdev); + return ret; + } + } + vfio_pci_core_finish_enable(vdev); return 0; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 350dea708994..e9ebb0f89b5a 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -156,6 +156,9 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); #else @@ -169,6 +172,14 @@ static inline void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } static inline void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline void +vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ return 0; } #endif /* CONFIG_VFIO_CXL_CORE */ -- Gitee From 8d89da8fd56bc75b9d2016bdec6a6c282056e715 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:14 +0530 Subject: [PATCH 36/42] NVIDIA: VR: SAUCE: vfio/pci: Advertise CXL cap and sparse component BAR to userspace BugLink: https://bugs.launchpad.net/bugs/2152222 Expose CXL 
device capability through the VFIO device info ioctl and give userspace mmap access to the GPU/accelerator register windows in the component BAR while keeping the CXL component register block off-limits to user mappings. vfio_cxl_get_info() fills VFIO_DEVICE_INFO_CAP_CXL with the HDM register BAR index and byte offset, commit flags, and VFIO region indices for the DPA and COMP_REGS regions. HDM decoder count and the HDM block offset within COMP_REGS are not populated; both are derivable from the CXL Capability Array in the COMP_REGS region itself. vfio_cxl_get_region_info() handles VFIO_DEVICE_GET_REGION_INFO for the component register BAR. It builds a sparse-mmap capability that advertises only the GPU/accelerator register windows, carving out the CXL component register block. Three physical layouts are handled: Topology A comp block at BAR end: one area [0, comp_reg_offset) Topology B comp block at BAR start: one area [comp_end, bar_len) Topology C comp block in the middle: two areas, one on each side vfio_cxl_mmap_overlaps_comp_regs() checks whether an mmap request overlaps [comp_reg_offset, comp_reg_offset + comp_reg_size). vfio_pci_core_mmap() calls it to reject mmap of the component register block while allowing mmap of the GPU register windows in the sparse capability. This replaces the earlier blanket rejection of any mmap on the component BAR index. vfio_pci_bar_rw() applies the same overlap check, so fd pread()/pwrite() on the component BAR is also rejected when it would touch the component register subrange. All access to those registers goes through the dedicated COMP_REGS region, where the emulated HDM shadow lives. Hook both helpers into vfio_pci_ioctl_get_info() and vfio_pci_ioctl_get_region_info() in vfio_pci_core.c. The component BAR cannot be claimed exclusively since the CXL subsystem holds persistent sub-range iomem claims during HDM decoder setup. 
pci_request_selected_regions() returns EBUSY; pass bars=0 to skip the request and map directly via pci_iomap(). Physical ownership is assured by driver binding. Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (cherry-picked from commit 9cd924807287d84bf9bd3b257a6f4b67342b203e from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Add BAR bounds check for component block, handle full-BAR component reg case, add bar_mmap_supported gate, block BAR fd read/write and ioeventfd in component reg subrange] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 191 +++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 31 ++++- drivers/vfio/pci/vfio_pci_priv.h | 24 ++++ drivers/vfio/pci/vfio_pci_rdwr.c | 37 +++++- 4 files changed, 278 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index e3da65e2385a..81d195256d3e 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,197 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl->comp_reg_bar; +} + +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_pci_cxl_state *cxl = vdev->cxl; + resource_size_t bar_len, comp_end; + u32 nr_areas, cap_size; + int ret; + + if (!cxl) + return -ENOTTY; + + if (!info) + return -ENOTTY; + + if (info->argsz < minsz) + return -EINVAL; + + if (info->index != cxl->comp_reg_bar) + return -ENOTTY; + + /* + * The 
device state is not fully initialised; + * fall through to the default BAR handler. + */ + if (!cxl->comp_reg_size) + return -ENOTTY; + + bar_len = pci_resource_len(vdev->pdev, info->index); + comp_end = cxl->comp_reg_offset + cxl->comp_reg_size; + + /* + * A component block past the end of the BAR would walk subsequent + * readl()s off the ioremap window. Reject that up front. + */ + if (comp_end > bar_len) + return -EINVAL; + + /* + * If the component block covers the entire BAR there is nothing to + * mmap; return the BAR with read/write access only and let userspace + * use the COMP_REGS device region for register access. + */ + if (cxl->comp_reg_size == bar_len) { + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return 0; + } + + /* + * Preserve the existing vfio-pci bar_mmap_supported gate. When the + * BAR is non-mappable for any reason (non-page-aligned resource, the + * non_mappable_bars policy, etc.), advertising a sparse-mmap cap and + * VFIO_REGION_INFO_FLAG_MMAP would let userspace try to mmap and get + * a stale -EINVAL from the mmap path. Return the bare BAR descriptor + * instead and let userspace fall back to fd read/write. + */ + if (!vdev->bar_mmap_supported[info->index]) { + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return 0; + } + + /* + * Advertise the GPU/accelerator register windows as mmappable by + * carving the CXL component register block out of the BAR. 
The + * number of sparse areas depends on where the block sits: + * + * [A] comp block at BAR end [gpu_regs | comp_regs]: + * comp_reg_offset > 0 && comp_end == bar_len + * = 1 area: [0, comp_reg_offset) + * + * [B] comp block at BAR start [comp_regs | gpu_regs]: + * comp_reg_offset == 0 && comp_end < bar_len + * = 1 area: [comp_end, bar_len) + * + * [C] comp block in middle [gpu_regs | comp_regs | gpu_regs]: + * comp_reg_offset > 0 && comp_end < bar_len + * = 2 areas: [0, comp_reg_offset) and [comp_end, bar_len) + */ + if (cxl->comp_reg_offset > 0 && comp_end < bar_len) + nr_areas = 2; + else + nr_areas = 1; + + cap_size = struct_size(sparse, areas, nr_areas); + sparse = kzalloc(cap_size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + + if (nr_areas == 2) { + /* [C]: window before and after comp block */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + sparse->areas[1].offset = comp_end; + sparse->areas[1].size = bar_len - comp_end; + } else if (cxl->comp_reg_offset == 0) { + /* [B]: comp block at BAR start, window follows */ + sparse->areas[0].offset = comp_end; + sparse->areas[0].size = bar_len - comp_end; + } else { + /* [A]: comp block at BAR end, window precedes */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + } + + ret = vfio_info_add_capability(caps, &sparse->header, cap_size); + kfree(sparse); + if (ret) + return ret; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return 0; +} + +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl->comp_reg_size) + return false; + + return req_start < cxl->comp_reg_offset + 
cxl->comp_reg_size && + req_start + req_len > cxl->comp_reg_offset; +} + +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct vfio_device_info_cap_cxl cxl_cap = {0}; + + if (!cxl) + return 0; + + /* + * Device is not fully initialised? + */ + if (WARN_ON(cxl->dpa_region_idx < 0 || cxl->comp_reg_region_idx < 0)) + return -ENODEV; + + /* Fill in from CXL device structure */ + cxl_cap.header.id = VFIO_DEVICE_INFO_CAP_CXL; + cxl_cap.header.version = 1; + /* + * COMP_REGS region starts at comp_reg_offset + CXL_CM_OFFSET within + * the BAR. This is the byte offset of the CXL.mem register area (where + * the CXL Capability Array Header lives) within the component register + * block. Userspace derives hdm_decoder_offset and hdm_count from the + * COMP_REGS region itself (CXL Capability Array traversal + HDMC read). + */ + cxl_cap.hdm_regs_offset = cxl->comp_reg_offset + CXL_CM_OFFSET; + cxl_cap.hdm_regs_bar_index = cxl->comp_reg_bar; + + if (cxl->precommitted) + cxl_cap.flags |= VFIO_CXL_CAP_FIRMWARE_COMMITTED; + if (cxl->cache_capable) + cxl_cap.flags |= VFIO_CXL_CAP_CACHE_CAPABLE; + + /* + * Populate absolute VFIO region indices so userspace can query them + * directly with VFIO_DEVICE_GET_REGION_INFO. 
+ */ + cxl_cap.dpa_region_index = VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx; + cxl_cap.comp_regs_region_index = + VFIO_PCI_NUM_REGIONS + cxl->comp_reg_region_idx; + + return vfio_info_add_capability(caps, &cxl_cap.header, sizeof(cxl_cap)); +} + /* * Scope-based cleanup wrappers for the CXL resource APIs */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index e4544b078aa3..7e3a5e20353f 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -591,7 +591,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar; + int i, bar, bars; /* For needs_reset */ lockdep_assert_held(&vdev->vdev.dev_set->lock); @@ -650,8 +650,10 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) bar = i + PCI_STD_RESOURCES; if (!vdev->barmap[bar]) continue; + bars = (vdev->cxl && i == vfio_cxl_get_component_reg_bar(vdev)) ? 
+ 0 : (1 << bar); pci_iounmap(pdev, vdev->barmap[bar]); - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); vdev->barmap[bar] = NULL; } @@ -997,6 +999,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->cxl) { + ret = vfio_cxl_get_info(vdev, &caps); + if (ret) + return ret; + info.flags |= VFIO_DEVICE_FLAGS_CXL; + } + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1042,6 +1051,12 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, struct pci_dev *pdev = vdev->pdev; int i, ret; + if (vdev->cxl) { + ret = vfio_cxl_get_region_info(vdev, info, caps); + if (ret != -ENOTTY) + return ret; + } + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); @@ -1771,6 +1786,18 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma if (req_start + req_len > phys_len) return -EINVAL; + /* + * CXL devices: mmap is permitted for the GPU/accelerator register + * windows listed in the sparse-mmap capability. Block any request + * that overlaps the CXL component register block + * [comp_reg_offset, comp_reg_offset + comp_reg_size); those registers + * must be accessed exclusively through the COMP_REGS device region so + * that the emulation layer (notify_change) intercepts every write. + */ + if (vdev->cxl && index == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, req_start, req_len)) + return -EINVAL; + /* * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. 
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index e9ebb0f89b5a..d33e712ca8e6 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -159,6 +159,14 @@ void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev); +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len); #else @@ -180,6 +188,22 @@ vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } static inline int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) { return 0; } +static inline int +vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline int +vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline u8 +vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ return U8_MAX; } +static inline bool +vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ return false; } #endif /* CONFIG_VFIO_CXL_CORE */ diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 4251ee03e146..3e0ec0b082ff 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -201,19 +201,29 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int 
bar) { struct pci_dev *pdev = vdev->pdev; - int ret; + int ret, bars; void __iomem *io; if (vdev->barmap[bar]) return 0; - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + /* + * The CXL component register BAR cannot be claimed exclusively: the + * CXL subsystem holds persistent sub-range iomem claims during HDM + * decoder setup. pci_request_selected_regions() for the full BAR + * fails with EBUSY. Pass bars=0 to make the request a no-op and map + * directly via pci_iomap(). + */ + bars = (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev)) ? + 0 : (1 << bar); + + ret = pci_request_selected_regions(pdev, bars, "vfio"); if (ret) return ret; io = pci_iomap(pdev, bar, 0); if (!io) { - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); return -ENOMEM; } @@ -248,6 +258,17 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, count = min(count, (size_t)(end - pos)); + /* + * For CXL devices, the component register subrange is emulated through + * the dedicated COMP_REGS region (comp_regs_dispatch_write). Reject fd + * read/write that targets that subrange so userspace cannot bypass the + * emulation by issuing pread()/pwrite() on the BAR fd. This matches + * the mmap path, which rejects overlapping mmap requests. + */ + if (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, pos, count)) + return -EINVAL; + if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -449,6 +470,16 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; + /* + * Disallow ioeventfds that would land inside the CXL component + * register subrange. Without this check, the eventfd handler would + * iowrite directly into the BAR mapping, bypassing the COMP_REGS + * emulation enforced on the mmap and pread/pwrite paths. 
+ */ + if (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, pos, count)) + return -EINVAL; + if (count == 8) return -EINVAL; -- Gitee From 8e69469740a20e3c8e48cea637994e4d35c307b3 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:15 +0530 Subject: [PATCH 37/42] NVIDIA: VR: SAUCE: vfio/cxl: Provide opt-out for CXL feature BugLink: https://bugs.launchpad.net/bugs/2152222 This commit provides an opt-out mechanism to disable CXL support in the vfio module. The opt-out is provided at both build time and module load time. The build-time option CONFIG_VFIO_CXL_CORE is used to enable/disable CXL support in the vfio-pci module. To disable CXL support at runtime, use the module parameter disable_cxl. The bare vfio-pci driver copies that parameter into the per-device core state before registration. Variant drivers own their probe policy and must set vdev->disable_cxl explicitly before registering the core device. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) (backported from commit 595c1ad9c3c from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [jan: Resolve context mismatch in vfio_pci.c probe function due to missing upstream pci_ops assignment in NV-Kernels base, Wrap disable_cxl field in #if IS_ENABLED(CONFIG_VFIO_CXL_CORE), update MODULE_PARM_DESC wording] [kobak: Preserved existing vfio-pci pci_ops assignment while wiring the CXL opt-out parameter.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 4 ++++ drivers/vfio/pci/vfio_pci.c | 9 +++++++++ include/linux/vfio_pci_core.h | 3 +++ 3 files changed, 16 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 81d195256d3e..622482064890 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -523,6 +523,10 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) u16 dvsec; int ret; + /* Honor the user opt-out decision */ + if (vdev->disable_cxl) + return; + if (!pcie_is_cxl(pdev)) return; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 22cf9ea831f9..9d9b1116e641 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -60,6 +60,12 @@ static bool disable_denylist; module_param(disable_denylist, bool, 0444); MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) +static bool disable_cxl; +module_param(disable_cxl, bool, 0444); +MODULE_PARM_DESC(disable_cxl, "Disable CXL extensions on devices probed by the bare vfio-pci driver. 
Variant drivers do not consult this parameter; they must set vdev->disable_cxl explicitly in their probe path."); +#endif + static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { @@ -190,6 +196,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_set_drvdata(&pdev->dev, vdev); vdev->pci_ops = &vfio_pci_dev_ops; +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + vdev->disable_cxl = disable_cxl; +#endif ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 6158a12b9b27..d9190930d22f 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -129,6 +129,9 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + bool disable_cxl:1; +#endif struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- Gitee From cadbc5acd1b2408f5bcfe33b64f03bb6f56d6865 Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Wed, 20 May 2026 12:57:07 -0500 Subject: [PATCH 38/42] NVIDIA: VR: SAUCE: vfio/pci: Wire CXL DPA reset handling BugLink: https://bugs.launchpad.net/bugs/2152222 Wire the VFIO CXL reset prepare/finish paths into VFIO PCI reset flows so DPA mappings are zapped before reset and restored after successful reset. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from commit 0bd9c4c7ab7 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) [kobak: Preserved existing VFIO PCI DMABUF reset movement while adding CXL reset prepare/finish handling.] Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 83 +++++++++++++++++++++++----- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 2 +- drivers/vfio/pci/vfio_pci_config.c | 4 ++ drivers/vfio/pci/vfio_pci_core.c | 36 ++++++++++-- drivers/vfio/pci/vfio_pci_priv.h | 15 +++-- 5 files changed, 116 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 622482064890..e6bcf486d4a6 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -660,6 +660,18 @@ static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf) */ down_read(&vdev->memory_lock); + /* + * Mirror vfio_pci_vmf_insert_pfn(): reject faults while runtime PM is + * engaged or PCI Memory Space / power state would make the underlying + * memory inaccessible. vfio_pci_zap_and_down_write_memory_lock() has + * already unmapped existing PTEs in those paths; this gate stops the + * fault path from faulting them back in. + */ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + if (!cxl->region_active) { ret = VM_FAULT_SIGBUS; goto out; @@ -724,15 +736,27 @@ static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, return 0; } +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl && pci_cxl_reset_capable(vdev->pdev); +} + /* - * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs. + * vfio_cxl_zap_dpa - Invalidate DPA region PTEs without touching region_active. * - * Must be called with vdev->memory_lock held for writing. Sets - * region_active=false before zapping so any subsequent I/O to the region - * sees the inactive state and returns an error rather than accessing - * stale mappings. + * Used by paths that revoke user access transiently (runtime PM entry, D3 + * power transitions, Memory Space disable) but do not perform a reset. 
+ * The DPA region offset range is separate from the BAR range zapped by + * vfio_pci_zap_bars(), so existing DPA mmaps and fd I/O would otherwise + * continue to touch CXL.mem while the device is suspended. + * + * The fault handler and fd I/O path additionally check pm_runtime_engaged + * and __vfio_pci_memory_enabled() to refuse re-faulting while the device + * is in the revoked state. + * + * Must be called with vdev->memory_lock held for writing. */ -void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) +void vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev) { struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = vdev->cxl; @@ -742,7 +766,6 @@ void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) if (!cxl || cxl->dpa_region_idx < 0) return; - WRITE_ONCE(cxl->region_active, false); unmap_mapping_range(core_vdev->inode->i_mapping, VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx), @@ -750,13 +773,34 @@ void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) } /* - * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset. + * vfio_cxl_prepare_reset - Invalidate all DPA region PTEs. + * + * Must be called with vdev->memory_lock held for writing. Sets + * region_active=false before zapping so any subsequent I/O to the region + * sees the inactive state and returns an error rather than accessing + * stale mappings. + */ +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + vfio_cxl_zap_dpa(vdev); +} + +/* + * vfio_cxl_finish_reset - Re-enable DPA region after reset. * * Must be called with vdev->memory_lock held for writing. 
Re-reads the - * HDM decoder state from hardware (FLR cleared it) and sets region_active - * so that subsequent I/O to the region is permitted again. + * HDM decoder state from hardware and sets region_active so that + * subsequent I/O to the region is permitted again. */ -void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { struct vfio_pci_cxl_state *cxl = vdev->cxl; @@ -766,8 +810,8 @@ void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) return; /* * Re-initialise the emulated HDM comp_reg_virt[] from hardware. - * After FLR the decoder registers read as zero; mirror that in - * the emulated state so QEMU sees a clean slate. + * A reset clears decoder registers; mirror that in the emulated + * state so the guest device manager sees the post-reset hardware. */ vfio_cxl_reinit_comp_regs(cxl); @@ -818,6 +862,17 @@ static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, */ down_read(&core_dev->memory_lock); + /* + * Mirror the BAR-rw / fault gates: refuse fd I/O while the device is + * runtime suspended or has Memory Space / power state that makes the + * memremap'd window inaccessible. + */ + if (core_dev->pm_runtime_engaged || + !__vfio_pci_memory_enabled(core_dev)) { + ret = -EIO; + goto out; + } + if (!cxl->region_active || !cxl->region_vaddr) { ret = -EIO; goto out; @@ -933,7 +988,7 @@ int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) * Cache the vdev->region[] index before activating the region. * vfio_pci_core_register_dev_region() placed the new entry at * vdev->region[num_regions - 1] and incremented num_regions. - * vfio_cxl_zap_region_locked() uses this to avoid scanning + * vfio_cxl_prepare_reset() uses this to avoid scanning * vdev->region[] on every FLR. 
*/ cxl->dpa_region_idx = vdev->num_regions - 1; diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 8f1eefec7c44..43d79e2fe547 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -437,7 +437,7 @@ vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, } /* - * Called with memory_lock write side held (from vfio_cxl_reactivate_region). + * Called with memory_lock write side held (from vfio_cxl_finish_reset). * Uses the pre-established hdm_iobase, no ioremap() under the lock, * which would deadlock on PREEMPT_RT where ioremap() can sleep. */ diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index d50a8749ae07..03835a3b5083 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -910,7 +910,9 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); vfio_pci_dma_buf_move(vdev, true); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); @@ -995,7 +997,9 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); vfio_pci_dma_buf_move(vdev, true); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7e3a5e20353f..2c4f719f4549 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1247,7 +1247,7 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device 
*vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ - vfio_cxl_zap_region_locked(vdev); + vfio_cxl_prepare_reset(vdev); /* * This function can be invoked while the power state is non-D0. If @@ -1266,11 +1266,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_dma_buf_move(vdev, false); /* - * Re-enable DPA region if reset succeeded; fault handler will - * re-insert PFNs on next access without requiring a new mmap. + * finish_reset checks the COMMITTED bit from hardware + * and only brings the region back if it is actually set. */ - if (!ret) - vfio_cxl_reactivate_region(vdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); @@ -1653,6 +1652,13 @@ void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { down_write(&vdev->memory_lock); vfio_pci_zap_bars(vdev); + /* + * Zap the CXL DPA region PTEs too: zap_bars only covers the BAR offset + * range, while the DPA region lives in the device-region offset range + * and would otherwise survive a runtime-PM entry or D3 transition. + * No-op on non-CXL devices. + */ + vfio_cxl_zap_dpa(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -2533,6 +2539,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_undo; } + /* + * All devices in the set are now locked. Commit the CXL prepare + * step in its own pass: it clears region_active and zaps DPA PTEs, + * which must be paired with a finish_reset call for every device it + * touches. Doing this only after all trylocks have succeeded keeps + * a mid-loop failure from leaving earlier devices with + * region_active=false and no matching reset. + */ + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_cxl_prepare_reset(vdev); + /* * The pci_reset_bus() will reset all the devices in the bus. * The power state can be non-D0 for some of the devices in the bus. 
@@ -2547,6 +2564,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, ret = pci_reset_bus(pdev); + /* + * Mirror vfio_pci_ioctl_reset(): re-read the post-reset HDM state and + * reactivate the DPA region for CXL devices that hardware committed. + * Runs under each device's memory_lock write side acquired earlier and + * pairs with the prepare_reset pass above. + */ + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_cxl_finish_reset(vdev); + vdev = list_last_entry(&dev_set->device_list, struct vfio_pci_core_device, vdev.dev_set_list); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index d33e712ca8e6..bd511ba88b93 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -153,8 +153,10 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); -void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); -void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev); +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev); void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); @@ -174,10 +176,15 @@ static inline void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } static inline void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline bool +vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ return false; } +static inline void +vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) { } static 
inline void -vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } +vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { } static inline void -vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } +vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev) { } static inline void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } static inline int -- Gitee From dc5ed09c29d3e76564bf50505fd52f236aa4c28b Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Wed, 20 May 2026 13:37:57 -0500 Subject: [PATCH 39/42] NVIDIA: VR: SAUCE: vfio/cxl: Ensure PCI Memory Space is enabled before post-reset BAR access BugLink: https://bugs.launchpad.net/bugs/2152222 A reset caller may disable Memory Space to quiesce device DMA before issuing the reset. The reset path saves and restores PCI_COMMAND via pci_dev_save_and_disable() and pci_dev_restore(). If Memory Space was disabled before FLR, it will be restored in the disabled state. vfio_cxl_finish_reset() reads HDM decoder registers through the component register BAR immediately after reset. Accessing a BAR with Memory Space disabled produces an Unsupported Request completion; on platforms that promote UR to a fatal error this triggers DPC. Add vfio_cxl_enable_memory_space() and call it at the start of vfio_cxl_finish_reset() before touching any BAR. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from commit 5071d3b07627ab1fc42f0149352a2704907f8364 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index e6bcf486d4a6..154e44b5c255 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -793,6 +793,27 @@ void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) vfio_cxl_zap_dpa(vdev); } +/* + * vfio_cxl_enable_memory_space - ensure PCI Memory Space is on before BAR reads. + * + * A reset caller may disable Memory Space to quiesce device DMA before + * issuing the reset. If a guest request cleared PCI_COMMAND Memory Space + * before FLR, pci_dev_save_and_disable() captures it disabled and + * pci_dev_restore() restores it that way. This can leave Memory Space + * disabled on return. Accessing a BAR with Memory Space disabled produces + * an Unsupported Request completion; on platforms that promote UR to a + * fatal error this fires DPC. + */ +static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) +{ + u16 cmd; + + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); +} + /* * vfio_cxl_finish_reset - Re-enable DPA region after reset. * @@ -808,6 +829,9 @@ void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) if (!cxl) return; + + vfio_cxl_enable_memory_space(vdev); + /* * Re-initialise the emulated HDM comp_reg_virt[] from hardware. 
* A reset clears decoder registers; mirror that in the emulated -- Gitee From 5704eff7a3dea43de17303c36853eae31ae3e148 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 29 Apr 2026 22:50:38 +0530 Subject: [PATCH 40/42] NVIDIA: VR: SAUCE: vfio/cxl: preserve HDM decoder base addresses across reset BugLink: https://bugs.launchpad.net/bugs/2152222 After FLR, reinit_comp_regs() re-reads HDM decoder registers from hardware into comp_reg_virt[]. Hardware is not all-zeros at this point: pci_dev_restore() ran first and re-committed the pre-reset host-physical decoder bases into the registers. reinit_comp_regs() therefore overwrites the emulated guest-physical bases that the device manager programmed with the host-physical bases used by the host CXL core. The kernel provides no notification that BASE was overwritten, so the emulated GPA bases are silently lost. The same issue affects the CTRL LOCK bit: FLR clears it in hardware and pci_dev_restore() does not re-apply it, so a decoder that the guest had locked re-emerges from reset with LOCK clear in shadow. Add vfio_cxl_reinit_hdm_shadow() which snapshots BASE_LOW, BASE_HIGH, and the CTRL LOCK bit from the shadow before calling reinit_comp_regs(), then writes them back after, keeping the emulated decoder consistent with what the guest programmed. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (cherry-picked from commit 9e0e291bfc29f40e5db40256353b89d34fd93ddf from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 88 ++++++++++++++++++++++++++-- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 38 +++++++++--- 2 files changed, 113 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 154e44b5c255..2d6b804d8537 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -814,6 +814,87 @@ static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) cmd | PCI_COMMAND_MEMORY); } +/* + * vfio_cxl_reinit_hdm_shadow - reinitialise comp_reg_virt, preserving the + * guest-visible BASE registers and CTRL LOCK across reset. + * + * reinit_comp_regs() re-reads hardware into comp_reg_virt[] after FLR. + * pci_dev_restore() re-commits the host-physical BASE values it saved + * before the reset, so reinit_comp_regs() sees those host bases and not + * the guest-physical bases the device manager programmed in shadow. The + * decoder CTRL LOCK bit is also cleared by FLR on hardware and is not + * re-applied by pci_dev_restore(). Snapshot BASE_LOW/BASE_HIGH and the + * LOCK bit from shadow before reinit, then write them back so the + * emulated decoder stays consistent with what the guest configured. + * + * Called with memory_lock write side held (from vfio_cxl_finish_reset). + */ +static void vfio_cxl_reinit_hdm_shadow(struct vfio_pci_cxl_state *cxl) +{ + __le32 *saved_lo = NULL, *saved_hi = NULL, *saved_ctrl = NULL; + u8 n, count = cxl->hdm_count; + + if (cxl->comp_reg_virt && count) { + saved_lo = kcalloc(count, sizeof(*saved_lo), GFP_KERNEL); + saved_hi = kcalloc(count, sizeof(*saved_hi), GFP_KERNEL); + saved_ctrl = kcalloc(count, sizeof(*saved_ctrl), GFP_KERNEL); + if (!saved_lo || !saved_hi || !saved_ctrl) { + /* + * Allocation failure: skip the snapshot and let reinit + * resync from hardware. 
The guest-visible BASE/LOCK + * state will diverge but the device is otherwise + * functional. This path is unlikely under normal load. + */ + pci_warn(cxl->vdev->pdev, + "vfio_cxl: HDM shadow snapshot allocation failed; resetting without GPA preservation\n"); + kfree(saved_lo); + kfree(saved_hi); + kfree(saved_ctrl); + saved_lo = saved_hi = saved_ctrl = NULL; + } else { + for (n = 0; n < count; n++) { + saved_lo[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)); + saved_hi[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)); + saved_ctrl[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n)); + } + } + } + + vfio_cxl_reinit_comp_regs(cxl); + + if (cxl->comp_reg_virt && saved_lo) { + for (n = 0; n < count; n++) { + u32 ctrl; + + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)) = saved_lo[n]; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)) = saved_hi[n]; + + /* + * Restore the LOCK bit from shadow. Other CTRL bits + * (COMMITTED, error indicators) should reflect the + * post-FLR hardware state that reinit_comp_regs() just + * snapshotted, so leave those alone. + */ + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n))); + ctrl |= le32_to_cpu(saved_ctrl[n]) & + CXL_HDM_DECODER0_CTRL_LOCK; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n)) = + cpu_to_le32(ctrl); + } + } + + kfree(saved_lo); + kfree(saved_hi); + kfree(saved_ctrl); +} + /* * vfio_cxl_finish_reset - Re-enable DPA region after reset. * @@ -833,11 +914,10 @@ void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) vfio_cxl_enable_memory_space(vdev); /* - * Re-initialise the emulated HDM comp_reg_virt[] from hardware. - * A reset clears decoder registers; mirror that in the emulated - * state so the guest device manager sees the post-reset hardware. + * Re-initialise the emulated HDM comp_reg_virt[] from hardware, + * preserving the GPA decoder bases set by the device manager. 
*/ - vfio_cxl_reinit_comp_regs(cxl); + vfio_cxl_reinit_hdm_shadow(cxl); /* * Only re-enable the DPA mmap if the hardware has actually diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 43d79e2fe547..bdd363a819a7 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -272,6 +272,7 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, { struct vfio_pci_cxl_state *cxl = vdev->cxl; loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret = 0; size_t done = 0; if (!count) @@ -283,14 +284,26 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, count = min(count, (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + /* + * Serialise against vfio_cxl_reinit_hdm_shadow(), which holds + * memory_lock write-side while it saves, zeroes, and restores + * comp_reg_virt[] during reset. Without this read lock a concurrent + * COMP_REGS write can land between the save snapshot and the restore, + * causing the restore to silently overwrite it. A concurrent read + * can observe the array mid-rebuild. + */ + down_read(&vdev->memory_lock); + while (done < count) { u32 sz = count - done; u32 off = pos + done; __le32 v; /* Enforce exactly 4-byte, 4-byte-aligned accesses */ - if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) - return done ? (ssize_t)done : -EINVAL; + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) { + ret = done ? (ssize_t)done : -EINVAL; + goto out_unlock; + } if (iswrite) { if (off < cxl->hdm_reg_offset) { @@ -298,22 +311,29 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, done += sizeof(v); continue; } - if (copy_from_user(&v, buf + done, sizeof(v))) - return done ? (ssize_t)done : -EFAULT; + if (copy_from_user(&v, buf + done, sizeof(v))) { + ret = done ? 
(ssize_t)done : -EFAULT; + goto out_unlock; + } comp_regs_dispatch_write(vdev, off - cxl->hdm_reg_offset, &v, sizeof(v)); } else { - /* Read from extended buffer _ covers cap array and HDM */ + /* Read from extended buffer - covers cap array and HDM */ v = cxl->comp_reg_virt[off / sizeof(__le32)]; - if (copy_to_user(buf + done, &v, sizeof(v))) - return done ? (ssize_t)done : -EFAULT; + if (copy_to_user(buf + done, &v, sizeof(v))) { + ret = done ? (ssize_t)done : -EFAULT; + goto out_unlock; + } } done += sizeof(v); } + ret = done; *ppos += done; - return done; +out_unlock: + up_read(&vdev->memory_lock); + return ret; } static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, @@ -437,7 +457,7 @@ vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, } /* - * Called with memory_lock write side held (from vfio_cxl_finish_reset). + * Called with memory_lock write side held (from vfio_cxl_reinit_hdm_shadow). * Uses the pre-established hdm_iobase, no ioremap() under the lock, * which would deadlock on PREEMPT_RT where ioremap() can sleep. */ -- Gitee From 22f3f8aa9aef6547daf1ff51917c2241ac4d7712 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 00:28:10 +0530 Subject: [PATCH 41/42] NVIDIA: VR: SAUCE: vfio/cxl: virtualize DVSEC STATUS2 register in vconfig shadow BugLink: https://bugs.launchpad.net/bugs/2152222 STATUS2 was read directly from hardware while all other DVSEC registers were served from the vconfig shadow. This created two problems: 1. VOLATILE_HDM_PRES_ERROR (RW1CS, bit 3): guest writes cleared the hardware bit but the shadow was not updated, so subsequent reads still returned the set bit from hardware (which the hardware had cleared). 2. CXL_RESET_COMPLETE and CXL_RESET_ERROR (bits 1-2): these outcome bits will be written by vfio_cxl_reset() into the shadow after a protocol reset. Hardware does not update them on its own; serving reads from hardware would hide the outcome from the guest. 
Add STATUS2 to the read switch so reads come from the shadow, and update cxl_dvsec_status2_write() to mirror VOLATILE_HDM_PRES_ERROR clears into the shadow after forwarding to hardware. Signed-off-by: Manish Honap Signed-off-by: Jiandi An (backported from commit 14fbdcb4d592891c269d052317cc10b640aa4096 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_config.c | 191 +++++++++++++++++++++---- 1 file changed, 167 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c index 35d35e2ded9b..4dc86c1de80b 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_config.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -158,12 +158,34 @@ static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, u16 dvsec = _cxlds_get_dvsec(vdev->cxl); u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; - /* RW1CS: write 1 to clear, but only if the capability is supported */ + /* + * VOLATILE_HDM_PRES_ERROR (bit 3) and PM_INIT_COMPLETION (bit 15) are + * RW1CS. Forward each to hardware on a 1-bit write, then mirror the + * clear into the shadow so guest reads (which now come from the + * shadow) do not see the bit stuck after a successful clear. + * + * All other STATUS2 bits are RO hardware outputs; ignore guest writes. 
+ */ if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && - (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) { + u16 v; + pci_write_config_word(vdev->pdev, abs_off, CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); - /* STATUS2 is not mirrored in vconfig - reads go to hardware */ + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } + + if (new_val & CXL_DVSEC_STATUS2_PM_INIT_COMPLETION) { + u16 v; + + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_PM_INIT_COMPLETION); + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_PM_INIT_COMPLETION; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } } static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, @@ -186,6 +208,30 @@ static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, dvsec_virt_write32(vdev, dvsec_off, new_val); } +/* + * status2_hw_shadow_merge - read STATUS2, merging hardware and vconfig shadow. + * + * RESET_COMPLETE and RESET_ERROR are written into vconfig by vfio_cxl_reset() + * after a protocol reset; pci_dev_restore() clears them from hardware, so they + * must survive in the shadow for a polling guest to see the reset outcome. + * + * All other STATUS2 bits are live hardware outputs and must come from hardware. + * In particular, CACHE_INVALID (bit 0) is polled by guests during a standalone + * write-back invalidation. + * + * @abs_pos: absolute PCI config space byte offset of the STATUS2 register. 
+ */ +static u16 status2_hw_shadow_merge(struct vfio_pci_core_device *vdev, int abs_pos) +{ + const u16 shadow_mask = CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + u16 hw = 0, virt; + + pci_read_config_word(vdev->pdev, abs_pos, &hw); + virt = get_unaligned_le16(vdev->vconfig + abs_pos); + return (hw & ~shadow_mask) | (virt & shadow_mask); +} + /** * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. * @vdev: VFIO PCI core device @@ -200,6 +246,10 @@ static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, * LOCK) so that userspace reads reflect emulated state rather than raw * hardware. All other DVSEC bytes pass through to vfio_raw_config_read(). * + * A 4-byte (DWORD) access at the CONTROL2 offset spans both CONTROL2 and + * STATUS2 since CONTROL2 is DWORD-aligned and the two registers are adjacent. + * In that case STATUS2 is returned via the hardware-merge path. + * * Return: @count on success, or negative error code from the fallback read. 
*/ static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, @@ -208,26 +258,61 @@ static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, int offset, __le32 *val) { struct vfio_pci_cxl_state *cxl = vdev->cxl; - u16 dvsec = _cxlds_get_dvsec(vdev->cxl); - u16 dvsec_off; + u16 dvsec, dvsec_off, reg_start, byte_in_reg; + + if (!cxl) + return vfio_direct_config_read(vdev, pos, count, perm, offset, + val); - if (!cxl || (u16)pos < dvsec || - (u16)pos >= dvsec + cxl->dvsec_len) - return vfio_raw_config_read(vdev, pos, count, perm, offset, val); + dvsec = _cxlds_get_dvsec(cxl); + if ((u16)pos < dvsec || (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_direct_config_read(vdev, pos, count, perm, offset, + val); dvsec_off = (u16)pos - dvsec; - switch (dvsec_off) { + /* + * Route by the 2-byte-aligned start of the register so that a guest + * read at the high byte (dvsec_off | 1) hits the shadow path instead + * of falling through to the direct read and diverging from a prior + * shadow write. + */ + reg_start = dvsec_off & ~1u; + byte_in_reg = dvsec_off - reg_start; + + switch (reg_start) { case CXL_DVSEC_CONTROL_OFFSET: case CXL_DVSEC_STATUS_OFFSET: - case CXL_DVSEC_CONTROL2_OFFSET: case CXL_DVSEC_LOCK_OFFSET: - /* Return shadow vconfig value for virtualized registers */ + /* Fully virtualised; return shadow. Byte/word reads work too. */ memcpy(val, vdev->vconfig + pos, count); return count; + case CXL_DVSEC_CONTROL2_OFFSET: + if (count == 4 && byte_in_reg == 0) { + /* + * 4-byte access at the DWORD-aligned CONTROL2 offset + * spans both CONTROL2 (low 16 bits) and STATUS2 (high + * 16 bits). Return CONTROL2 from vconfig and STATUS2 + * via the hardware-merge path so CACHE_INVALID is fresh. 
+ */ + __le32 combined = cpu_to_le32( + (u32)get_unaligned_le16(vdev->vconfig + pos) | + ((u32)status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET) << 16)); + memcpy(val, &combined, 4); + } else { + memcpy(val, vdev->vconfig + pos, count); + } + return count; + case CXL_DVSEC_STATUS2_OFFSET: { + __le16 merged = cpu_to_le16(status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET)); + memcpy(val, ((u8 *)&merged) + byte_in_reg, count); + return count; + } default: - return vfio_raw_config_read(vdev, pos, count, - perm, offset, val); + return vfio_direct_config_read(vdev, pos, count, + perm, offset, val); } } @@ -255,14 +340,18 @@ static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, int offset, __le32 val) { struct vfio_pci_cxl_state *cxl = vdev->cxl; - u16 dvsec = _cxlds_get_dvsec(vdev->cxl); - u16 abs_off = (u16)pos; - u16 dvsec_off, dword_start, byte_in_dword; + u16 dvsec, abs_off, dvsec_off, reg_start, byte_in_reg; + u16 dword_start, byte_in_dword; u16 wval16; u32 wval32; - if (!cxl || (u16)pos < dvsec || - (u16)pos >= dvsec + cxl->dvsec_len) + if (!cxl) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + dvsec = _cxlds_get_dvsec(cxl); + abs_off = (u16)pos; + if (abs_off < dvsec || abs_off >= dvsec + cxl->dvsec_len) return vfio_raw_config_write(vdev, pos, count, perm, offset, val); @@ -272,6 +361,16 @@ static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, dvsec_off = abs_off - dvsec; + /* + * The 2-byte virtualised registers (CONTROL, STATUS, CONTROL2, + * STATUS2, LOCK) all live at 2-byte-aligned offsets. Compute the + * register-aligned offset so writes at the high byte still hit the + * right handler, and merge partial-byte writes against the shadow so + * the high byte of the matched register is not zeroed. 
+ */ + reg_start = dvsec_off & ~1u; + byte_in_reg = dvsec_off - reg_start; + dword_start = dvsec_off & ~3u; byte_in_dword = dvsec_off - dword_start; @@ -288,26 +387,70 @@ static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, return count; } + if (count == 1) { + u16 cur = dvsec_virt_read16(vdev, reg_start); + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? (cur & 0x00ff) | ((u16)byte << 8) + : (cur & 0xff00) | byte; + } else { + wval16 = (u16)le32_to_cpu(val); + } + /* Route to the appropriate per-register handler */ - switch (dvsec_off) { + switch (reg_start) { case CXL_DVSEC_CONTROL_OFFSET: - wval16 = (u16)le32_to_cpu(val); cxl_dvsec_control_write(vdev, wval16); + if (count == 4 && byte_in_reg == 0) { + /* + * High half of a 32-bit write at CONTROL is STATUS; + * forward so RW1C VIRAL_STATUS is not silently dropped. + */ + cxl_dvsec_status_write(vdev, + (u16)(le32_to_cpu(val) >> 16)); + } break; case CXL_DVSEC_STATUS_OFFSET: - wval16 = (u16)le32_to_cpu(val); + /* + * STATUS is RO/W1C. A one-byte write must only act on bits in + * the byte the guest wrote: re-derive the value without merging + * the other byte from shadow, otherwise W1C bits set in shadow + * (e.g. VIRAL_STATUS) would be passed as fresh 1-writes and + * unintentionally cleared. + */ + if (count == 1) { + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? ((u16)byte << 8) : byte; + } cxl_dvsec_status_write(vdev, wval16); break; case CXL_DVSEC_CONTROL2_OFFSET: - wval16 = (u16)le32_to_cpu(val); cxl_dvsec_control2_write(vdev, wval16); + if (count == 4 && byte_in_reg == 0) { + /* + * High half of a 32-bit write at CONTROL2 is STATUS2; + * forward so RW1CS VOLATILE_HDM_PRES_ERROR is not + * silently dropped. + */ + cxl_dvsec_status2_write(vdev, + (u16)(le32_to_cpu(val) >> 16)); + } break; case CXL_DVSEC_STATUS2_OFFSET: - wval16 = (u16)le32_to_cpu(val); + /* + * STATUS2 is RO/RW1CS. 
Same rule as STATUS: a one-byte write + * must not let W1CS bits set in shadow leak in as fresh + * 1-writes via the merge. + */ + if (count == 1) { + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? ((u16)byte << 8) : byte; + } cxl_dvsec_status2_write(vdev, wval16); break; case CXL_DVSEC_LOCK_OFFSET: - wval16 = (u16)le32_to_cpu(val); cxl_dvsec_lock_write(vdev, wval16); break; default: -- Gitee From 1346659e8a38a258545647775fa73bcddf716819 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 11:56:01 +0530 Subject: [PATCH 42/42] NVIDIA: VR: SAUCE: vfio/cxl: Implement vfio_cxl_reset() BugLink: https://bugs.launchpad.net/bugs/2152222 Add vfio_cxl_reset() to drive a CXL protocol reset on behalf of a guest. Unlike cxl_do_reset(), this path skips host memory offlining since the DPA region is guest memory. The function takes memory_lock for the full sequence, calls vfio_cxl_prepare_reset() to zap DPA region PTEs, drives the hardware via cxl_dev_reset_locked(), which performs pci_dev_save_and_disable(), cxl_dev_reset(), sibling CXL.cachemem coordination, and pci_dev_restore() under the CXL reset mutex, then calls vfio_cxl_finish_reset() to reinitialise emulated state. STATUS2 outcome bits (CXL_RESET_COMPLETE / CXL_RESET_ERROR) are written back to vconfig after the reset so the guest can poll for the result without reading hardware. cxl_save_dvsec() / cxl_restore_dvsec() cover CTRL, CTRL2, range_base_*, and LOCK; STATUS2 is not saved or restored across the reset, so the hardware value is re-read after restore (it will have both outcome bits clear) and the outcome is stamped on top. When the guest writes INIT_CXL_RST into DVSEC CONTROL2, invoke vfio_cxl_reset() to perform a CXL protocol reset. The bit is not forwarded to hardware; cxl_dev_reset() drives the reset sequence directly. Silently drop writes on devices that do not advertise RST_CAPABLE to avoid log noise for the reserved-bit case. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An (cherry-picked from commit 67c66e735df5762cfeb7b80e4fdf0815451899f9 from https://github.com/JiandiAnNVIDIA/NV-Kernels.git cxl-vfio_2026-04-23) Signed-off-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Signed-off-by: Brad Figg --- drivers/vfio/pci/cxl/vfio_cxl_config.c | 106 +++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c index 4dc86c1de80b..aeecce1a3d50 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_config.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -107,6 +107,85 @@ static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); } +/** + * vfio_cxl_reset - Service a guest CXL protocol reset. + * @vdev: VFIO PCI core device + * + * Unlike cxl_do_reset(), no host memory offlining is performed: the DPA + * region is guest memory, not host RAM. + * + * memory_lock is held for the entire sequence so neither BAR nor DPA + * mappings can fault back in. INIT_CXL_RST is not forwarded to hardware; + * cxl_dev_reset() drives the state machine directly. + * + * STATUS2 outcome bits are written back to vconfig on return so that the + * guest can poll for completion without going to hardware. + * + * Return: 0 on success, negative error code on failure. + */ +static int vfio_cxl_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(cxl); + u16 hw_status2 = 0; + int ret; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + + /* + * CXL r4.0 Table 8-9: device must clear CXL_Reset_Complete before + * starting the reset flow, on the 0->1 transition of Initiate_CXL_Reset. 
+ * Clear both reset outcome bits so a polling guest sees an unambiguous + * in-progress state rather than a stale result from a prior attempt. + */ + { + u16 s = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + + s &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, s); + } + + vfio_cxl_prepare_reset(vdev); + + /* + * Hand the actual reset off to cxl_dev_reset_locked() so the CXL core + * applies its global reset mutex and saves/disables any CXL.cachemem + * sibling functions on the bus. A bare cxl_dev_reset() under just + * pci_dev_lock() leaves those siblings vulnerable to half-reset states + * and lets a guest-triggered CXL reset race a concurrent host sysfs + * reset. + */ + ret = cxl_dev_reset_locked(pdev, cxl->cxlds.cxl_dvsec, + !!(dvsec_virt_read16(vdev, + CXL_DVSEC_CONTROL2_OFFSET) & + CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE)); + + vfio_cxl_finish_reset(vdev); + + /* + * Re-read STATUS2 from hardware after restore. cxl_save_dvsec() / + * cxl_restore_dvsec() cover CTRL, CTRL2, range_base_*, and LOCK; + * STATUS2 is intentionally not saved or restored across the reset, so + * the hardware value here is fresh post-reset (both outcome bits clear) + * and reflects genuine hardware changes such as VOLATILE_HDM_PRES_ERROR + * clearing. Stamp the new outcome on top of that value below. 
+ */ + pci_read_config_word(pdev, dvsec + CXL_DVSEC_STATUS2_OFFSET, + &hw_status2); + hw_status2 &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + if (ret) + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + else + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, hw_status2); + + up_write(&vdev->memory_lock); + return ret; +} + static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, u16 new_val) { @@ -141,14 +220,31 @@ static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, } /* - * CXL Reset: not yet supported - do not forward to HW. - * TODO: invoke CXL protocol reset via cxl subsystem + * Commit the new CONTROL2 value to the shadow before triggering a + * reset. vfio_cxl_reset() reads Mem_Clr_Enable (bit 3) from the + * shadow; if the shadow is written after the reset call, a guest write + * that changes bit 3 in the same access as INITIATE_CXL_RESET would + * reset with the stale bit 3 value instead of the one just written. */ - if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) - pci_warn(pdev, "vfio-cxl: CXL reset requested but not yet supported\n"); - dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, new_val & ~CXL_CTRL2_HW_BITS_MASK); + + /* + * INIT_CXL_RST: not forwarded to hardware. cxl_dev_reset() drives + * the state machine; forwarding it after the reset would fire a + * second one. Drop writes on non-RST_CAPABLE devices silently; the + * spec reserves the bit there and logging every write is just noise. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) { + if (vfio_cxl_reset_capable(vdev)) { + int rc = vfio_cxl_reset(vdev); + + if (rc) + pci_warn(pdev, + "vfio-cxl: CXL reset failed (%d)\n", + rc); + } + } } static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, -- Gitee