diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 82ca3a4767080cf30b2366a5cdef7dbadff79bf5..7b3bdcee64168e44008e554a081afe447fd8b1ef 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -138,6 +138,8 @@ extern struct cxl_rwsem cxl_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); enum cxl_poison_trace_type { CXL_POISON_TRACE_LIST, diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 0c80b76a5f9b4b7a493d840809799dbae5789d61..081b763ebd1e892223dbbba3ba9e36441afe863a 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -565,6 +565,7 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) devm_cxl_dpa_release(cxled); return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dpa_free, "CXL"); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode) @@ -596,6 +597,64 @@ int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, return 0; } +static int find_free_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == (port->hdm_end + 1); +} + +static struct cxl_endpoint_decoder * +cxl_find_free_decoder(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct device *dev; + + guard(rwsem_read)(&cxl_rwsem.dpa); + dev = device_find_child(&endpoint->dev, NULL, find_free_decoder); + if (!dev) + return NULL; + + return to_cxl_endpoint_decoder(dev); +} + +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc) +{ + struct cxl_endpoint_decoder *cxled; + int rc; + + if (!IS_ALIGNED(alloc, SZ_256M)) + return ERR_PTR(-EINVAL); + + cxled = cxl_find_free_decoder(cxlmd); + if (!cxled) + return ERR_PTR(-ENODEV); 
+ + rc = cxl_dpa_set_part(cxled, mode); + if (rc) + goto err_put; + + rc = cxl_dpa_alloc(cxled, alloc); + if (rc) + goto err_put; + + return cxled; /* NOTE(review): the decoder device reference is dropped only on err_put; on success it presumably stays with the caller - confirm who releases it */ + +err_put: + put_device(&cxled->cxld.dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(cxl_request_dpa, "CXL"); + static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -679,6 +738,44 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } +static int find_committed_endpoint_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_port *port; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + port = cxled_to_port(cxled); + + return cxled->cxld.id == port->hdm_end; /* match the endpoint decoder whose id equals the port's hdm_end */ +} + +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct device *cxled_dev; + + if (!endpoint) + return NULL; /* no endpoint port: *cxlr is left unwritten */ + + guard(rwsem_read)(&cxl_rwsem.dpa); /* NOTE(review): cxld.region is read below under the dpa rwsem, while other code in this patch reads it under cxl_rwsem.region - confirm intended lock */ + cxled_dev = device_find_child(&endpoint->dev, NULL, + find_committed_endpoint_decoder); + if (!cxled_dev) + return NULL; /* no matching decoder: *cxlr is left unwritten */ + + cxled = to_cxl_endpoint_decoder(cxled_dev); + *cxlr = cxled->cxld.region; /* region pointer is passed back as-is; presumably may be NULL - TODO confirm */ + + put_device(cxled_dev); /* NOTE(review): drops the reference obtained via device_find_child(), so the cxled returned below (and *cxlr) is not pinned - confirm callers keep the endpoint port alive */ + return cxled; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_committed_decoder, "CXL"); + static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) { u16 eig; @@ -1031,13 +1128,14 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, return -ENXIO; } + port->commit_end = cxld->id; + if (size == 0) { - dev_warn(&port->dev, + dev_dbg(&port->dev, "decoder%d.%d: Committed with zero size\n", port->id, cxld->id); - return -ENXIO; + return -ENOSPC; } - port->commit_end = cxld->id; } else { if (cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -1193,6 +1291,8 @@ static 
int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, rc = init_hdm_decoder(port, cxld, hdm, i, &dpa_base, info); if (rc) { + if (rc == -ENOSPC) + continue; dev_warn(&port->dev, "Failed to initialize decoder%d.%d\n", port->id, i); diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 80e65690eb773e7e043bea2e52933e698ae8f8e9..0587a7509a6fbbde12bad0d24fe8587163915914 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "trace.h" #include "core.h" @@ -579,9 +580,16 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +static const struct device_type cxl_accel_memdev_type = { + .name = "cxl_accel_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, +}; + bool is_cxl_memdev(const struct device *dev) { - return dev->type == &cxl_memdev_type; + return (dev->type == &cxl_memdev_type || + dev->type == &cxl_accel_memdev_type); } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); @@ -710,7 +718,10 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->parent = cxlds->dev; dev->bus = &cxl_bus_type; dev->devt = MKDEV(cxl_mem_major, cxlmd->id); - dev->type = &cxl_memdev_type; + if (cxlds->type == CXL_DEVTYPE_DEVMEM) + dev->type = &cxl_accel_memdev_type; + else + dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); INIT_WORK(&cxlmd->detach_work, detach_memdev); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index d1f487b3d809adde0010ae8bc6838e620d7d321a..c4f4d5e161e066a64b524f9d22ac6091f0dd1e4a 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -13,6 +15,9 @@ #include "core.h" #include "trace.h" +/* Initial sibling array capacity: covers max non-ARI functions per slot */ +#define CXL_RESET_SIBLINGS_INIT 8 + /** * DOC: cxl core pci * @@ -141,16 +146,24 @@ static 
int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) return 0; } -/* - * Wait up to @media_ready_timeout for the device to report memory - * active. +/** + * cxl_await_range_active - Wait for all HDM DVSEC memory ranges to be active + * @cxlds: CXL device state (DVSEC and HDM count must be valid) + * + * For each HDM decoder range reported in the CXL DVSEC capability, waits for + * the range to report MEM INFO VALID (up to 1s per range), then MEM ACTIVE + * (up to media_ready_timeout seconds per range, default 60s). Used by + * cxl_await_media_ready() and by callers that only need range readiness + * without checking the memory device status register. + * + * Return: 0 if all ranges become valid and active, -ETIMEDOUT if a timeout + * occurs, or a negative errno from config read on failure. */ -int cxl_await_media_ready(struct cxl_dev_state *cxlds) +int cxl_await_range_active(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); int d = cxlds->cxl_dvsec; int rc, i, hdm_count; - u64 md_status; u16 cap; rc = pci_read_config_word(pdev, @@ -171,6 +184,23 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) return rc; } + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_range_active, "CXL"); + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. 
+ */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + u64 md_status; + int rc; + + rc = cxl_await_range_active(cxlds); + if (rc) + return rc; + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); if (!CXLMDEV_READY(md_status)) return -EIO; @@ -448,6 +478,35 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL"); +/** + * cxl_get_hdm_info - Get HDM decoder register block location and count + * @cxlds: CXL device state (must have component regs enumerated via + * cxl_probe_component_regs()) + * @count: number of HDM decoders in the block (from HDM Capability bits [3:0]) + * @offset: byte offset of HDM decoder block within the component register BAR + * @size: size in bytes of the HDM decoder block + * + * Return: 0 on success. -ENODEV if the HDM decoder block is not present. + */ +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ + struct cxl_reg_map *hdm = &cxlds->reg_map.component_map.hdm_decoder; + + if (WARN_ON(!count || !offset || !size)) + return -EINVAL; + + if (!hdm->valid) + return -ENODEV; + + *count = hdm->count; + *offset = hdm->offset; + *size = hdm->size; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hdm_info, "CXL"); + #define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff #define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 #define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 @@ -926,3 +985,624 @@ int cxl_port_get_possible_dports(struct cxl_port *port) return ctx.count; } + +/* + * CXL Reset support - core-provided reset logic for CXL devices. + * + * These functions implement the CXL reset sequence. + */ + +/* + * If CXL memory backed by this decoder is online as System RAM, offline + * and remove it per CXL spec requirements before issuing CXL Reset. + * Returns 0 if memory was not online or was successfully offlined. 
+ */ +static int cxl_is_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static int __maybe_unused cxl_offline_memory(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct cxl_region_params *p; + int rc; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr) + return 0; + + p = &cxlr->params; + if (!p->res) + return 0; + + if (walk_iomem_res_desc(IORES_DESC_NONE, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + p->res->start, p->res->end, NULL, + cxl_is_system_ram) <= 0) + return 0; + + dev_info(dev, "Offlining CXL memory [%pr] for reset\n", p->res); + +#ifdef CONFIG_MEMORY_HOTREMOVE + rc = offline_and_remove_memory(p->res->start, resource_size(p->res)); + if (rc) { + dev_err(dev, + "Failed to offline CXL memory [%pr]: %d\n", + p->res, rc); + return rc; + } +#else + dev_err(dev, "Memory hotremove not supported, cannot offline CXL memory\n"); + rc = -EOPNOTSUPP; + return rc; +#endif + + return 0; +} + +static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + struct device *dev; + + if (!cxlmd || !cxlmd->cxlds) + return -ENODEV; + + dev = cxlmd->cxlds->dev; + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return 0; + + return device_for_each_child(&endpoint->dev, NULL, + cxl_offline_memory); +} + +static int __maybe_unused cxl_decoder_flush_cache(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct resource *res; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr || !cxlr->params.res) + return 0; + + res = cxlr->params.res; + cpu_cache_invalidate_memregion(res->start, resource_size(res)); + return 0; +} + +static int __maybe_unused 
cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + + if (!cxlmd) + return 0; + + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return 0; + + if (!cpu_cache_has_invalidate_memregion()) + return 0; + + device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); + return 0; +} + +/* + * Serialize all CXL reset operations globally. + */ +static DEFINE_MUTEX(cxl_reset_mutex); + +struct cxl_reset_context { + struct pci_dev *target; + struct pci_dev **pci_functions; + int pci_func_count; + int pci_func_cap; +}; + +/* + * Check if a sibling function is non-CXL using the Non-CXL Function Map + * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including + * on any read failure). + */ +static bool cxl_is_non_cxl_function(struct pci_dev *pdev, + u16 func_map_dvsec, int fn) +{ + int reg, bit; + u32 map; + + if (pci_ari_enabled(pdev->bus)) { + reg = fn / 32; + bit = fn % 32; + } else { + reg = 0; + bit = fn; + } + + if (pci_read_config_dword(pdev, + func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4), + &map)) + return false; + + return map & BIT(bit); +} + +struct cxl_reset_walk_ctx { + struct cxl_reset_context *ctx; + u16 func_map_dvsec; + int error; + bool ari; +}; + +static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) +{ + struct cxl_reset_walk_ctx *wctx = data; + struct cxl_reset_context *ctx = wctx->ctx; + struct pci_dev *pdev = ctx->target; + u16 dvsec, cap; + int fn; + + if (func == pdev) + return 0; + + if (!wctx->ari && + PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn)) + return 0; + + fn = wctx->ari ? 
func->devfn : PCI_FUNC(func->devfn); + if (wctx->func_map_dvsec && + cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn)) + return 0; + + /* Only coordinate with siblings that have CXL.cachemem */ + dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return 0; + if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return 0; + if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE | + PCI_DVSEC_CXL_MEM_CAPABLE))) + return 0; + + /* Grow sibling array; double capacity for ARI devices when running out of space */ + if (ctx->pci_func_count >= ctx->pci_func_cap) { + struct pci_dev **new; + int new_cap = ctx->pci_func_cap ? ctx->pci_func_cap * 2 + : CXL_RESET_SIBLINGS_INIT; + + new = krealloc(ctx->pci_functions, + new_cap * sizeof(*new), GFP_KERNEL); + if (!new) { + wctx->error = -ENOMEM; + return 1; + } + ctx->pci_functions = new; + ctx->pci_func_cap = new_cap; + } + + pci_dev_get(func); + ctx->pci_functions[ctx->pci_func_count++] = func; + return 0; +} + +static void cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) + pci_dev_put(ctx->pci_functions[i]); + kfree(ctx->pci_functions); + ctx->pci_functions = NULL; + ctx->pci_func_count = 0; + ctx->pci_func_cap = 0; +} + +static int cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +{ + struct pci_dev *pdev = ctx->target; + struct cxl_reset_walk_ctx wctx; + int i; + + ctx->pci_func_count = 0; + ctx->pci_functions = NULL; + ctx->pci_func_cap = 0; + + wctx.ctx = ctx; + wctx.ari = pci_ari_enabled(pdev->bus); + wctx.error = 0; + wctx.func_map_dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP); + + /* Collect CXL.cachemem siblings under pci_bus_sem */ + pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx); + if (wctx.error) { + cxl_pci_functions_reset_release(ctx); + return wctx.error; + } + + /* Lock and save/disable siblings outside pci_bus_sem */ + 
for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_lock(ctx->pci_functions[i]); + pci_dev_save_and_disable(ctx->pci_functions[i]); + } + + return 0; +} + +static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_restore(ctx->pci_functions[i]); + pci_dev_unlock(ctx->pci_functions[i]); + } + cxl_pci_functions_reset_release(ctx); +} + +/* + * CXL device reset execution + */ +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en) +{ + static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; + u16 cap, ctrl2, status2; + u32 timeout_ms; + int rc, idx; + + if (!pci_wait_for_pending_transaction(pdev)) + pci_err(pdev, "timed out waiting for pending transactions\n"); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap); + if (rc) + return rc; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + /* + * Disable caching and initiate cache writeback+invalidation if the + * device supports it. Poll for completion. + * Per CXL r3.2 section 9.6, software may use the cache size from + * DVSEC CXL Capability2 to compute a suitable timeout; we use a + * default of 10ms. 
+ */ + if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) { + u32 wbi_poll_us = 100; + s32 wbi_remaining_us = 10000; + + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CACHE_WBI; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + do { + usleep_range(wbi_poll_us, wbi_poll_us + 1); + wbi_remaining_us -= wbi_poll_us; + rc = pci_read_config_word(pdev, + dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + } while (!(status2 & PCI_DVSEC_CXL_CACHE_INV) && + wbi_remaining_us > 0); + + if (!(status2 & PCI_DVSEC_CXL_CACHE_INV)) { + pci_err(pdev, "CXL cache WB+I timed out\n"); + return -ETIMEDOUT; + } + } else if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) { + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + if (cap & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &ctrl2); + if (rc) + return rc; + + /* + * Explicitly set or clear RST_MEM_CLR_EN rather than only + * setting it. A previous reset may have left the bit set in + * hardware; if mem_clr_en is false we must clear it so that a + * guest-triggered reset does not unexpectedly scrub DPA. 
+ */ + if (mem_clr_en) + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + else + ctrl2 &= ~PCI_DVSEC_CXL_RST_MEM_CLR_EN; + + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + idx = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, cap); + if (idx >= ARRAY_SIZE(reset_timeout_ms)) + idx = ARRAY_SIZE(reset_timeout_ms) - 1; + timeout_ms = reset_timeout_ms[idx]; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CXL_RST; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + msleep(timeout_ms); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + + if (status2 & PCI_DVSEC_CXL_RST_ERR) { + pci_err(pdev, "CXL reset error\n"); + return -EIO; + } + + if (!(status2 & PCI_DVSEC_CXL_RST_DONE)) { + pci_err(pdev, "CXL reset timeout\n"); + return -ETIMEDOUT; + } + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 &= ~PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset, "CXL"); + +/** + * cxl_dev_reset_locked() - cxl_dev_reset() under cxl_reset_mutex with sibling + * CXL.cachemem function save/restore. + * @pdev: Target CXL function + * @dvsec: CXL DVSEC capability offset (pci_find_dvsec_capability()) + * @mem_clr_en: Pass-through to cxl_dev_reset() (Mem_Clr_Enable in CTRL2) + * + * Return: 0 on success, negative errno from cxl_dev_reset() or sibling + * coordination failure. 
+ */ +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en) +{ + struct cxl_reset_context ctx = { .target = pdev }; + bool siblings_prepared = false; + int rc; + + mutex_lock(&cxl_reset_mutex); + pci_dev_lock(pdev); + + pci_dev_save_and_disable(pdev); + rc = cxl_pci_functions_reset_prepare(&ctx); + if (!rc) { + siblings_prepared = true; + rc = cxl_dev_reset(pdev, dvsec, mem_clr_en); + } + + if (siblings_prepared) + cxl_pci_functions_reset_done(&ctx); + + pci_dev_restore(pdev); + pci_dev_unlock(pdev); + mutex_unlock(&cxl_reset_mutex); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset_locked, "CXL"); + +static int match_memdev_by_parent(struct device *dev, const void *parent) +{ + return is_cxl_memdev(dev) && dev->parent == parent; +} + +static int __cxl_do_reset(struct pci_dev *pdev, struct cxl_memdev *cxlmd, + int dvsec) +{ + struct cxl_reset_context ctx = { .target = pdev }; + bool siblings_prepared = false; + int rc; + + mutex_lock(&cxl_reset_mutex); + pci_dev_lock(pdev); + + if (cxlmd) { + guard(device)(&cxlmd->dev); + + rc = cxl_reset_prepare_memdev(cxlmd); + if (rc) + goto out_unlock; + + cxl_reset_flush_cpu_caches(cxlmd); + } + + pci_dev_save_and_disable(pdev); + + rc = cxl_pci_functions_reset_prepare(&ctx); + if (!rc) { + siblings_prepared = true; + rc = cxl_dev_reset(pdev, dvsec, true); + } + + if (siblings_prepared) + cxl_pci_functions_reset_done(&ctx); + + pci_dev_restore(pdev); + +out_unlock: + pci_dev_unlock(pdev); + mutex_unlock(&cxl_reset_mutex); + + return rc; +} + +static int cxl_do_reset(struct pci_dev *pdev) +{ + int dvsec; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return -ENODEV; + + struct device *memdev __free(put_device) = + bus_find_device(&cxl_bus_type, NULL, &pdev->dev, + match_memdev_by_parent); + if (!memdev) + return __cxl_do_reset(pdev, NULL, dvsec); + + struct cxl_memdev *cxlmd = to_cxl_memdev(memdev); + + return __cxl_do_reset(pdev, cxlmd, dvsec); 
+} + +/* + * CXL reset sysfs attribute management. + * + * The cxl_reset attribute is added to PCI devices that advertise CXL Reset + * capability. Managed entirely by the CXL module via subsys_interface on + * pci_bus_type, avoiding cross-module symbol dependencies between the PCI + * core (built-in) and CXL (potentially modular). + * + * subsys_interface handles existing devices at register time and hot-plug + * add/remove automatically. On unregister, remove_dev runs for all tracked + * devices under bus core serialization. + */ + +bool pci_cxl_reset_capable(struct pci_dev *pdev) +{ + int dvsec; + u16 cap; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return false; + + if (pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return false; + + if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE) || + !(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) + return false; + + return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); +} +EXPORT_SYMBOL_NS_GPL(pci_cxl_reset_capable, "CXL"); + +static ssize_t cxl_reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc; + + if (!sysfs_streq(buf, "1")) + return -EINVAL; + + rc = cxl_do_reset(pdev); + return rc ? 
rc : count; +} +static DEVICE_ATTR_WO(cxl_reset); + +static umode_t cxl_reset_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return a->mode; +} + +static struct attribute *cxl_reset_attrs[] = { + &dev_attr_cxl_reset.attr, + NULL, +}; + +static const struct attribute_group cxl_reset_attr_group = { + .attrs = cxl_reset_attrs, + .is_visible = cxl_reset_attr_is_visible, +}; + +static int cxl_reset_add_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return sysfs_create_group(&dev->kobj, &cxl_reset_attr_group); +} + +static void cxl_reset_remove_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return; + + sysfs_remove_group(&dev->kobj, &cxl_reset_attr_group); +} + +static struct subsys_interface cxl_reset_interface = { + .name = "cxl_reset", + .subsys = &pci_bus_type, + .add_dev = cxl_reset_add_dev, + .remove_dev = cxl_reset_remove_dev, +}; + +void cxl_reset_sysfs_init(void) +{ + int rc; + + rc = subsys_interface_register(&cxl_reset_interface); + if (rc) + pr_warn("CXL: failed to register cxl_reset interface (%d)\n", + rc); +} + +void cxl_reset_sysfs_exit(void) +{ + subsys_interface_unregister(&cxl_reset_interface); +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5aacd7054f1d245d8334d6397530da44ce5130e..f95f0bdd7b90025f1c0a2a58b10c1ac625e4e704 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2530,6 +2530,8 @@ static __init int cxl_core_init(void) if (rc) goto err_ras; + cxl_reset_sysfs_init(); + return 0; err_ras: @@ -2545,6 +2547,7 @@ static __init int cxl_core_init(void) static void cxl_core_exit(void) { + cxl_reset_sysfs_exit(); cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); 
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e50dc716d4e820df278840814eed91483c461cd8..779dfa81a7ff59b32e476cf42fd74dd17a8cd893 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -732,6 +733,141 @@ static int free_hpa(struct cxl_region *cxlr) return 0; } +struct cxlrd_max_context { + struct device * const *host_bridges; + int interleave_ways; + unsigned long flags; + resource_size_t max_hpa; + struct cxl_root_decoder *cxlrd; +}; + +static int find_max_hpa(struct device *dev, void *data) +{ + struct cxlrd_max_context *ctx = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_root_decoder *cxlrd; + struct resource *res, *prev; + struct cxl_decoder *cxld; + resource_size_t free = 0; + resource_size_t max; + int found = 0; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + cxlsd = &cxlrd->cxlsd; + cxld = &cxlsd->cxld; + + if ((cxld->flags & ctx->flags) != ctx->flags) { + dev_dbg(dev, "flags not matching: %08lx vs %08lx\n", + cxld->flags, ctx->flags); + return 0; + } + + for (int i = 0; i < ctx->interleave_ways; i++) { + for (int j = 0; j < ctx->interleave_ways; j++) { + if (ctx->host_bridges[i] == cxlsd->target[j]->dport_dev) { + found++; + break; + } + } + } + + if (found != ctx->interleave_ways) { + dev_dbg(dev, + "Not enough host bridges. 
Found %d for %d interleave ways requested\n", + found, ctx->interleave_ways); + return 0; + } + + lockdep_assert_held_read(&cxl_rwsem.region); + res = cxlrd->res->child; + + if (!res) + max = resource_size(cxlrd->res); + else + max = 0; + + for (prev = NULL; res; prev = res, res = res->sibling) { + if (!prev && res->start == cxlrd->res->start && + res->end == cxlrd->res->end) { + max = resource_size(cxlrd->res); + break; + } + if (prev && !resource_size(prev)) + continue; + + if (!prev && res->start > cxlrd->res->start) { + free = res->start - cxlrd->res->start; + max = max(free, max); + } + if (prev && res->start > prev->end + 1) { + free = res->start - (prev->end + 1); /* ->end is inclusive: the gap starts at prev->end + 1 */ + max = max(free, max); + } + } + + if (prev && prev->end + 1 < cxlrd->res->end + 1) { + free = cxlrd->res->end - prev->end; /* bytes after the last child up to and including the window's ->end */ + max = max(free, max); + } + + dev_dbg(&cxlrd->cxlsd.cxld.dev, "found %pa bytes of free space\n", &max); + if (max > ctx->max_hpa) { + if (ctx->cxlrd) + put_device(&ctx->cxlrd->cxlsd.cxld.dev); + get_device(&cxlrd->cxlsd.cxld.dev); /* pin the current best root decoder for the caller */ + ctx->cxlrd = cxlrd; + ctx->max_hpa = max; + } + return 0; +} + +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max_avail_contig) +{ + struct cxlrd_max_context ctx = { + .flags = flags, + .interleave_ways = interleave_ways, + }; + struct cxl_port *root_port; + struct cxl_port *endpoint; + + endpoint = cxlmd->endpoint; + if (!endpoint) { + dev_dbg(&cxlmd->dev, "endpoint not linked to memdev\n"); + return ERR_PTR(-ENXIO); + } + + ctx.host_bridges = &endpoint->host_bridge; /* NOTE(review): points at a single element, but find_max_hpa() indexes host_bridges[] up to interleave_ways - confirm callers only pass 1 */ + + struct cxl_root *root __free(put_cxl_root) = find_cxl_root(endpoint); + if (!root) { + dev_dbg(&endpoint->dev, "endpoint is not related to a root port\n"); + return ERR_PTR(-ENXIO); + } + + root_port = &root->port; + scoped_guard(rwsem_read, &cxl_rwsem.region) + device_for_each_child(&root_port->dev, &ctx, find_max_hpa); + + if (!ctx.cxlrd) + return ERR_PTR(-ENOMEM); + + 
*max_avail_contig = ctx.max_hpa; + return ctx.cxlrd; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hpa_freespace, "CXL"); + +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd) +{ + put_device(&cxlrd->cxlsd.cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_put_root_decoder, "CXL"); + static ssize_t size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -1423,57 +1559,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) return 0; } +static inline u64 get_selector(u64 ways, u64 gran) +{ + if (!is_power_of_2(ways)) + ways /= 3; + + if (!is_power_of_2(ways) || !is_power_of_2(gran)) + return 0; + + return (ways - 1) * gran; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = cxlr->cxlrd; - int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_ep *ep = cxl_ep_load(port, cxlmd); struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; - struct cxl_switch_decoder *cxlsd; + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev); struct cxl_port *iter = port; - u16 eig, peig; - u8 eiw, peiw; + int ig, iw = cxl_rr->nr_targets, rc, pos = cxled->pos; + int distance, parent_distance; + u64 selector, cxlr_sel; + u16 eig; + u8 eiw; /* * While root level decoders support x3, x6, x12, switch level * decoders only support powers of 2 up to x16. 
*/ - if (!is_power_of_2(cxl_rr->nr_targets)) { + if (!is_power_of_2(iw)) { dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - cxl_rr->nr_targets); + dev_name(port->uport_dev), dev_name(&port->dev), iw); return -EINVAL; } - cxlsd = to_cxl_switch_decoder(&cxld->dev); - if (cxl_rr->nr_targets_set) { - int i, distance = 1; - struct cxl_region_ref *cxl_rr_iter; + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } - /* - * The "distance" between peer downstream ports represents which - * endpoint positions in the region interleave a given port can - * host. - * - * For example, at the root of a hierarchy the distance is - * always 1 as every index targets a different host-bridge. At - * each subsequent switch level those ports map every Nth region - * position where N is the width of the switch == distance. - */ - do { - cxl_rr_iter = cxl_rr_load(iter, cxlr); - distance *= cxl_rr_iter->nr_targets; - iter = to_cxl_port(iter->dev.parent); - } while (!is_cxl_root(iter)); - distance *= cxlrd->cxlsd.cxld.interleave_ways; + /* + * Calculate the effective granularity and ways to determine + * HPA bits used as target selectors of the interleave set. + * Use this to check if the root decoder and all subsequent + * HDM decoders only use bits from that range as selectors. + * + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. + */ + + /* Start with the root decoders selector and distance. 
*/ + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways, + cxlrd->cxlsd.cxld.interleave_granularity); + distance = cxlrd->cxlsd.cxld.interleave_ways; + if (!is_power_of_2(distance)) + distance /= 3; + + for (iter = parent_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr); + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder; + u64 cxld_sel; + + if (cxld_iter->interleave_ways == 1) + continue; + + cxld_sel = get_selector(cxld_iter->interleave_ways, + cxld_iter->interleave_granularity); + + if (cxld_sel & selector) { + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxld_sel, selector); + return -ENXIO; + } + + selector |= cxld_sel; + distance *= cxl_rr_iter->nr_targets; + } + + parent_distance = distance; + distance *= iw; + + /* The combined selector bits must fit the region selector. */ + cxlr_sel = get_selector(p->interleave_ways, + p->interleave_granularity); + + if ((cxlr_sel & selector) != selector) { + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxlr_sel, selector); + return -ENXIO; + } + + /* Calculate remaining selector bits available for use. */ + selector = cxlr_sel & ~selector; - for (i = 0; i < cxl_rr->nr_targets_set; i++) + if (cxl_rr->nr_targets_set) { + for (int i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, distance); @@ -1484,87 +1682,40 @@ static int cxl_port_setup_targets(struct cxl_port *port, goto add_target; } - if (is_cxl_root(parent_port)) { + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + ig = cxld->interleave_granularity; + else /* + * Set the interleave granularity with each interleave + * level to a multiple of it's parent port interleave + * ways. 
Beginning with the granularity of the root + * decoder set to the region granularity (starting + * with the inner selector bits of the HPA), the + * granularity is increased with each level. Calculate + * this using the parent distance and region + * granularity. + * * Root decoder IG is always set to value in CFMWS which * may be different than this region's IG. We can use the * region's IG here since interleave_granularity_store() * does not allow interleaved host-bridges with * root IG != region IG. */ - parent_ig = p->interleave_granularity; - parent_iw = cxlrd->cxlsd.cxld.interleave_ways; - /* - * For purposes of address bit routing, use power-of-2 math for - * switch ports. - */ - if (!is_power_of_2(parent_iw)) - parent_iw /= 3; - } else { - struct cxl_region_ref *parent_rr; - struct cxl_decoder *parent_cxld; - - parent_rr = cxl_rr_load(parent_port, cxlr); - parent_cxld = parent_rr->decoder; - parent_ig = parent_cxld->interleave_granularity; - parent_iw = parent_cxld->interleave_ways; - } + ig = p->interleave_granularity * parent_distance; - rc = granularity_to_eig(parent_ig, &peig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_ig); - return rc; - } - - rc = ways_to_eiw(parent_iw, &peiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_iw); - return rc; - } - - iw = cxl_rr->nr_targets; rc = ways_to_eiw(iw, &eiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), iw); - return rc; - } - - /* - * Interleave granularity is a multiple of @parent_port granularity. - * Multiplier is the parent port interleave ways. 
- */ - rc = granularity_to_eig(parent_ig * parent_iw, &eig); - if (rc) { - dev_dbg(&cxlr->dev, - "%s: invalid granularity calculation (%d * %d)\n", - dev_name(&parent_port->dev), parent_ig, parent_iw); - return rc; - } + if (!rc) + rc = granularity_to_eig(ig, &eig); - rc = eig_to_granularity(eig, &ig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n", dev_name(port->uport_dev), dev_name(&port->dev), - 256 << eig); - return rc; - } - - if (iw > 8 || iw > cxlsd->nr_targets) { - dev_dbg(&cxlr->dev, - "%s:%s:%s: ways: %d overflows targets: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - dev_name(&cxld->dev), iw, cxlsd->nr_targets); + iw, ig, selector); return -ENXIO; } if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - (iw > 1 && cxld->interleave_granularity != ig) || !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, @@ -2277,7 +2428,9 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled->part = -1; if (p->state > CXL_CONFIG_ACTIVE) { - cxl_region_decode_reset(cxlr, p->interleave_ways); + if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } @@ -2559,6 +2712,62 @@ static void unregister_region(void *_cxlr) put_device(&cxlr->dev); } +static void cxl_endpoint_region_autoremove(void *_cxlr); + +static void cxl_region_release_action(struct cxl_region *cxlr) +{ + struct cxl_port *port = cxlrd_to_port(cxlr->cxlrd); + + if (cxlr->type != CXL_DECODER_DEVMEM) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return; + } + + if (cxlr->params.nr_targets) { + struct cxl_endpoint_decoder *cxled = cxlr->params.targets[0]; + struct cxl_port *endpoint = cxled_to_port(cxled); + + guard(device)(&endpoint->dev); + if (cxlr->detach) { + void 
(*detach)(void *data) = cxlr->detach; + void *detach_data = cxlr->detach_data; + + cxlr->detach = NULL; + cxlr->detach_data = NULL; + devm_release_action(&endpoint->dev, detach, detach_data); + devm_release_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + } else { + unregister_region(cxlr); + } + return; + } + + unregister_region(cxlr); +} + +void cxl_unregister_region(struct cxl_region *cxlr) +{ + cxl_region_release_action(cxlr); +} +EXPORT_SYMBOL_NS_GPL(cxl_unregister_region, "CXL"); + +int cxl_get_region_range(struct cxl_region *region, struct range *range) +{ + if (WARN_ON_ONCE(!region)) + return -ENODEV; + + if (!region->params.res) + return -ENOSPC; + + range->start = region->params.res->start; + range->end = region->params.res->end; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_region_range, "CXL"); + static struct lock_class_key cxl_region_key; static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) @@ -2711,9 +2920,16 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); - if (rc) - return ERR_PTR(rc); + /* + * For accelerators/type2, region release linked to endpoint device. + * See handling of cxl_endpoint_region_autoremove() below by + * cxl_memdev_attach_region(). 
+ */ + if (type == CXL_DECODER_HOSTONLYMEM) { + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + } dev_dbg(port->uport_dev, "%s: created %s\n", dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); @@ -2764,7 +2980,6 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, memregion_free(rc); return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, target_type); } @@ -2836,14 +3051,13 @@ static ssize_t delete_region_store(struct device *dev, const char *buf, size_t len) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); - struct cxl_port *port = to_cxl_port(dev->parent); struct cxl_region *cxlr; cxlr = cxl_find_region_by_name(cxlrd, buf); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); put_device(&cxlr->dev); return len; @@ -3709,11 +3923,18 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, { struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; + if (part < 0 || part >= cxlds->nr_partitions) { + dev_err(cxlmd->dev.parent, + "%s:%s: invalid partition index %d (max %u)\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + part, cxlds->nr_partitions); + return ERR_PTR(-ENXIO); + } + do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, atomic_read(&cxlrd->region_id), @@ -3730,13 +3951,108 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = __construct_region(cxlr, ctx); if (rc) { - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); return ERR_PTR(rc); } return cxlr; } +DEFINE_FREE(cxl_region_release, struct cxl_region *, + if (!IS_ERR_OR_NULL(_T)) 
cxl_region_release_action(_T)) + +static struct cxl_region * +__construct_new_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, int ways) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled[0]); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p; + resource_size_t size = 0; + int rc, i, part = READ_ONCE(cxled[0]->part); + + if (part < 0 || part >= cxlds->nr_partitions) { + dev_err(cxlmd->dev.parent, + "%s:%s: invalid partition index %d (max %u)\n", + dev_name(&cxlmd->dev), dev_name(&cxled[0]->cxld.dev), + part, cxlds->nr_partitions); + return ERR_PTR(-ENXIO); + } + + struct cxl_region *cxlr __free(cxl_region_release) = + __create_region(cxlrd, cxlds->part[part].mode, + atomic_read(&cxlrd->region_id), + cxled[0]->cxld.target_type); + if (IS_ERR(cxlr)) + return cxlr; + + guard(rwsem_write)(&cxl_rwsem.region); + + p = &cxlr->params; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s unexpected region state\n", + dev_name(&cxlmd->dev), dev_name(&cxled[0]->cxld.dev), + __func__); + return ERR_PTR(-EBUSY); + } + + if (ways < 1) + return ERR_PTR(-EINVAL); + + p->interleave_ways = ways; + p->interleave_granularity = cxld->interleave_granularity; + + scoped_guard(rwsem_read, &cxl_rwsem.dpa) { + for (i = 0; i < ways; i++) { + if (!cxled[i]->dpa_res) + return ERR_PTR(-EINVAL); + size += resource_size(cxled[i]->dpa_res); + } + + rc = alloc_hpa(cxlr, size); + if (rc) + return ERR_PTR(rc); + + for (i = 0; i < ways; i++) { + rc = cxl_region_attach(cxlr, cxled[i], 0); + if (rc) + return ERR_PTR(rc); + } + } + + rc = cxl_region_decode_commit(cxlr); + if (rc) + return ERR_PTR(rc); + + p->state = CXL_CONFIG_COMMIT; + + return no_free_ptr(cxlr); +} + +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways) +{ + struct cxl_region *cxlr; + + mutex_lock(&cxlrd->range_lock); + cxlr = 
__construct_new_region(cxlrd, cxled, ways); + mutex_unlock(&cxlrd->range_lock); + if (IS_ERR(cxlr)) + return cxlr; + + if (device_attach(&cxlr->dev) <= 0) { + dev_err(&cxlr->dev, "failed to create region\n"); + cxl_region_release_action(cxlr); + return ERR_PTR(-ENODEV); + } + + return cxlr; +} +EXPORT_SYMBOL_NS_GPL(cxl_create_region, "CXL"); + static struct cxl_region * cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa_range) @@ -4018,6 +4334,12 @@ bool cxl_region_contains_resource(const struct resource *res) } EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem"); +bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return cxl_region_contains_resource(res); +} +EXPORT_SYMBOL_GPL(cxl_region_contains_soft_reserve); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; @@ -4043,6 +4365,135 @@ static int cxl_region_can_probe(struct cxl_region *cxlr) return 0; } +static int first_mapped_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (cxled->cxld.region) + return 1; + + return 0; +} + +/* + * As this is running in endpoint port remove context it does not race cxl_root + * destruction since port topologies are always removed depth first. + */ +static void cxl_endpoint_region_autoremove(void *_cxlr) +{ + unregister_region(_cxlr); +} + +/** + * cxl_memdev_attach_region - bind region to accelerator memdev + * + * @cxlmd: a pointer to cxl_memdev to use + * @attach: a pointer to region attach struct with callbacks for + * safely working with a region range by the caller + * + * Returns 0 or error. 
+ */ +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, + struct cxl_attach_region *attach) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + int rc; + + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + if (!endpoint) + return -ENXIO; + + { + /* hold endpoint lock to setup autoremove of the region */ + guard(device)(&endpoint->dev); + if (!endpoint->dev.driver) + return -ENXIO; + + { + guard(rwsem_read)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + + /* + * TODO auto-instantiate a region, for now assume this will + * find an auto-region. + */ + struct device *dev __free(put_device) = + device_find_child(&endpoint->dev, NULL, + first_mapped_decoder); + + if (!dev) { + dev_dbg(cxlmd->cxlds->dev, + "no region found for memdev %s\n", + dev_name(&cxlmd->dev)); + return -ENXIO; + } + + cxled = to_cxl_endpoint_decoder(dev); + cxlr = cxled->cxld.region; + + if (cxlr->params.state < CXL_CONFIG_COMMIT) { + dev_dbg(cxlmd->cxlds->dev, + "region %s not committed for memdev %s\n", + dev_name(&cxlr->dev), dev_name(&cxlmd->dev)); + return -ENXIO; + } + + if (cxlr->params.nr_targets > 1) { + dev_dbg(cxlmd->cxlds->dev, + "Only attach to local non-interleaved region\n"); + return -ENXIO; + } + + attach->region = (struct range) { + .start = cxlr->params.res->start, + .end = cxlr->params.res->end, + }; + + /* + * With endpoint locked leave the caller to safely work + * with the region range. + */ + rc = attach->attach(attach->data); + if (rc) + return rc; + + /* Only teardown regions that pass validation, ignore the rest */ + rc = devm_add_action(&endpoint->dev, + cxl_endpoint_region_autoremove, cxlr); + if (rc) { + attach->detach(attach->data); + goto err_unregister; + } + + /* Link type2 driver callback for stopping use of the region range. 
*/ + rc = devm_add_action_or_reset(&endpoint->dev, + attach->detach, attach->data); + if (rc) { + devm_remove_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + goto err_unregister; + } + + cxlr->detach = attach->detach; + cxlr->detach_data = attach->data; + + return 0; + } +err_unregister: + unregister_region(cxlr); + return rc; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_attach_region, "CXL"); + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); @@ -4053,6 +4504,13 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* + * HDM-D[B] (device-memory) regions have accelerator specific usage. + * Skip device-dax registration. + */ + if (cxlr->type == CXL_DECODER_DEVMEM) + return 0; + /* * From this point on any path that changes the region's state away from * CXL_CONFIG_COMMIT is also responsible for releasing the driver. diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 93710cf4f0a6971f2176c9acdad03c8516ec6a89..c73a05742be024a1381bd022694dbbffde4597ac 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -84,6 +84,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, decoder_cnt = cxl_hdm_decoder_count(hdr); length = 0x20 * decoder_cnt + 0x10; rmap = &map->hdm_decoder; + rmap->count = decoder_cnt; break; } case CXL_CM_CAP_CAP_ID_RAS: @@ -276,6 +277,19 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, u64 offset = ((u64)reg_hi << 32) | (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); + /* + * The BIR field is 3 bits wide (CXL spec); values 6 and 7 are + * reserved. PCI only defines BAR 0-5, and pci_resource_*() on a + * higher index reads past the resource array. Reject those here + * so callers do not get garbage. 
+ */ + if (bar >= PCI_STD_NUM_BARS) { + dev_warn(&pdev->dev, + "Reserved BIR %d in Register Locator entry (type %d)\n", + bar, reg_type); + return false; + } + if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, @@ -286,9 +300,44 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; + map->bar_index = bar; + map->bar_offset = offset; return true; } +/** + * cxl_regblock_get_bar_info() - Get BAR index and offset for a BAR-backed + * regblock + * @map: Register map from cxl_find_regblock() or cxl_find_regblock_instance() + * @bar_index: Output BAR index (0-5). Optional, may be NULL. + * @bar_offset: Output offset within the BAR. Optional, may be NULL. + * + * When the register block was found via the Register Locator DVSEC and + * lives in a PCI BAR (BIR 0-5), this returns the BAR index and the offset + * within that BAR. + * + * Return: 0 if the regblock is BAR-backed (bar_index <= 5), -EINVAL otherwise. + */ +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset) +{ + if (!map || map->bar_index == 0xff) + return -EINVAL; + /* + * Guard callers against stale or out-of-range bar_index. Only BAR + * indices 0..5 are valid PCI BARs; anything else means the map was + * not BAR-backed or was filled from a reserved BIR. + */ + if (map->bar_index >= PCI_STD_NUM_BARS) + return -EINVAL; + if (bar_index) + *bar_index = map->bar_index; + if (bar_offset) + *bar_offset = map->bar_offset; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_regblock_get_bar_info, "CXL"); + /* * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index * Use CXL_INSTANCES_COUNT for @index if counting instances. 
@@ -307,6 +356,7 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty *map = (struct cxl_register_map) { .host = &pdev->dev, + .bar_index = 0xFF, .resource = CXL_RESOURCE_NONE, }; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 1297594beaec31f1e49a8ae05dbcca00e267111b..08755b9404e6966f515c48deb77b5418276effe0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -13,6 +13,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -24,72 +25,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; * (port-driver, region-driver, nvdimm object-drivers... etc). */ -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -#define CXL_CM_CAP_CAP_ID_RAS 0x2 -#define CXL_CM_CAP_CAP_ID_HDM 0x5 -#define CXL_CM_CAP_CAP_HDM_VERSION 1 - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) 
+ 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - -/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ -#define CXL_DECODER_MIN_GRANULARITY 256 -#define CXL_DECODER_MAX_ENCODED_IG 6 - -static inline int cxl_hdm_decoder_count(u32 cap_hdr) -{ - int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - - switch (val) { - case 0: - return 1; - case 1 ... 8: - return val * 2; - case 9 ... 
12: - return (val - 4) * 4; - default: - return -ENXIO; - } -} - /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ static inline int eig_to_granularity(u16 eig, unsigned int *granularity) { @@ -223,13 +158,16 @@ int cxl_map_device_regs(const struct cxl_register_map *map, int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); #define CXL_INSTANCES_COUNT -1 -enum cxl_regloc_type; int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); -int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); -int cxl_setup_regs(struct cxl_register_map *map); +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, + u8 *bar_index, resource_size_t *bar_offset); +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); @@ -296,14 +234,12 @@ struct cxl_decoder { }; /* - * Track whether this decoder is free for userspace provisioning, reserved for - * region autodiscovery, whether it is started connecting (awaiting other - * peers), or has completed auto assembly. + * Track whether this decoder is reserved for region autodiscovery, or + * free for userspace provisioning. 
*/ enum cxl_decoder_state { CXL_DECODER_STATE_MANUAL, CXL_DECODER_STATE_AUTO, - CXL_DECODER_STATE_AUTO_STAGED, }; /** @@ -455,6 +391,8 @@ struct cxl_region_params { * @hpa_range: Address range occupied by the region * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type + * @detach: accelerator detach callback for device-memory regions + * @detach_data: accelerator detach callback data * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge * @flags: Region state flags @@ -470,6 +408,8 @@ struct cxl_region { struct range hpa_range; enum cxl_partition_mode mode; enum cxl_decoder_type type; + void (*detach)(void *data); + void *detach_data; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; unsigned long flags; @@ -723,7 +663,6 @@ DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev)) DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) -DEFINE_FREE(put_cxl_dax_region, struct cxl_dax_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); void cxl_bus_rescan(void); @@ -792,6 +731,9 @@ int cxl_port_setup_regs(struct cxl_port *port, resource_size_t component_reg_phys); struct cxl_dev_state; +int cxl_await_range_active(struct cxl_dev_state *cxlds); +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, struct cxl_endpoint_dvsec_info *info); @@ -855,7 +797,6 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_endpoint_decoder *cxled); struct cxl_dax_region 
*to_cxl_dax_region(struct device *dev); u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa); -bool cxl_region_contains_resource(const struct resource *res); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -878,10 +819,6 @@ static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, { return 0; } -static inline bool cxl_region_contains_resource(const struct resource *res) -{ - return false; -} #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 776c50d1db5186b0a3cf33816ef8f13aa712dbbf..92cca400d113ff8c5d95d37ca29f6e9448e1d2f0 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,10 +34,6 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) -struct cxl_memdev_attach { - int (*probe)(struct cxl_memdev *cxlmd); -}; - /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -103,8 +99,6 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, - const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index fcffe24dcb42f80510ccf79511af8f0d97813dc2..ff858318091f18fbd1c3003d539d308a329a5632 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -65,6 +65,26 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static void cxl_memdev_poison_enable(struct cxl_memdev_state *mds, + struct cxl_memdev *cxlmd, + struct dentry *dentry) +{ + /* + * Avoid poison debugfs for DEVMEM aka accelerators as they rely on + * cxl_memdev_state. 
+ */ + if (!mds) + return; + + if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) + debugfs_create_file("inject_poison", 0200, dentry, cxlmd, + &cxl_poison_inject_fops); + + if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) + debugfs_create_file("clear_poison", 0200, dentry, cxlmd, + &cxl_poison_clear_fops); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -92,12 +112,7 @@ static int cxl_mem_probe(struct device *dev) dentry = cxl_debugfs_create_dir(dev_name(dev)); debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); - if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) - debugfs_create_file("inject_poison", 0200, dentry, cxlmd, - &cxl_poison_inject_fops); - if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) - debugfs_create_file("clear_poison", 0200, dentry, cxlmd, - &cxl_poison_clear_fops); + cxl_memdev_poison_enable(mds, cxlmd, dentry); rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); if (rc) @@ -206,16 +221,24 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +static bool cxl_poison_attr_visible(struct kobject *kobj, struct attribute *a) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); - if (a == &dev_attr_trigger_poison_list.attr) - if (!test_bit(CXL_POISON_ENABLED_LIST, - mds->poison.enabled_cmds)) - return 0; + if (!mds || + !test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return false; + + return true; +} + +static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_trigger_poison_list.attr && + !cxl_poison_attr_visible(kobj, a)) + return 0; return a->mode; } diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 
bace662dc9882cbbee85b9cb489a185a8dd4f3a4..deafa5bae2c79eae2ed84031195e81c8c6e086fe 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -977,6 +977,7 @@ static void cxl_reset_done(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct cxl_port *endpoint; struct device *dev = &pdev->dev; /* @@ -986,14 +987,11 @@ static void cxl_reset_done(struct pci_dev *pdev) * that no longer exists. */ guard(device)(&cxlmd->dev); - if (!cxlmd->dev.driver) + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) return; - if (cxlmd->endpoint && - cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { - device_for_each_child(&cxlmd->endpoint->dev, NULL, - cxl_endpoint_decoder_clear_reset_flags); - + if (cxl_endpoint_decoder_reset_detected(endpoint)) { dev_crit(dev, "SBR happened without memory regions removal.\n"); dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 296bf01e185ecacc388ebc69e92706c99e47c814..c3f54fc8ccc078a49767927bb6239dd2cf95480f 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -70,6 +70,8 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/cxl/Kconfig" + source "drivers/vfio/pci/qat/Kconfig" source "drivers/vfio/pci/xe/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 6138f1bf241df04e7419f196b404abdf9b194050..5fe6fe78b0cf7759a06755da1f4ee2a40adba3bf 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o cxl/vfio_cxl_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o 
vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/Kconfig b/drivers/vfio/pci/cxl/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..fad53300fecfb1599f02049eb77ffac6afe804b7 --- /dev/null +++ b/drivers/vfio/pci/cxl/Kconfig @@ -0,0 +1,9 @@ +config VFIO_CXL_CORE + bool "VFIO CXL core" + depends on VFIO_PCI_CORE && CXL_BUS && CXL_MEM + help + Extends vfio-pci-core with CXL.mem passthrough for vendor-specific + CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or HDM-DB + decoders without the standard CXL memory expander class code + (PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 accelerators and + non-class-code Type-3 variants (e.g. compressed memory devices). diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c new file mode 100644 index 0000000000000000000000000000000000000000..aeecce1a3d50c8dba80302b7a190f21f78f19741 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -0,0 +1,583 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CXL DVSEC configuration space emulation for vfio-pci. + * + * Integrates into the existing vfio-pci-core ecap_perms[] framework using + * vdev->vconfig as the sole shadow buffer for DVSEC registers. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +static inline u16 _cxlds_get_dvsec(struct vfio_pci_cxl_state *cxl) +{ + return (u16)cxl->cxlds.cxl_dvsec; +} + +/* Helpers to access vdev->vconfig at a DVSEC-relative offset */ +static inline u16 dvsec_virt_read16(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return get_unaligned_le16(vdev->vconfig + dvsec + off); +} + +static inline void dvsec_virt_write16(struct vfio_pci_core_device *vdev, + u16 off, u16 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + put_unaligned_le16(val, vdev->vconfig + dvsec + off); +} + +static inline u32 dvsec_virt_read32(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return get_unaligned_le32(vdev->vconfig + dvsec + off); +} + +static inline void dvsec_virt_write32(struct vfio_pci_core_device *vdev, + u16 off, u32 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + put_unaligned_le32(val, vdev->vconfig + dvsec + off); +} + +static u32 dvsec_virt_merge_write32(struct vfio_pci_core_device *vdev, + u16 off, u16 byte_in_reg, + int count, __le32 val) +{ + u32 cur = dvsec_virt_read32(vdev, off); + u32 data = le32_to_cpu(val); + u32 mask; + + if (byte_in_reg + count > sizeof(u32)) + return cur; + + if (count == sizeof(u32)) + return data; + + mask = (1U << (count * 8)) - 1; + mask <<= byte_in_reg * 8; + + return (cur & ~mask) | ((data << (byte_in_reg * 8)) & mask); +} + +/* Individual DVSEC register write handlers */ + +static void cxl_dvsec_control_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 lock = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL_RESERVED_MASK; + + if (lock & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; /* register is locked after first write */ + + if (!(cap3 & CXL_DVSEC_CAP3_P2P_MEM_CAPABLE)) + 
rev_mask |= CXL_CTRL_P2P_REV_MASK; + + new_val &= ~rev_mask; + new_val |= CXL_DVSEC_CTRL_IO_ENABLE; /* IO_Enable always returns 1 */ + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, new_val); +} + +static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS_OFFSET); + + /* + * VIRAL_STATUS (bit 14) is the only writable bit; all others are + * reserved and always zero. + */ + new_val = cur_val & ~(new_val & CXL_DVSEC_STATUS_VIRAL_STATUS); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); +} + +/** + * vfio_cxl_reset - Service a guest CXL protocol reset. + * @vdev: VFIO PCI core device + * + * Unlike cxl_do_reset(), no host memory offlining is performed: the DPA + * region is guest memory, not host RAM. + * + * memory_lock is held for the entire sequence so neither BAR nor DPA + * mappings can fault back in. INIT_CXL_RST is not forwarded to hardware; + * cxl_dev_reset() drives the state machine directly. + * + * STATUS2 outcome bits are written back to vconfig on return so that the + * guest can poll for completion without going to hardware. + * + * Return: 0 on success, negative error code on failure. + */ +static int vfio_cxl_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(cxl); + u16 hw_status2 = 0; + int ret; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + + /* + * CXL r4.0 Table 8-9: device must clear CXL_Reset_Complete before + * starting the reset flow, on the 0->1 transition of Initiate_CXL_Reset. + * Clear both reset outcome bits so a polling guest sees an unambiguous + * in-progress state rather than a stale result from a prior attempt. 
+ */ + { + u16 s = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + + s &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, s); + } + + vfio_cxl_prepare_reset(vdev); + + /* + * Hand the actual reset off to cxl_dev_reset_locked() so the CXL core + * applies its global reset mutex and saves/disables any CXL.cachemem + * sibling functions on the bus. A bare cxl_dev_reset() under just + * pci_dev_lock() leaves those siblings vulnerable to half-reset states + * and lets a guest-triggered CXL reset race a concurrent host sysfs + * reset. + */ + ret = cxl_dev_reset_locked(pdev, cxl->cxlds.cxl_dvsec, + !!(dvsec_virt_read16(vdev, + CXL_DVSEC_CONTROL2_OFFSET) & + CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE)); + + vfio_cxl_finish_reset(vdev); + + /* + * Re-read STATUS2 from hardware after restore. cxl_save_dvsec() / + * cxl_restore_dvsec() cover CTRL, CTRL2, range_base_*, and LOCK; + * STATUS2 is intentionally not saved or restored across the reset, so + * the hardware value here is fresh post-reset (both outcome bits clear) + * and reflects genuine hardware changes such as VOLATILE_HDM_PRES_ERROR + * clearing. Stamp the new outcome on top of that value below. 
+ */ + pci_read_config_word(pdev, dvsec + CXL_DVSEC_STATUS2_OFFSET, + &hw_status2); + hw_status2 &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + if (ret) + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + else + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, hw_status2); + + up_write(&vdev->memory_lock); + return ret; +} + +static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_CONTROL2_OFFSET; + u16 cap2 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY2_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL2_RESERVED_MASK; + + if (!(cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY)) + rev_mask |= CXL_CTRL2_VOLATILE_HDM_REV_MASK; + if (!(cap2 & CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE)) + rev_mask |= CXL_CTRL2_MODIFIED_COMP_REV_MASK; + + new_val &= ~rev_mask; + + /* + * Cache WBI: forward to hardware. cxl_dev_reset() programs + * Disable_Caching first and then initiates Cache WBI with that bit + * still set; replicate that ordering for VMM-delegated WBI by carrying + * the just-written Disable_Caching value into the same hardware write. + * new_val is the post-merge 16-bit shadow value, so it already reflects + * a prior shadow-only Disable_Caching=1 followed by a WBI-only write. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI) { + u16 hw_val = CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI; + + if (new_val & CXL_DVSEC_CTRL2_DISABLE_CACHING) + hw_val |= CXL_DVSEC_CTRL2_DISABLE_CACHING; + pci_write_config_word(pdev, abs_off, hw_val); + } + + /* + * Commit the new CONTROL2 value to the shadow before triggering a + * reset. 
vfio_cxl_reset() reads Mem_Clr_Enable (bit 3) from the + * shadow; if the shadow is written after the reset call, a guest write + * that changes bit 3 in the same access as INITIATE_CXL_RESET would + * reset with the stale bit 3 value instead of the one just written. + */ + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, + new_val & ~CXL_CTRL2_HW_BITS_MASK); + + /* + * INIT_CXL_RST: not forwarded to hardware. cxl_dev_reset() drives + * the state machine; forwarding it after the reset would fire a + * second one. Drop writes on non-RST_CAPABLE devices silently; the + * spec reserves the bit there and logging every write is just noise. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) { + if (vfio_cxl_reset_capable(vdev)) { + int rc = vfio_cxl_reset(vdev); + + if (rc) + pci_warn(pdev, + "vfio-cxl: CXL reset failed (%d)\n", + rc); + } + } +} + +static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; + + /* + * VOLATILE_HDM_PRES_ERROR (bit 3) and PM_INIT_COMPLETION (bit 15) are + * RW1CS. Forward each to hardware on a 1-bit write, then mirror the + * clear into the shadow so guest reads (which now come from the + * shadow) do not see the bit stuck after a successful clear. + * + * All other STATUS2 bits are RO hardware outputs; ignore guest writes. 
+ */ + if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) { + u16 v; + + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } + + if (new_val & CXL_DVSEC_STATUS2_PM_INIT_COMPLETION) { + u16 v; + + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_PM_INIT_COMPLETION); + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_PM_INIT_COMPLETION; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } +} + +static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + + /* Once the LOCK bit is set it can only be cleared by conventional reset */ + if (cur_val & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; + + new_val &= ~CXL_LOCK_RESERVED_MASK; + dvsec_virt_write16(vdev, CXL_DVSEC_LOCK_OFFSET, new_val); +} + +static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, + u16 dvsec_off, u32 new_val) +{ + new_val &= ~CXL_BASE_LO_RESERVED_MASK; + dvsec_virt_write32(vdev, dvsec_off, new_val); +} + +/* + * status2_hw_shadow_merge - read STATUS2, merging hardware and vconfig shadow. + * + * RESET_COMPLETE and RESET_ERROR are written into vconfig by vfio_cxl_reset() + * after a protocol reset; pci_dev_restore() clears them from hardware, so they + * must survive in the shadow for a polling guest to see the reset outcome. + * + * All other STATUS2 bits are live hardware outputs and must come from hardware. + * In particular, CACHE_INVALID (bit 0) is polled by guests during a standalone + * write-back invalidation. + * + * @abs_pos: absolute PCI config space byte offset of the STATUS2 register. 
+ */ +static u16 status2_hw_shadow_merge(struct vfio_pci_core_device *vdev, int abs_pos) +{ + const u16 shadow_mask = CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + u16 hw = 0, virt; + + pci_read_config_word(vdev->pdev, abs_pos, &hw); + virt = get_unaligned_le16(vdev->vconfig + abs_pos); + return (hw & ~shadow_mask) | (virt & shadow_mask); +} + +/** + * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to read + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Output buffer for the read value (little-endian) + * + * Called via vfio_pci_dvsec_dispatch_read() for CXL devices. Returns shadow + * vconfig values for the fully virtualized DVSEC registers (CONTROL, STATUS, + * CONTROL2, LOCK) so that userspace reads reflect emulated state rather than + * raw hardware. STATUS2 is read live from hardware, except for the + * CXL_RESET_COMPLETE and CXL_RESET_ERROR bits, which come from the shadow. + * + * Every other access falls back to vfio_direct_config_read(): reads outside + * this device's CXL DVSEC, reads of DVSEC registers not listed above, and + * all reads when vdev->cxl is NULL. + * + * A 4-byte (DWORD) access at the CONTROL2 offset spans both CONTROL2 and + * STATUS2 since CONTROL2 is DWORD-aligned and the two registers are adjacent. + * In that case STATUS2 is returned via the hardware-merge path. + * + * Return: @count on success, or negative error code from the fallback read. 
+ */ +static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec, dvsec_off, reg_start, byte_in_reg; + + if (!cxl) + return vfio_direct_config_read(vdev, pos, count, perm, offset, + val); + + dvsec = _cxlds_get_dvsec(cxl); + if ((u16)pos < dvsec || (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_direct_config_read(vdev, pos, count, perm, offset, + val); + + dvsec_off = (u16)pos - dvsec; + + /* + * Route by the 2-byte-aligned start of the register so that a guest + * read at the high byte (dvsec_off | 1) hits the shadow path instead + * of falling through to the direct read and diverging from a prior + * shadow write. + */ + reg_start = dvsec_off & ~1u; + byte_in_reg = dvsec_off - reg_start; + + switch (reg_start) { + case CXL_DVSEC_CONTROL_OFFSET: + case CXL_DVSEC_STATUS_OFFSET: + case CXL_DVSEC_LOCK_OFFSET: + /* Fully virtualised; return shadow. Byte/word reads work too. */ + memcpy(val, vdev->vconfig + pos, count); + return count; + case CXL_DVSEC_CONTROL2_OFFSET: + if (count == 4 && byte_in_reg == 0) { + /* + * 4-byte access at the DWORD-aligned CONTROL2 offset + * spans both CONTROL2 (low 16 bits) and STATUS2 (high + * 16 bits). Return CONTROL2 from vconfig and STATUS2 + * via the hardware-merge path so CACHE_INVALID is fresh. 
+ */ + __le32 combined = cpu_to_le32( + (u32)get_unaligned_le16(vdev->vconfig + pos) | + ((u32)status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET) << 16)); + memcpy(val, &combined, 4); + } else { + memcpy(val, vdev->vconfig + pos, count); + } + return count; + case CXL_DVSEC_STATUS2_OFFSET: { + __le16 merged = cpu_to_le16(status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET)); + memcpy(val, ((u8 *)&merged) + byte_in_reg, count); + return count; + } + default: + return vfio_direct_config_read(vdev, pos, count, + perm, offset, val); + } +} + +/** + * vfio_cxl_dvsec_writefn - Per-device DVSEC write handler for CXL capable devices. + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to write + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Value to write (little-endian) + * + * Installed as vdev->dvsec_writefn by vfio_cxl_setup_dvsec_perms(). Writes + * that land inside this device's CXL DVSEC are applied to the vconfig shadow + * through the per-register handlers. The vdev->cxl NULL check and the DVSEC + * range check send every other write (non-CXL devices, or a DVSEC that is + * not the CXL one) to vfio_raw_config_write(). + * + * Return: @count on success; fallback writes return whatever + * vfio_raw_config_write() returns, i.e. @count or a negative error. 
+ */ +static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec, abs_off, dvsec_off, reg_start, byte_in_reg; + u16 dword_start, byte_in_dword; + u16 wval16; + u32 wval32; + + if (!cxl) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + dvsec = _cxlds_get_dvsec(cxl); + abs_off = (u16)pos; + if (abs_off < dvsec || abs_off >= dvsec + cxl->dvsec_len) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + pci_dbg(vdev->pdev, + "vfio_cxl: DVSEC write: abs=0x%04x dvsec_off=0x%04x count=%d raw_val=0x%08x\n", + abs_off, abs_off - dvsec, count, le32_to_cpu(val)); + + dvsec_off = abs_off - dvsec; + + /* + * The 2-byte virtualised registers (CONTROL, STATUS, CONTROL2, + * STATUS2, LOCK) all live at 2-byte-aligned offsets. Compute the + * register-aligned offset so writes at the high byte still hit the + * right handler, and merge partial-byte writes against the shadow so + * the high byte of the matched register is not zeroed. + */ + reg_start = dvsec_off & ~1u; + byte_in_reg = dvsec_off - reg_start; + + dword_start = dvsec_off & ~3u; + byte_in_dword = dvsec_off - dword_start; + + switch (dword_start) { + case CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET: + case CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET: + wval32 = dvsec_virt_merge_write32(vdev, dword_start, byte_in_dword, count, val); + dvsec_virt_write32(vdev, dword_start, wval32); + return count; + case CXL_DVSEC_RANGE1_BASE_LOW_OFFSET: + case CXL_DVSEC_RANGE2_BASE_LOW_OFFSET: + wval32 = dvsec_virt_merge_write32(vdev, dword_start, byte_in_dword, count, val); + cxl_range_base_lo_write(vdev, dword_start, wval32); + return count; + } + + if (count == 1) { + u16 cur = dvsec_virt_read16(vdev, reg_start); + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? 
(cur & 0x00ff) | ((u16)byte << 8) + : (cur & 0xff00) | byte; + } else { + wval16 = (u16)le32_to_cpu(val); + } + + /* Route to the appropriate per-register handler */ + switch (reg_start) { + case CXL_DVSEC_CONTROL_OFFSET: + cxl_dvsec_control_write(vdev, wval16); + if (count == 4 && byte_in_reg == 0) { + /* + * High half of a 32-bit write at CONTROL is STATUS; + * forward so RW1C VIRAL_STATUS is not silently dropped. + */ + cxl_dvsec_status_write(vdev, + (u16)(le32_to_cpu(val) >> 16)); + } + break; + case CXL_DVSEC_STATUS_OFFSET: + /* + * STATUS is RO/W1C. A one-byte write must only act on bits in + * the byte the guest wrote: re-derive the value without merging + * the other byte from shadow, otherwise W1C bits set in shadow + * (e.g. VIRAL_STATUS) would be passed as fresh 1-writes and + * unintentionally cleared. + */ + if (count == 1) { + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? ((u16)byte << 8) : byte; + } + cxl_dvsec_status_write(vdev, wval16); + break; + case CXL_DVSEC_CONTROL2_OFFSET: + cxl_dvsec_control2_write(vdev, wval16); + if (count == 4 && byte_in_reg == 0) { + /* + * High half of a 32-bit write at CONTROL2 is STATUS2; + * forward so RW1CS VOLATILE_HDM_PRES_ERROR is not + * silently dropped. + */ + cxl_dvsec_status2_write(vdev, + (u16)(le32_to_cpu(val) >> 16)); + } + break; + case CXL_DVSEC_STATUS2_OFFSET: + /* + * STATUS2 is RO/RW1CS. Same rule as STATUS: a one-byte write + * must not let W1CS bits set in shadow leak in as fresh + * 1-writes via the merge. + */ + if (count == 1) { + u8 byte = (u8)le32_to_cpu(val); + + wval16 = byte_in_reg ? ((u16)byte << 8) : byte; + } + cxl_dvsec_status2_write(vdev, wval16); + break; + case CXL_DVSEC_LOCK_OFFSET: + cxl_dvsec_lock_write(vdev, wval16); + break; + default: + /* RO registers: header, capability, range sizes - discard */ + break; + } + + return count; +} + +/** + * vfio_cxl_setup_dvsec_perms - Install per-device CXL DVSEC read/write hooks. 
+ * @vdev: VFIO PCI core device + * + * Called once per device open after vfio_config_init() has seeded vdev->vconfig + * from hardware. Installs vfio_cxl_dvsec_readfn and vfio_cxl_dvsec_writefn + * as per-device DVSEC handlers so that the global ecap_perms[DVSEC] dispatcher + * routes reads and writes through CXL-aware emulation. + * + * Forces CXL.io IO_ENABLE in the CONTROL vconfig shadow at init time so the + * initial guest read returns the correct value before the first write. + */ +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) +{ + u16 ctrl = dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL_OFFSET); + + vdev->dvsec_readfn = vfio_cxl_dvsec_readfn; + vdev->dvsec_writefn = vfio_cxl_dvsec_writefn; + + /* Force IO_ENABLE; cxl_dvsec_control_write() maintains this invariant. */ + ctrl |= CXL_DVSEC_CTRL_IO_ENABLE; + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, ctrl); +} +EXPORT_SYMBOL_GPL(vfio_cxl_setup_dvsec_perms); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c new file mode 100644 index 0000000000000000000000000000000000000000..2d6b804d8537bc1f83d95189cc8ed269873d0fdd --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -0,0 +1,1146 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO CXL Core - CXL.mem passthrough for vendor-specific CXL devices + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * This module extends vfio-pci-core to pass through CXL.mem regions for + * vendor-specific CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or + * HDM-DB decoders but do not report the standard CXL memory expander class + * code (PCI_CLASS_MEMORY_CXL, 0x0502). This covers both CXL Type-2 + * accelerators (with CXL.cache) and non-class-code Type-3 variants (e.g. + * compressed memory devices) which cannot be paravirtualized by the host + * CXL subsystem and require direct DPA region access from the guest. 
+ */ + +#include +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl->comp_reg_bar; +} + +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_pci_cxl_state *cxl = vdev->cxl; + resource_size_t bar_len, comp_end; + u32 nr_areas, cap_size; + int ret; + + if (!cxl) + return -ENOTTY; + + if (!info) + return -ENOTTY; + + if (info->argsz < minsz) + return -EINVAL; + + if (info->index != cxl->comp_reg_bar) + return -ENOTTY; + + /* + * The device state is not fully initialised; + * fall through to the default BAR handler. + */ + if (!cxl->comp_reg_size) + return -ENOTTY; + + bar_len = pci_resource_len(vdev->pdev, info->index); + comp_end = cxl->comp_reg_offset + cxl->comp_reg_size; + + /* + * A component block past the end of the BAR would walk subsequent + * readl()s off the ioremap window. Reject that up front. + */ + if (comp_end > bar_len) + return -EINVAL; + + /* + * If the component block covers the entire BAR there is nothing to + * mmap; return the BAR with read/write access only and let userspace + * use the COMP_REGS device region for register access. + */ + if (cxl->comp_reg_size == bar_len) { + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return 0; + } + + /* + * Preserve the existing vfio-pci bar_mmap_supported gate. When the + * BAR is non-mappable for any reason (non-page-aligned resource, the + * non_mappable_bars policy, etc.), advertising a sparse-mmap cap and + * VFIO_REGION_INFO_FLAG_MMAP would let userspace try to mmap and get + * a stale -EINVAL from the mmap path. 
Return the bare BAR descriptor + * instead and let userspace fall back to fd read/write. + */ + if (!vdev->bar_mmap_supported[info->index]) { + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return 0; + } + + /* + * Advertise the GPU/accelerator register windows as mmappable by + * carving the CXL component register block out of the BAR. The + * number of sparse areas depends on where the block sits: + * + * [A] comp block at BAR end [gpu_regs | comp_regs]: + * comp_reg_offset > 0 && comp_end == bar_len + * = 1 area: [0, comp_reg_offset) + * + * [B] comp block at BAR start [comp_regs | gpu_regs]: + * comp_reg_offset == 0 && comp_end < bar_len + * = 1 area: [comp_end, bar_len) + * + * [C] comp block in middle [gpu_regs | comp_regs | gpu_regs]: + * comp_reg_offset > 0 && comp_end < bar_len + * = 2 areas: [0, comp_reg_offset) and [comp_end, bar_len) + */ + if (cxl->comp_reg_offset > 0 && comp_end < bar_len) + nr_areas = 2; + else + nr_areas = 1; + + cap_size = struct_size(sparse, areas, nr_areas); + sparse = kzalloc(cap_size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + + if (nr_areas == 2) { + /* [C]: window before and after comp block */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + sparse->areas[1].offset = comp_end; + sparse->areas[1].size = bar_len - comp_end; + } else if (cxl->comp_reg_offset == 0) { + /* [B]: comp block at BAR start, window follows */ + sparse->areas[0].offset = comp_end; + sparse->areas[0].size = bar_len - comp_end; + } else { + /* [A]: comp block at BAR end, window precedes */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + } + + ret = vfio_info_add_capability(caps, &sparse->header, cap_size); + kfree(sparse); + if (ret) + return ret; + + 
info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return 0; +} + +/* + * vfio_cxl_mmap_overlaps_comp_regs - test a request against the CXL component + * register block. + * @vdev: VFIO PCI core device + * @req_start: first byte of the request; compared directly against + * comp_reg_offset, so it must be in the same BAR-relative units + * @req_len: length of the request in bytes + * + * Return: true when [req_start, req_start + req_len) intersects + * [comp_reg_offset, comp_reg_offset + comp_reg_size); false otherwise, + * including when the device has no CXL state or no component register block + * has been recorded. + */ +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + /* Same NULL guard as the other vdev->cxl consumers in this file. */ + if (!cxl || !cxl->comp_reg_size) + return false; + + /* Request starts at or beyond the end of the block. */ + if (req_start >= cxl->comp_reg_offset + cxl->comp_reg_size) + return false; + + /* Request starts inside the block. */ + if (req_start > cxl->comp_reg_offset) + return true; + + /* + * Request starts at or below the block. Compare by subtraction so a + * huge req_len cannot wrap req_start + req_len and hide the overlap. + */ + return req_len > cxl->comp_reg_offset - req_start; +} + +/* + * vfio_cxl_get_info - add the VFIO_DEVICE_INFO_CAP_CXL capability to @caps. + * + * Returns 0 without adding anything when the device has no CXL state, and + * -ENODEV when either CXL region index is still negative. + */ +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct vfio_device_info_cap_cxl cxl_cap = {0}; + + if (!cxl) + return 0; + + /* + * Device is not fully initialised? + */ + if (WARN_ON(cxl->dpa_region_idx < 0 || cxl->comp_reg_region_idx < 0)) + return -ENODEV; + + /* Fill in from CXL device structure */ + cxl_cap.header.id = VFIO_DEVICE_INFO_CAP_CXL; + cxl_cap.header.version = 1; + /* + * COMP_REGS region starts at comp_reg_offset + CXL_CM_OFFSET within + * the BAR. This is the byte offset of the CXL.mem register area (where + * the CXL Capability Array Header lives) within the component register + * block. Userspace derives hdm_decoder_offset and hdm_count from the + * COMP_REGS region itself (CXL Capability Array traversal + HDMC read). + */ + cxl_cap.hdm_regs_offset = cxl->comp_reg_offset + CXL_CM_OFFSET; + cxl_cap.hdm_regs_bar_index = cxl->comp_reg_bar; + + if (cxl->precommitted) + cxl_cap.flags |= VFIO_CXL_CAP_FIRMWARE_COMMITTED; + if (cxl->cache_capable) + cxl_cap.flags |= VFIO_CXL_CAP_CACHE_CAPABLE; + + /* + * Populate absolute VFIO region indices so userspace can query them + * directly with VFIO_DEVICE_GET_REGION_INFO. 
+ */ + cxl_cap.dpa_region_index = VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx; + cxl_cap.comp_regs_region_index = + VFIO_PCI_NUM_REGIONS + cxl->comp_reg_region_idx; + + return vfio_info_add_capability(caps, &cxl_cap.header, sizeof(cxl_cap)); +} + +/* + * Scope-based cleanup wrappers for the CXL resource APIs + */ +DEFINE_FREE(cxl_put_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_put_root_decoder(_T)) +DEFINE_FREE(cxl_dpa_free, struct cxl_endpoint_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_dpa_free(_T)) +DEFINE_FREE(cxl_unregister_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) cxl_unregister_region(_T)) + +/* + * vfio_cxl_create_device_state - Allocate and validate CXL device state + * + * Returns a pointer to the allocated vfio_pci_cxl_state on success, or + * ERR_PTR on failure. The allocation uses devm; the caller must call + * devm_kfree(&pdev->dev, cxl) on any subsequent setup failure to release + * the resource before device unbind. Using devm_kfree() to undo a devm + * allocation early is explicitly supported by the devres API. + * + * The caller assigns vdev->cxl only after all setup steps succeed, preventing + * partially-initialised state from being visible through vdev->cxl on any + * failure path. + */ +static struct vfio_pci_cxl_state * +vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) +{ + struct vfio_pci_cxl_state *cxl; + u16 cap_word; + u32 hdr1; + + /* + * Freed automatically when pdev->dev is released. Use the PCI Device + * Serial Number capability for cxlds->serial; pdev->dev.id is the + * generic-device sibling counter (typically 0) and surfaces as a bogus + * serial in sysfs and CXL tracepoints. 
+ */ + cxl = devm_cxl_dev_state_create(&pdev->dev, + CXL_DEVTYPE_DEVMEM, + pci_get_dsn(pdev), dvsec, + struct vfio_pci_cxl_state, + cxlds, false); + if (!cxl) + return ERR_PTR(-ENOMEM); + + pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1); + cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1); + + pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET, + &cap_word); + + /* + * Only handle vendor devices (class != 0x0502) with Mem_Capable set. + * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI + * sequence is needed before FLR. + */ + if (!FIELD_GET(CXL_DVSEC_CAP_MEM_CAPABLE, cap_word) || + (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) { + devm_kfree(&pdev->dev, cxl); + return ERR_PTR(-ENODEV); + } + + cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word); + cxl->dpa_region_idx = -1; + cxl->comp_reg_region_idx = -1; + + return cxl; +} + +static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct cxl_register_map *map = &cxl->cxlds.reg_map; + resource_size_t offset, bar_offset, size; + struct pci_dev *pdev = vdev->pdev; + void __iomem *base; + int ret; + u8 count; + u8 bar; + + if (WARN_ON_ONCE(!pci_is_enabled(pdev))) + return -EINVAL; + + /* Find component register block via Register Locator DVSEC */ + ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map); + if (ret) + return ret; + + /* + * Request the region and map. This is a transient mapping + * used only to probe register capabilities; released immediately + * after cxl_probe_component_regs() returns. 
+ */ + if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe")) + return -EBUSY; + + base = ioremap(map->resource, map->max_size); + if (!base) { + ret = -ENOMEM; + goto failed_release; + } + + /* Probe component register capabilities */ + cxl_probe_component_regs(&pdev->dev, base, &map->component_map); + + /* Check if HDM decoder was found */ + if (!map->component_map.hdm_decoder.valid) { + ret = -ENODEV; + goto failed_unmap; + } + + pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n", + map->component_map.hdm_decoder.offset, + map->component_map.hdm_decoder.size); + + /* Get HDM register info */ + ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size); + if (ret) + goto failed_unmap; + + if (!count || !size) { + ret = -ENODEV; + goto failed_unmap; + } + + cxl->hdm_count = count; + /* + * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + + * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before + * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset). + * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem + * register area start, which is where comp_reg_virt[0] is anchored. + * The physical BAR address for hdm_iobase is recovered by adding + * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs(). 
+ */ + cxl->hdm_reg_offset = offset - CXL_CM_OFFSET; + cxl->hdm_reg_size = size; + + ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset); + if (ret) + goto failed_unmap; + + cxl->comp_reg_bar = bar; + cxl->comp_reg_offset = bar_offset; + cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + ret = vfio_cxl_setup_virt_regs(vdev, cxl, base, map->max_size); + iounmap(base); + release_mem_region(map->resource, map->max_size); + if (ret) + return ret; + + return 0; + +failed_unmap: + iounmap(base); +failed_release: + release_mem_region(map->resource, map->max_size); + + return ret; +} + +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size) +{ + resource_size_t max_size; + + struct cxl_root_decoder *cxlrd __free(cxl_put_root_decoder) = + cxl_get_hpa_freespace(cxl->cxlmd, 1, + CXL_DECODER_F_RAM | CXL_DECODER_F_TYPE2, + &max_size); + if (IS_ERR(cxlrd)) + return PTR_ERR(cxlrd); + + /* Insufficient HPA space; cxlrd freed automatically by __free() */ + if (max_size < size) + return -ENOSPC; + + struct cxl_endpoint_decoder *cxled __free(cxl_dpa_free) = + cxl_request_dpa(cxl->cxlmd, CXL_PARTMODE_RAM, size); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + + struct cxl_region *region __free(cxl_unregister_region) = + cxl_create_region(cxlrd, &cxled, 1); + if (IS_ERR(region)) + return PTR_ERR(region); + + /* All operations succeeded; transfer ownership to cxl state */ + cxl->cxlrd = no_free_ptr(cxlrd); + cxl->cxled = no_free_ptr(cxled); + cxl->region = no_free_ptr(region); + + return 0; +} + +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl) +{ + if (!cxl->region) + return; + + /* + * Precommitted regions are obtained via cxl_get_committed_decoder() as + * a borrowed reference owned by the cxl core; do not unregister or + * free the decoder objects from here. Only vfio_cxl_create_cxl_region() + * owns the region and decoders. 
+ */ + if (!cxl->precommitted) { + cxl_unregister_region(cxl->region); + cxl_dpa_free(cxl->cxled); + cxl_put_root_decoder(cxl->cxlrd); + } + + cxl->region = NULL; + cxl->cxled = NULL; + cxl->cxlrd = NULL; +} + +/* + * vfio_cxl_create_region_helper - attach a CXL region to the device state. + * + * For a precommitted device the endpoint decoder and region are looked up + * with cxl_get_committed_decoder(); otherwise vfio_cxl_create_cxl_region() + * is asked for a region of @capacity bytes. On success region_hpa and + * region_size are filled in from the region's range. + * + * Return: 0 on success, negative error code on failure. + */ +static int vfio_cxl_create_region_helper(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + struct pci_dev *pdev = vdev->pdev; + struct range range; + int ret; + + if (cxl->precommitted) { + struct cxl_endpoint_decoder *cxled; + /* + * cxl_get_committed_decoder() does not write *region on every + * failure path (e.g. when cxlmd->endpoint is NULL or no decoder + * is committed). Initialise to NULL so the !cxl->region check + * below catches it regardless of stack-init mode. + */ + struct cxl_region *region = NULL; + + cxled = cxl_get_committed_decoder(cxl->cxlmd, &region); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + cxl->cxled = cxled; + cxl->region = region; + } else { + ret = vfio_cxl_create_cxl_region(cxl, capacity); + if (ret) + return ret; + } + + if (!cxl->region) { + pci_err(pdev, "Failed to create CXL region\n"); + ret = -ENODEV; + goto failed; + } + + ret = cxl_get_region_range(cxl->region, &range); + if (ret) + goto failed; + + cxl->region_hpa = range.start; + cxl->region_size = range_len(&range); + + pci_dbg(pdev, "CXL region: HPA 0x%llx size %lu MB\n", + cxl->region_hpa, cxl->region_size >> 20); + + return 0; + +failed: + vfio_cxl_destroy_cxl_region(cxl); + + return ret; +} + +/* + * vfio_cxl_create_memdev - set the device capacity and register a CXL memdev. + * + * Return: 0 on success with cxl->cxlmd set, negative error code on failure + * with cxl->cxlmd left untouched. + */ +static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + struct cxl_memdev *cxlmd; + int ret; + + ret = cxl_set_capacity(&cxl->cxlds, capacity); + if (ret) + return ret; + + /* + * Publish the memdev only on success so cxl->cxlmd never holds an + * ERR_PTR that a later NULL test would mistake for a valid object. + */ + cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxlmd)) + return PTR_ERR(cxlmd); + + cxl->cxlmd = cxlmd; + + return 0; +} + +/* + * Free CXL state early on probe failure. devm_kfree() on a live devres + * allocation removes it from the list immediately, so the normal devres + * teardown at unbind time won't double-free it. 
+ */ +static void vfio_cxl_dev_state_free(struct pci_dev *pdev, + struct vfio_pci_cxl_state *cxl) +{ + devm_kfree(&pdev->dev, cxl); +} + +/** + * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific + * CXL.mem device + * @vdev: VFIO PCI device + * + * Called from vfio_pci_core_register_device(). Detects CXL DVSEC capability + * and initializes CXL features. On failure vdev->cxl remains NULL and the + * device operates as a standard PCI device. + */ +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_cxl_state *cxl; + resource_size_t capacity = 0; + u16 dvsec; + int ret; + + /* Honor the user opt-out decision */ + if (vdev->disable_cxl) + return; + + if (!pcie_is_cxl(pdev)) + return; + + dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + /* + * CXL DVSEC found: any failure from here is a hard probe error on + * a confirmed CXL-capable device, not a silent non-CXL fallback. + * Warn the operator so misconfiguration is visible. + */ + cxl = vfio_cxl_create_device_state(pdev, dvsec); + if (IS_ERR(cxl)) { + if (PTR_ERR(cxl) != -ENODEV) + pci_warn(pdev, + "vfio-cxl: CXL device state allocation failed: %ld\n", + PTR_ERR(cxl)); + return; + } + + /* + * Required for ioremap of the component register block and + * calls to cxl_probe_component_regs(). 
+ */ + ret = pci_enable_device_mem(pdev); + if (ret) { + pci_warn(pdev, + "vfio-cxl: pci_enable_device_mem failed: %d\n", ret); + goto free_cxl; + } + + ret = vfio_cxl_setup_regs(vdev, cxl); + if (ret) { + pci_warn(pdev, + "vfio-cxl: HDM register probing failed: %d\n", ret); + pci_disable_device(pdev); + goto free_cxl; + } + + cxl->cxlds.media_ready = !cxl_await_range_active(&cxl->cxlds); + if (!cxl->cxlds.media_ready) { + pci_warn(pdev, "CXL media not ready\n"); + pci_disable_device(pdev); + goto regs_failed; + } + + /* + * Take the single authoritative HDM decoder snapshot now that + * MEM_ACTIVE is confirmed and BAR memory is still enabled. Using + * readl() per-dword ensures correct MMIO serialisation and captures + * the final firmware-written values for all fields including SIZE_HIGH, + * which firmware commits to the BAR at MEM_ACTIVE time. + */ + vfio_cxl_reinit_comp_regs(cxl); + + pci_disable_device(pdev); + + capacity = vfio_cxl_read_committed_decoder_size(vdev, cxl); + if (capacity == 0) { + /* + * TODO: Add handling for devices which do not have + * firmware pre-committed decoders + */ + pci_info(pdev, "Uncommitted region size must be configured via sysfs before bind\n"); + goto regs_failed; + } + + cxl->precommitted = true; + cxl->dpa_size = capacity; + + pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); + + ret = vfio_cxl_create_memdev(cxl, capacity); + if (ret) { + pci_warn(pdev, "Failed to create memdev\n"); + goto regs_failed; + } + + ret = vfio_cxl_create_region_helper(vdev, cxl, capacity); + if (ret) + goto regs_failed; + + /* + * Register probing succeeded. Assign vdev->cxl now so that + * all subsequent helpers can access state via vdev->cxl. + * All failure paths below clear vdev->cxl before calling + * vfio_cxl_dev_state_free(). cxl->vdev is the back-pointer used + * by vm_fault and other helpers that only have the cxl state in hand. 
+ */ + cxl->vdev = vdev; + vdev->cxl = cxl; + + return; + +regs_failed: + vfio_cxl_clean_virt_regs(cxl); + +free_cxl: + vfio_cxl_dev_state_free(pdev, cxl); +} + +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl) + return; + + vfio_cxl_clean_virt_regs(cxl); + vfio_cxl_destroy_cxl_region(cxl); +} + +static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf) +{ + struct vfio_pci_region *region = vmf->vma->vm_private_data; + struct vfio_pci_cxl_state *cxl = region->data; + struct vfio_pci_core_device *vdev = cxl->vdev; + unsigned long pgoff; + unsigned long pfn; + vm_fault_t ret; + + /* + * Hold memory_lock read side across the region_active check and the + * vmf_insert_pfn so the reset path cannot run unmap_mapping_range + * between the two and leave a stale PTE pointing at the pre-reset HPA. + * vfio_cxl_prepare_reset holds the write side while it clears + * region_active and zaps existing PTEs. + */ + down_read(&vdev->memory_lock); + + /* + * Mirror vfio_pci_vmf_insert_pfn(): reject faults while runtime PM is + * engaged or PCI Memory Space / power state would make the underlying + * memory inaccessible. vfio_pci_zap_and_down_write_memory_lock() has + * already unmapped existing PTEs in those paths; this gate stops the + * fault path from faulting them back in. 
+ */ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + if (!cxl->region_active) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + pgoff = vmf->pgoff & + ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (pgoff >= (cxl->region_size >> PAGE_SHIFT)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + pfn = PHYS_PFN(cxl->region_hpa) + pgoff; + ret = vmf_insert_pfn(vmf->vma, vmf->address, pfn); + +out: + up_read(&vdev->memory_lock); + return ret; +} + +static const struct vm_operations_struct vfio_cxl_region_vm_ops = { + .fault = vfio_cxl_region_vm_fault, +}; + +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u64 req_len, pgoff, end; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) && + (vma->vm_flags & VM_READ)) + return -EPERM; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) && + (vma->vm_flags & VM_WRITE)) + return -EPERM; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) + return -EOVERFLOW; + + if (end > cxl->region_size) + return -EINVAL; + + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); + + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); + + vma->vm_ops = &vfio_cxl_region_vm_ops; + vma->vm_private_data = region; + + return 0; +} + +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl && pci_cxl_reset_capable(vdev->pdev); +} + +/* + * vfio_cxl_zap_dpa - Invalidate DPA region PTEs without touching region_active. 
+ * + * Used by paths that revoke user access transiently (runtime PM entry, D3 + * power transitions, Memory Space disable) but do not perform a reset. + * The DPA region offset range is separate from the BAR range zapped by + * vfio_pci_zap_bars(), so existing DPA mmaps and fd I/O would otherwise + * continue to touch CXL.mem while the device is suspended. + * + * The fault handler and fd I/O path additionally check pm_runtime_engaged + * and __vfio_pci_memory_enabled() to refuse re-faulting while the device + * is in the revoked state. + * + * Must be called with vdev->memory_lock held for writing. + */ +void vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev) +{ + struct vfio_device *core_vdev = &vdev->vdev; + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); +} + +/* + * vfio_cxl_prepare_reset - Invalidate all DPA region PTEs. + * + * Must be called with vdev->memory_lock held for writing. Sets + * region_active=false before zapping so any subsequent I/O to the region + * sees the inactive state and returns an error rather than accessing + * stale mappings. + */ +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + vfio_cxl_zap_dpa(vdev); +} + +/* + * vfio_cxl_enable_memory_space - ensure PCI Memory Space is on before BAR reads. + * + * A reset caller may disable Memory Space to quiesce device DMA before + * issuing the reset. If a guest request cleared PCI_COMMAND Memory Space + * before FLR, pci_dev_save_and_disable() captures it disabled and + * pci_dev_restore() restores it that way. 
This can leave Memory Space + * disabled on return. Accessing a BAR with Memory Space disabled produces + * an Unsupported Request completion; on platforms that promote UR to a + * fatal error this fires DPC. + */ +static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) +{ + u16 cmd; + + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); +} + +/* + * vfio_cxl_reinit_hdm_shadow - reinitialise comp_reg_virt, preserving the + * guest-visible BASE registers and CTRL LOCK across reset. + * + * reinit_comp_regs() re-reads hardware into comp_reg_virt[] after FLR. + * pci_dev_restore() re-commits the host-physical BASE values it saved + * before the reset, so reinit_comp_regs() sees those host bases and not + * the guest-physical bases the device manager programmed in shadow. The + * decoder CTRL LOCK bit is also cleared by FLR on hardware and is not + * re-applied by pci_dev_restore(). Snapshot BASE_LOW/BASE_HIGH and the + * LOCK bit from shadow before reinit, then write them back so the + * emulated decoder stays consistent with what the guest configured. + * + * Called with memory_lock write side held (from vfio_cxl_finish_reset). + */ +static void vfio_cxl_reinit_hdm_shadow(struct vfio_pci_cxl_state *cxl) +{ + __le32 *saved_lo = NULL, *saved_hi = NULL, *saved_ctrl = NULL; + u8 n, count = cxl->hdm_count; + + if (cxl->comp_reg_virt && count) { + saved_lo = kcalloc(count, sizeof(*saved_lo), GFP_KERNEL); + saved_hi = kcalloc(count, sizeof(*saved_hi), GFP_KERNEL); + saved_ctrl = kcalloc(count, sizeof(*saved_ctrl), GFP_KERNEL); + if (!saved_lo || !saved_hi || !saved_ctrl) { + /* + * Allocation failure: skip the snapshot and let reinit + * resync from hardware. The guest-visible BASE/LOCK + * state will diverge but the device is otherwise + * functional. This path is unlikely under normal load. 
+ */ + pci_warn(cxl->vdev->pdev, + "vfio_cxl: HDM shadow snapshot allocation failed; resetting without GPA preservation\n"); + kfree(saved_lo); + kfree(saved_hi); + kfree(saved_ctrl); + saved_lo = saved_hi = saved_ctrl = NULL; + } else { + for (n = 0; n < count; n++) { + saved_lo[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)); + saved_hi[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)); + saved_ctrl[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n)); + } + } + } + + vfio_cxl_reinit_comp_regs(cxl); + + if (cxl->comp_reg_virt && saved_lo) { + for (n = 0; n < count; n++) { + u32 ctrl; + + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)) = saved_lo[n]; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)) = saved_hi[n]; + + /* + * Restore the LOCK bit from shadow. Other CTRL bits + * (COMMITTED, error indicators) should reflect the + * post-FLR hardware state that reinit_comp_regs() just + * snapshotted, so leave those alone. + */ + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n))); + ctrl |= le32_to_cpu(saved_ctrl[n]) & + CXL_HDM_DECODER0_CTRL_LOCK; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(n)) = + cpu_to_le32(ctrl); + } + } + + kfree(saved_lo); + kfree(saved_hi); + kfree(saved_ctrl); +} + +/* + * vfio_cxl_finish_reset - Re-enable DPA region after reset. + * + * Must be called with vdev->memory_lock held for writing. Re-reads the + * HDM decoder state from hardware and sets region_active so that + * subsequent I/O to the region is permitted again. + */ +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + + vfio_cxl_enable_memory_space(vdev); + + /* + * Re-initialise the emulated HDM comp_reg_virt[] from hardware, + * preserving the GPA decoder bases set by the device manager. 
+ */ + vfio_cxl_reinit_hdm_shadow(cxl); + + /* + * Only re-enable the DPA mmap if the hardware has actually + * re-committed decoder 0 after FLR. Read the COMMITTED bit from the + * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR + * hardware state, not stale pre-reset state. + * + * If COMMITTED is 0 (slow firmware re-commit path), leave + * region_active=false. Guest faults will return VM_FAULT_SIGBUS + * until the decoder is re-committed and the region is re-enabled. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + /* + * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset + * (now CXL.mem-relative) plus the within-HDM-block offset. + */ + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } +} + +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_cxl_state *cxl = core_dev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret; + + if (!count || pos >= cxl->region_size) + return 0; + + /* + * Hold memory_lock read side across the region_active check and the + * user copy. vfio_cxl_prepare_reset() holds the write side while it + * clears region_active and unmaps the inode range; without the read + * side here, the copy could still touch cxl->region_vaddr after the + * reset has begun. Guard against access after a failed reset + * (region_active=false) or a release race (region_vaddr=NULL): either + * means the memremap'd window is no longer valid; touching it would + * produce a Synchronous External Abort. 
+ */ + down_read(&core_dev->memory_lock); + + /* + * Mirror the BAR-rw / fault gates: refuse fd I/O while the device is + * runtime suspended or has Memory Space / power state that makes the + * memremap'd window inaccessible. + */ + if (core_dev->pm_runtime_engaged || + !__vfio_pci_memory_enabled(core_dev)) { + ret = -EIO; + goto out; + } + + if (!cxl->region_active || !cxl->region_vaddr) { + ret = -EIO; + goto out; + } + + count = min(count, (size_t)(cxl->region_size - pos)); + + if (iswrite) { + if (copy_from_user(cxl->region_vaddr + pos, buf, count)) { + ret = -EFAULT; + goto out; + } + } else { + if (copy_to_user(buf, cxl->region_vaddr + pos, count)) { + ret = -EFAULT; + goto out; + } + } + + /* + * vfio_pci_rw() returns the region rw result verbatim and relies on + * the handler to advance *ppos. Without this, successive read/write + * syscalls on the DPA region keep operating at the same offset + * instead of advancing. + */ + *ppos += count; + ret = count; + +out: + up_read(&core_dev->memory_lock); + return ret; +} + +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_device *core_vdev = &vdev->vdev; + struct vfio_pci_cxl_state *cxl = region->data; + + /* + * Deactivate the region before removing user mappings so that any + * fault handler racing the release returns VM_FAULT_SIGBUS rather + * than inserting a PFN into an unmapped region. + */ + WRITE_ONCE(cxl->region_active, false); + + /* + * Remove all user mappings of the DPA region while the device is + * still alive. 
+ */ + if (cxl->dpa_region_idx >= 0) + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); + + if (cxl->region_vaddr) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + } +} + +static const struct vfio_pci_regops vfio_cxl_regops = { + .rw = vfio_cxl_region_rw, + .mmap = vfio_cxl_region_mmap, + .release = vfio_cxl_region_release, +}; + +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags; + int ret; + + if (!cxl) + return -ENODEV; + + if (!cxl->region || cxl->region_vaddr) + return -ENODEV; + + /* + * CXL device memory is RAM, not MMIO. Use memremap() rather than + * ioremap_cache() so the correct memory-mapping API is used. + * The WB attribute matches the cache-coherent nature of CXL.mem. + */ + cxl->region_vaddr = memremap(cxl->region_hpa, cxl->region_size, + MEMREMAP_WB); + if (!cxl->region_vaddr) + return -ENOMEM; + + /* + * BOS/backport policy: do not advertise DPA mmap until the CXL DPA + * backing is proven safe for userspace CPU mappings. Keep fd + * read/write available via the memremap() kernel mapping. + */ + flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL, + &vfio_cxl_regops, + cxl->region_size, flags, + cxl); + if (ret) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + return ret; + } + + /* + * Cache the vdev->region[] index before activating the region. + * vfio_pci_core_register_dev_region() placed the new entry at + * vdev->region[num_regions - 1] and incremented num_regions. + * vfio_cxl_prepare_reset() uses this to avoid scanning + * vdev->region[] on every FLR. 
+ */ + cxl->dpa_region_idx = vdev->num_regions - 1; + + vfio_cxl_reinit_comp_regs(cxl); + + /* + * Only activate the DPA region when the HDM decoder is currently + * committed. vfio_pci_core_enable() runs pci_try_reset_function() + * before regions are registered; that FLR clears the decoder + * COMMITTED bit and firmware may not have re-committed it yet. + * Mirror vfio_cxl_finish_reset(): if COMMITTED is not set here, the + * region stays inactive and guest DPA access returns + * VM_FAULT_SIGBUS / -EIO until a subsequent reset re-runs + * finish_reset with the decoder committed. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region); + +/** + * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region() + * @vdev: VFIO PCI device + * + * Marks the DPA region inactive and resets dpa_region_idx. + * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd). + * The caller must call vfio_cxl_destroy_cxl_region() separately to release + * those objects. + */ +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + + cxl->dpa_region_idx = -1; +} +EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region); + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c new file mode 100644 index 0000000000000000000000000000000000000000..bdd363a819a7ce7ba7f018a4cfa879ae185c0b95 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/* + * comp_reg_virt[] shadow layout: + * Covers the full CXL.mem register area (starting at CXL_CM_OFFSET + * within the component register block). Index 0 is the CXL Capability + * Array Header; the HDM decoder block starts at index + * hdm_reg_offset / sizeof(__le32). + * + * Register layout within the HDM block (CXL spec 4.0 8.2.4.20 CXL HDM Decoder + * Capability Structure): + * 0x00: HDM Decoder Capability + * 0x04: HDM Decoder Global Control + * 0x08: (reserved) + * 0x0c: (reserved) + * For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20: + * +0x00: BASE_LO + * +0x04: BASE_HI + * +0x08: SIZE_LO + * +0x0c: SIZE_HI + * +0x10: CTRL + * +0x14: TARGET_LIST_LO + * +0x18: TARGET_LIST_HI + * +0x1c: (reserved) + */ + +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +{ + /* + * hdm_off is a byte offset within the HDM decoder block. + * comp_reg_virt covers the CXL.mem register area starting at + * CXL_CM_OFFSET within the component register block. + * hdm_reg_offset is CXL.mem-relative, so adding hdm_reg_offset + * gives the correct index into comp_reg_virt[]. + */ + return &cxl->comp_reg_virt[(cxl->hdm_reg_offset + hdm_off) / + sizeof(__le32)]; +} + +static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + /* Discard writes on reserved registers. */ + return size; +} + +static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 new_val = le32_to_cpu(*val32); + u32 dec_idx, ctrl_off, ctrl; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* + * Honor the CTRL LOCK bit the same way BASE_HI/SIZE_HI do: once the + * guest sets LOCK, BASE_LO/SIZE_LO must remain frozen in shadow. 
+ */ + dec_idx = ((u32)offset - CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) / + CXL_HDM_DECODER_BLOCK_STRIDE; + ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + dec_idx * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + ctrl = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, ctrl_off)); + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* Bits [27:0] are reserved. */ + new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 size) +{ + u32 hdm_gcap; + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bit [31:2] are reserved. */ + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK; + + /* Poison On Decode Error Enable (bit 0) is RO=0 if not supported. */ + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + if (!(hdm_gcap & CXL_HDM_DECODER_POISON_ON_DECODE_ERR)) + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT; + + *hdm_reg_ptr(vdev->cxl, CXL_HDM_DECODER_CTRL_OFFSET) = + cpu_to_le32(new_val); + + return size; +} + +/** + * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register. + * @vdev: VFIO PCI core device + * @val32: New register value supplied by userspace (little-endian) + * @offset: Byte offset within the HDM block for this decoder's CTRL register + * @size: Access size in bytes; must equal CXL_REG_SIZE_DWORD + * + * The COMMIT bit (bit 9) is the key: setting it requests the hardware to + * lock the decoder. The emulated COMMITTED bit (bit 10) mirrors COMMIT + * immediately to allow QEMU's notify_change to detect the transition and + * map/unmap the DPA MemoryRegion in the guest address space. 
+ * + * Note: the actual hardware HDM decoder programming (writing the real + * BASE/SIZE with host physical addresses) happens in the QEMU notify_change + * callback BEFORE this write reaches the hardware. This ordering is + * correct because vfio_region_write() calls notify_change() first. + * + * Return: @size on success, %-EINVAL if @size is not %CXL_REG_SIZE_DWORD. + */ +static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 hdm_gcap; + u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK; + u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK; + u32 new_val = le32_to_cpu(*val32); + u32 cur_val; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset)); + if (cur_val & CXL_HDM_DECODER0_CTRL_LOCK) { + if (new_val & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* LOCK_0 only: preserve all other bits, clear LOCK */ + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32( + cur_val & ~CXL_HDM_DECODER0_CTRL_LOCK); + return size; + } + + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO; + rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED; + + if (!(hdm_gcap & CXL_HDM_DECODER_UIO_CAPABLE)) + rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED; + + /* + * BI (bit 13) is RsvdP for devices without CXL.cache. HDM-D decoders + * on a CXL.mem-only device must not see BI set in shadow. + */ + if (!vdev->cxl->cache_capable) + rev_mask |= CXL_HDM_DECODER_CTRL_BI_RESERVED; + + new_val &= ~rev_mask; + cur_val &= ro_mask; + new_val = (new_val & ~ro_mask) | cur_val; + + /* + * Mirror COMMIT to COMMITTED immediately in the emulated state. 
+ */ + if (new_val & CXL_HDM_DECODER0_CTRL_COMMIT) + new_val |= CXL_HDM_DECODER0_CTRL_COMMITTED; + else + new_val &= ~CXL_HDM_DECODER0_CTRL_COMMITTED; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +/* + * Dispatch table for COMP_REGS region writes. Indexed by byte offset within + * the HDM decoder block. Returns the appropriate write handler. + * + * Layout: + * 0x00 HDM Decoder Capability (RO) + * 0x04 HDM Global Control (RW with reserved masking) + * 0x08-0x0f (reserved) (ignored) + * Per decoder N, base = 0x10 + N*0x20: + * base+0x00 BASE_LO (RW, [27:0] reserved) + * base+0x04 BASE_HI (RW) + * base+0x08 SIZE_LO (RW, [27:0] reserved) + * base+0x0c SIZE_HI (RW) + * base+0x10 CTRL (RW, complex rules) + * base+0x14 TARGET_LIST_LO (ignored for Type-2) + * base+0x18 TARGET_LIST_HI (ignored for Type-2) + * base+0x1c (reserved) (ignored) + */ +static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev, + u32 off, const __le32 *val32, u32 size) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 dec_base, dec_off; + + /* HDM Decoder Capability (0x00): RO */ + if (off == CXL_HDM_DECODER_CAP_OFFSET) + return size; + + /* HDM Global Control (0x04) */ + if (off == CXL_HDM_DECODER_CTRL_OFFSET) + return hdm_decoder_global_ctrl_write(vdev, val32, size); + + /* + * Offsets 0x08-0x0f are reserved per CXL 4.0 Table 8-115. + * Per-decoder registers start at 0x10, stride 0x20 + */ + if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) + return size; /* reserved gap */ + + dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET; + /* + * Reject accesses beyond the last implemented HDM decoder. + * Without this check an out-of-bounds offset would silently + * corrupt comp_reg_virt[] memory past the end of the allocation. 
+ */ + if ((off - dec_base) / CXL_HDM_DECODER_BLOCK_STRIDE >= cxl->hdm_count) + return size; + + dec_off = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE; + + switch (dec_off) { + case CXL_HDM_DECODER_N_BASE_LOW_OFFSET: /* BASE_LO */ + case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET: /* SIZE_LO */ + return hdm_decoder_n_lo_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */ + case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */ + { + /* Full 32-bit write, no reserved bits; frozen when COMMIT_LOCK set */ + u32 ctrl_off = off - dec_off + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + *hdm_reg_ptr(cxl, off) = *val32; + return size; + } + case CXL_HDM_DECODER_N_CTRL_OFFSET: /* CTRL */ + return hdm_decoder_n_ctrl_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET: + case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET: + case CXL_HDM_DECODER_N_REV_OFFSET: + return virt_hdm_rev_reg_write(vdev, val32, off, size); + default: + return size; + } +} + +/* + * vfio_cxl_comp_regs_rw - regops rw handler for + * VFIO_REGION_SUBTYPE_CXL_COMP_REGS. + * + * Reads return the emulated HDM state (comp_reg_virt[]). + * Writes go through comp_regs_dispatch_write() for bit-field enforcement. + * Only 4-byte aligned 4-byte accesses are supported (hardware requirement). 
+ */ +static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret = 0; + size_t done = 0; + + if (!count) + return 0; + + /* Clamp to total region size: cap array prefix + HDM block */ + if (pos >= cxl->hdm_reg_offset + cxl->hdm_reg_size) + return -EINVAL; + count = min(count, + (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + + /* + * Serialise against vfio_cxl_reinit_hdm_shadow(), which holds + * memory_lock write-side while it saves, zeroes, and restores + * comp_reg_virt[] during reset. Without this read lock a concurrent + * COMP_REGS write can land between the save snapshot and the restore, + * causing the restore to silently overwrite it. A concurrent read + * can observe the array mid-rebuild. + */ + down_read(&vdev->memory_lock); + + while (done < count) { + u32 sz = count - done; + u32 off = pos + done; + __le32 v; + + /* Enforce exactly 4-byte, 4-byte-aligned accesses */ + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) { + ret = done ? (ssize_t)done : -EINVAL; + goto out_unlock; + } + + if (iswrite) { + if (off < cxl->hdm_reg_offset) { + /* Cap array area is read-only; discard writes */ + done += sizeof(v); + continue; + } + if (copy_from_user(&v, buf + done, sizeof(v))) { + ret = done ? (ssize_t)done : -EFAULT; + goto out_unlock; + } + comp_regs_dispatch_write(vdev, + off - cxl->hdm_reg_offset, + &v, sizeof(v)); + } else { + /* Read from extended buffer - covers cap array and HDM */ + v = cxl->comp_reg_virt[off / sizeof(__le32)]; + if (copy_to_user(buf + done, &v, sizeof(v))) { + ret = done ? 
(ssize_t)done : -EFAULT; + goto out_unlock; + } + } + done += sizeof(v); + } + + ret = done; + *ppos += done; +out_unlock: + up_read(&vdev->memory_lock); + return ret; +} + +static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + /* comp_reg_virt is freed in vfio_cxl_clean_virt_regs() */ +} + +static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = { + .rw = vfio_cxl_comp_regs_rw, + .release = vfio_cxl_comp_regs_release, +}; + +/* + * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state. + * + * Allocates comp_reg_virt as a compact __le32 array covering only + * hdm_reg_size bytes of HDM decoder registers. The initial values + * are read from hardware via the BAR ioremap established by the caller. + * + * DVSEC state is accessed via vdev->vconfig (see the following patch). + */ +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base, + resource_size_t max_size) +{ + size_t total_size, nregs, i; + + if (WARN_ON(!cxl->hdm_reg_size)) + return -EINVAL; + + total_size = cxl->hdm_reg_offset + cxl->hdm_reg_size; + + /* + * The caller's map covers [comp_reg_offset, comp_reg_offset+max_size) + * inside the BAR; the HDM block ends at CXL_CM_OFFSET + total_size + * relative to that map. Reject HDM blocks that walk past the + * advertised map size; pci_resource_len() would happily allow a stale + * BAR-wide window and the subsequent readl()s would run off the + * ioremap range. + */ + if (CXL_CM_OFFSET + total_size > max_size) + return -ENODEV; + + nregs = total_size / sizeof(__le32); + cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL); + if (!cxl->comp_reg_virt) + return -ENOMEM; + + /* + * Snapshot the CXL.mem register area from the caller's mapping. + * cap_base maps the component register block from comp_reg_offset. 
+ * The CXL.mem registers start at CXL_CM_OFFSET (= 0x1000) within that + * block; reading from cap_base + CXL_CM_OFFSET ensures comp_reg_virt[0] + * holds the CXL Capability Array Header required by guest drivers. + */ + for (i = 0; i < nregs; i++) + cxl->comp_reg_virt[i] = + cpu_to_le32(readl(cap_base + CXL_CM_OFFSET + + i * sizeof(__le32))); + + /* + * Establish persistent mapping; kept alive until + * vfio_cxl_clean_virt_regs(). + */ + cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, + cxl->comp_reg_bar) + + cxl->comp_reg_offset + CXL_CM_OFFSET + + cxl->hdm_reg_offset, + cxl->hdm_reg_size); + if (!cxl->hdm_iobase) { + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; + return -ENOMEM; + } + + return 0; +} + +/* + * vfio_cxl_read_committed_decoder_size - Extract committed DPA capacity from + * comp_reg_virt[]. + * + * Called from probe context after vfio_cxl_reinit_comp_regs() has taken the + * post-MEM_ACTIVE readl() snapshot and patched SIZE_HIGH/SIZE_LOW from DVSEC. + * comp_reg_virt[] is already correct at this point; no hardware access needed. + * + * Returns the committed DPA capacity in bytes, or 0 if the decoder is not + * committed. 
+ */ +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct pci_dev *pdev = vdev->pdev; + resource_size_t capacity; + u32 ctrl, sz_hi, sz_lo; + + if (WARN_ON(!cxl || !cxl->comp_reg_virt)) + return 0; + + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_CTRL_OFFSET(0))); + sz_hi = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(0))); + sz_lo = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_LOW_OFFSET(0))); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) { + pci_dbg(pdev, + "vfio_cxl: decoder0 not committed: ctrl=0x%08x\n", + ctrl); + return 0; + } + + capacity = ((resource_size_t)sz_hi << 32) | (sz_lo & GENMASK(31, 28)); + + pci_dbg(pdev, + "vfio_cxl: decoder0 committed: sz_hi=0x%08x sz_lo=0x%08x capacity=0x%llx\n", + sz_hi, sz_lo, (unsigned long long)capacity); + + return capacity; +} + +/* + * Called with memory_lock write side held (from vfio_cxl_reinit_hdm_shadow). + * Uses the pre-established hdm_iobase, no ioremap() under the lock, + * which would deadlock on PREEMPT_RT where ioremap() can sleep. + */ +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl) +{ + size_t i, nregs; + u32 n; + + if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase) + return; + + nregs = cxl->hdm_reg_size / sizeof(__le32); + + for (i = 0; i < nregs; i++) + *hdm_reg_ptr(cxl, i * sizeof(__le32)) = + cpu_to_le32(readl(cxl->hdm_iobase + + i * sizeof(__le32))); + + /* + * For firmware-committed decoders, clear COMMIT_LOCK (bit 8) and zero + * BASE in comp_reg_virt[] so QEMU can write the correct guest GPA via + * setup_locked_hdm() before guest DPA access begins. + * + * Check the COMMITTED bit (bit 10) directly from the freshly-snapshotted + * ctrl register rather than relying on cxl->precommitted. 
At probe time + * this function is called before cxl->precommitted is set (it is set + * after vfio_cxl_read_committed_decoder_size() succeeds), so using + * cxl->precommitted here would silently skip the LOCK clearing and leave + * the hardware HPA in comp_reg_virt[]. + */ + for (n = 0; n < cxl->hdm_count; n++) { + u32 ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 base_lo_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_LOW_OFFSET; + u32 base_hi_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_HIGH_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) { + *hdm_reg_ptr(cxl, ctrl_off) = + cpu_to_le32(ctrl & + ~CXL_HDM_DECODER0_CTRL_LOCK); + *hdm_reg_ptr(cxl, base_lo_off) = 0; + *hdm_reg_ptr(cxl, base_hi_off) = 0; + } + } +} + +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) +{ + if (cxl->hdm_iobase) { + iounmap(cxl->hdm_iobase); + cxl->hdm_iobase = NULL; + } + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; +} + +/* + * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region. + * + * Exposes the emulated HDM decoder register state as a VFIO device region + * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS. QEMU attaches a + * notify_change callback to this region to intercept HDM COMMIT writes + * and map the DPA MemoryRegion at the appropriate GPA. + * + * The region is read+write only (no mmap) to ensure all accesses pass + * through comp_regs_dispatch_write() for proper bit-field enforcement. 
+ */ +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + int ret; + + if (!cxl || !cxl->comp_reg_virt) + return -ENODEV; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL_COMP_REGS, + &vfio_cxl_comp_regs_ops, + cxl->hdm_reg_offset + + cxl->hdm_reg_size, flags, cxl); + if (!ret) + cxl->comp_reg_region_idx = vdev->num_regions - 1; + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..ac8ea3893c7afc5327da108dab8300fb073591ad --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common infrastructure for CXL Type-2 device variant drivers + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef __LINUX_VFIO_CXL_PRIV_H +#define __LINUX_VFIO_CXL_PRIV_H + +#include +#include +#include + +struct vfio_pci_core_device; + +/* + * CXL device state embedded in vfio_pci_core_device. + * + * cxlds must be the first field: devm_cxl_dev_state_create() asserts + * offsetof(cxlds) == 0 so CXL core's container_of() lookups land back + * on this struct. 
+ */ +struct vfio_pci_cxl_state { + struct cxl_dev_state cxlds; + struct vfio_pci_core_device *vdev; + struct cxl_memdev *cxlmd; + struct cxl_root_decoder *cxlrd; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *region; + resource_size_t region_hpa; + size_t region_size; + void *region_vaddr; + resource_size_t hdm_reg_offset; + size_t hdm_reg_size; + resource_size_t comp_reg_offset; + size_t comp_reg_size; + __le32 *comp_reg_virt; + size_t dpa_size; + void __iomem *hdm_iobase; + int dpa_region_idx; + int comp_reg_region_idx; + u16 dvsec_len; + u8 hdm_count; + u8 comp_reg_bar; + bool cache_capable; + bool precommitted; + bool region_active; +}; + +/* Register access sizes */ +#define CXL_REG_SIZE_WORD 2 +#define CXL_REG_SIZE_DWORD 4 + +/* HDM Decoder - register offsets (CXL 4.0 Table 8-115) */ +#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET 0x10 +#define CXL_HDM_DECODER_BLOCK_STRIDE 0x20 +#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET 0x0 +#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET 0x4 +#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET 0x8 +#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET 0xc +#define CXL_HDM_DECODER_N_CTRL_OFFSET 0x10 +#define CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET 0x14 +#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18 +#define CXL_HDM_DECODER_N_REV_OFFSET 0x1c + +/* + * HDM Decoder N Control emulation masks. + * + * Single-bit hardware definitions are in <uapi/cxl/cxl_regs.h> as + * CXL_HDM_DECODER0_CTRL_* (bits 0-14) and CXL_HDM_DECODER_*_CAP. + * The masks below express emulation policy for a CXL.mem device. + */ +#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK (BIT(10) | BIT(11)) +#define CXL_HDM_DECODER_CTRL_RESERVED_MASK (BIT(15) | GENMASK(31, 28)) +#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO BIT(12) +#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED (GENMASK(19, 16) | GENMASK(23, 20)) +#define CXL_HDM_DECODER_CTRL_UIO_RESERVED (BIT(14) | GENMASK(27, 24)) +/* + * bit 13 (BI) is RsvdP for devices without CXL.cache (Cache_Capable=0). 
+ * HDM-D (CXL.mem only) decoders must not have BI set by the guest. + */ +#define CXL_HDM_DECODER_CTRL_BI_RESERVED BIT(13) +#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK GENMASK(27, 0) + +#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2) +#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0) + +/* + * DVSEC register offsets and per-bit hardware definitions are defined + * elsewhere as CXL_DVSEC_*. The masks below encode + * emulation policy: which bits to ignore, which to preserve separately + * from their raw hardware state. + */ +/* DVSEC Control (0x0C): bits 13 (RsvdP) and 15 (RsvdP) are always discarded */ +#define CXL_CTRL_RESERVED_MASK (BIT(13) | BIT(15)) +/* bit 12 (P2P_Mem_Enable) treated as reserved if Cap3.P2P_Mem_Capable=0 */ +#define CXL_CTRL_P2P_REV_MASK CXL_DVSEC_CTRL_P2P_MEM_ENABLE + +/* DVSEC Status (0x0E): bits 13:0 and 15 are RsvdZ */ +#define CXL_STATUS_RESERVED_MASK (GENMASK(13, 0) | BIT(15)) + +/* + * DVSEC Control2 (0x10) emulation masks. + * + * CXL_CTRL2_HW_BITS_MASK: bits 1 (Initiate_Cache_WBI) and 2 + * (Initiate_CXL_Reset) always read 0 from hardware -- they are write-only + * action triggers per CXL 4.0 section 8.1.3.8 Table 8-8. Forward these to the + * device to trigger the hardware action; clear them from vconfig shadow so + * that subsequent guest reads return 0 as hardware requires. + * + * NOTE: bit 0 (Disable_Caching) and bit 3 (CXL_Reset_Mem_Clr_Enable) are + * ordinary RW fields -- they must be preserved in vconfig, not forwarded. 
+ */ +#define CXL_CTRL2_RESERVED_MASK GENMASK(15, 6) +#define CXL_CTRL2_HW_BITS_MASK (BIT(1) | BIT(2)) +/* bit 4 is RsvdP if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_CTRL2_VOLATILE_HDM_REV_MASK CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM +/* bit 5 is RsvdP if Cap2.Mod_Completion_Capable=0 */ +#define CXL_CTRL2_MODIFIED_COMP_REV_MASK CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE + +/* DVSEC Lock (0x14): bits 15:1 are RsvdP */ +#define CXL_LOCK_RESERVED_MASK GENMASK(15, 1) + +/* DVSEC Range Base Low: bits 27:0 are reserved per Tables 8-15/8-19 */ +#define CXL_BASE_LO_RESERVED_MASK CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK + +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base, + resource_size_t max_size); +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl); +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size); +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); + +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off); + +#endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 0c771064c0b84424d4cc39fbcf162366028553e3..9d9b1116e64177daafeccb311715c5b00096bf66 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -60,6 +60,12 @@ static bool disable_denylist; module_param(disable_denylist, bool, 0444); MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) +static bool disable_cxl; +module_param(disable_cxl, bool, 0444); +MODULE_PARM_DESC(disable_cxl, "Disable CXL extensions on devices probed by the bare vfio-pci driver. Variant drivers do not consult this parameter; they must set vdev->disable_cxl explicitly in their probe path."); +#endif + static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { @@ -120,6 +126,29 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) } } + if (vdev->cxl) { + /* + * pci_config_map and vconfig are valid now (allocated by + * vfio_config_init() inside vfio_pci_core_enable() above). + */ + vfio_cxl_setup_dvsec_perms(vdev); + + ret = vfio_cxl_register_cxl_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to setup CXL region\n"); + vfio_pci_core_disable(vdev); + return ret; + } + + ret = vfio_cxl_register_comp_regs_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to register COMP_REGS region\n"); + vfio_cxl_unregister_cxl_region(vdev); + vfio_pci_core_disable(vdev); + return ret; + } + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -167,6 +196,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_set_drvdata(&pdev->dev, vdev); vdev->pci_ops = &vfio_pci_dev_ops; +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + vdev->disable_cxl = disable_cxl; +#endif ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a10ed733f0e3af89c185f8b93192ba6b004f05ed..03835a3b5083234774321e45cab75fbb91e58d7c 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -245,9 +245,9 @@ static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, } /* Allow direct read from hardware, 
except for capability next pointer */ -static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int ret; @@ -270,9 +270,9 @@ static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -283,9 +283,9 @@ static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int ret; @@ -910,7 +910,9 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); vfio_pci_dma_buf_move(vdev, true); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); @@ -995,7 +997,9 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); vfio_pci_dma_buf_move(vdev, true); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, 
false); up_write(&vdev->memory_lock); @@ -1085,6 +1089,47 @@ static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) return 0; } +/* + * vfio_pci_dvsec_dispatch_read - per-device DVSEC read dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn at module init. + * Calls vdev->dvsec_readfn when a shadow-read handler has been registered + * (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices); otherwise + * fall back to vfio_direct_config_read so non-CXL DVSEC devices keep the + * extended-cap header mangling that the default ecap readfn applies. + */ +static int vfio_pci_dvsec_dispatch_read(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + if (vdev->dvsec_readfn) + return vdev->dvsec_readfn(vdev, pos, count, perm, offset, val); + return vfio_direct_config_read(vdev, pos, count, perm, offset, val); +} + +/* + * vfio_pci_dvsec_dispatch_write - per-device DVSEC write dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn at module init. + * Calls vdev->dvsec_writefn when a handler has been registered for this + * device (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), + * otherwise proceed to vfio_raw_config_write so that non-CXL devices + * with a DVSEC capability continue to pass writes to hardware. + * + * This indirection allows per-device DVSEC handlers to be registered + * without touching the global ecap_perms[] table. 
+ */ +static int vfio_pci_dvsec_dispatch_write(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + if (vdev->dvsec_writefn) + return vdev->dvsec_writefn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_write(vdev, pos, count, perm, offset, val); +} + /* * Initialize the shared permission tables */ @@ -1121,7 +1166,8 @@ int __init vfio_pci_init_perm_bits(void) ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; - ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn = vfio_pci_dvsec_dispatch_read; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_pci_dvsec_dispatch_write; if (ret) vfio_pci_uninit_perm_bits(); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ad52abc46c04df935a779d66bbe91767eab66ed7..2c4f719f45491376c6979b4b5a04addbc805acba 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -591,7 +591,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar; + int i, bar, bars; /* For needs_reset */ lockdep_assert_held(&vdev->vdev.dev_set->lock); @@ -650,8 +650,10 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) bar = i + PCI_STD_RESOURCES; if (!vdev->barmap[bar]) continue; + bars = (vdev->cxl && i == vfio_cxl_get_component_reg_bar(vdev)) ? 
+ 0 : (1 << bar); pci_iounmap(pdev, vdev->barmap[bar]); - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); vdev->barmap[bar] = NULL; } @@ -997,6 +999,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->cxl) { + ret = vfio_cxl_get_info(vdev, &caps); + if (ret) + return ret; + info.flags |= VFIO_DEVICE_FLAGS_CXL; + } + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1042,6 +1051,12 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, struct pci_dev *pdev = vdev->pdev; int i, ret; + if (vdev->cxl) { + ret = vfio_cxl_get_region_info(vdev, info, caps); + if (ret != -ENOTTY) + return ret; + } + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); @@ -1231,6 +1246,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); + /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ + vfio_cxl_prepare_reset(vdev); + /* * This function can be invoked while the power state is non-D0. If * pci_try_reset_function() has been called while the power state is @@ -1246,6 +1264,13 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, ret = pci_try_reset_function(vdev->pdev); if (__vfio_pci_memory_enabled(vdev)) vfio_pci_dma_buf_move(vdev, false); + + /* + * finish_reset checks the COMMITTED bit from hardware + * and only brings the region back if it is actually set. 
+ */ + vfio_cxl_finish_reset(vdev); + up_write(&vdev->memory_lock); return ret; @@ -1627,6 +1652,13 @@ void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { down_write(&vdev->memory_lock); vfio_pci_zap_bars(vdev); + /* + * Zap the CXL DPA region PTEs too: zap_bars only covers the BAR offset + * range, while the DPA region lives in the device-region offset range + * and would otherwise survive a runtime-PM entry or D3 transition. + * No-op on non-CXL devices. + */ + vfio_cxl_zap_dpa(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1760,6 +1792,18 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma if (req_start + req_len > phys_len) return -EINVAL; + /* + * CXL devices: mmap is permitted for the GPU/accelerator register + * windows listed in the sparse-mmap capability. Block any request + * that overlaps the CXL component register block + * [comp_reg_offset, comp_reg_offset + comp_reg_size); those registers + * must be accessed exclusively through the COMP_REGS device region so + * that the emulation layer (notify_change) intercepts every write. + */ + if (vdev->cxl && index == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, req_start, req_len)) + return -EINVAL; + /* * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. @@ -2187,6 +2231,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (ret) goto out_vf; + vfio_pci_cxl_detect_and_init(vdev); + vfio_pci_probe_power_state(vdev); /* @@ -2211,6 +2257,15 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return 0; out_power: + /* + * vfio_pci_cxl_detect_and_init() may have assigned vdev->cxl and + * allocated comp_reg_virt[] / hdm_iobase / region state above. 
The + * normal teardown via vfio_pci_core_unregister_device() will not run + * if registration failed, so release the CXL state here. No-op when + * vdev->cxl is NULL (non-CXL device or detect skipped). + */ + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(dev); @@ -2230,6 +2285,8 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vf_uninit(vdev); vfio_pci_vga_uninit(vdev); + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(&vdev->pdev->dev); @@ -2482,6 +2539,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_undo; } + /* + * All devices in the set are now locked. Commit the CXL prepare + * step in its own pass: it clears region_active and zaps DPA PTEs, + * which must be paired with a finish_reset call for every device it + * touches. Doing this only after all trylocks have succeeded keeps + * a mid-loop failure from leaving earlier devices with + * region_active=false and no matching reset. + */ + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_cxl_prepare_reset(vdev); + /* * The pci_reset_bus() will reset all the devices in the bus. * The power state can be non-D0 for some of the devices in the bus. @@ -2496,6 +2564,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, ret = pci_reset_bus(pdev); + /* + * Mirror vfio_pci_ioctl_reset(): re-read the post-reset HDM state and + * reactivate the DPA region for CXL devices that hardware committed. + * Runs under each device's memory_lock write side acquired earlier and + * pairs with the prepare_reset pass above. 
+ */ + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_cxl_finish_reset(vdev); + vdev = list_last_entry(&dev_set->device_list, struct vfio_pci_core_device, vdev.dev_set_list); @@ -2595,3 +2672,4 @@ module_exit(vfio_pci_core_cleanup); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index fca9d0dfac90f8eaaf2d281dd2213111c46d76a1..bd511ba88b9340719839db4cb726eae08df37e64 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -41,6 +41,18 @@ ssize_t vfio_pci_config_rw_single(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); + +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + +int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -137,4 +149,69 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, } #endif +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev); +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev); +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); +void 
vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev); +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len); + +#else + +static inline void +vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline bool +vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ return false; } +static inline void +vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_zap_dpa(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline void +vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline int +vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline int +vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline u8 +vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ return U8_MAX; } +static inline bool +vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ return false; } + +#endif /* 
CONFIG_VFIO_CXL_CORE */ + #endif diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 4251ee03e1463bf175ca846f7f212d2b3c1398fd..3e0ec0b082ff89e804522133f14779275f8110f5 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -201,19 +201,29 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { struct pci_dev *pdev = vdev->pdev; - int ret; + int ret, bars; void __iomem *io; if (vdev->barmap[bar]) return 0; - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + /* + * The CXL component register BAR cannot be claimed exclusively: the + * CXL subsystem holds persistent sub-range iomem claims during HDM + * decoder setup. pci_request_selected_regions() for the full BAR + * fails with EBUSY. Pass bars=0 to make the request a no-op and map + * directly via pci_iomap(). + */ + bars = (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev)) ? + 0 : (1 << bar); + + ret = pci_request_selected_regions(pdev, bars, "vfio"); if (ret) return ret; io = pci_iomap(pdev, bar, 0); if (!io) { - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); return -ENOMEM; } @@ -248,6 +258,17 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, count = min(count, (size_t)(end - pos)); + /* + * For CXL devices, the component register subrange is emulated through + * the dedicated COMP_REGS region (comp_regs_dispatch_write). Reject fd + * read/write that targets that subrange so userspace cannot bypass the + * emulation by issuing pread()/pwrite() on the BAR fd. This matches + * the mmap path, which rejects overlapping mmap requests. 
+ */ + if (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, pos, count)) + return -EINVAL; + if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -449,6 +470,16 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; + /* + * Disallow ioeventfds that would land inside the CXL component + * register subrange. Without this check, the eventfd handler would + * iowrite directly into the BAR mapping, bypassing the COMP_REGS + * emulation enforced on the mmap and pread/pwrite paths. + */ + if (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, pos, count)) + return -EINVAL; + if (count == 8) return -EINVAL; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index fa7269154620554c847414fb18ea2e850dfe5201..b3287411460dcc1db028c42deac6c7f10783c34a 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -5,9 +5,12 @@ #ifndef __CXL_CXL_H__ #define __CXL_CXL_H__ +#include #include #include +#include #include +#include /** * enum cxl_devtype - delineate type-2 from a generic type-3 device @@ -70,11 +73,27 @@ struct cxl_regs { ); }; +#define CXL_CM_CAP_CAP_ID_RAS 0x2 +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + +/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ +#define CXL_DECODER_MIN_GRANULARITY 256 +#define CXL_DECODER_MAX_ENCODED_IG 6 + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return val ? 
val * 2 : 1; +} + struct cxl_reg_map { bool valid; int id; unsigned long offset; unsigned long size; + u8 count; }; struct cxl_component_reg_map { @@ -99,9 +118,16 @@ struct cxl_pmu_reg_map { * @resource: physical resource base of the register block * @max_size: maximum mapping size to perform register search * @reg_type: see enum cxl_regloc_type + * @bar_index: PCI BAR index (0-5) when regblock is BAR-backed; 0xFF otherwise + * @bar_offset: offset within the BAR; only valid when bar_index <= 5 * @component_map: cxl_reg_map for component registers * @device_map: cxl_reg_maps for device registers * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + * + * When the register block is described by the Register Locator DVSEC with + * a BAR Indicator (BIR 0-5), bar_index and bar_offset are set so callers can + * use pci_iomap(pdev, bar_index, size) and base + bar_offset instead of + * ioremap(resource). */ struct cxl_register_map { struct device *host; @@ -109,6 +135,8 @@ struct cxl_register_map { resource_size_t resource; resource_size_t max_size; u8 reg_type; + u8 bar_index; + resource_size_t bar_offset; union { struct cxl_component_reg_map component_map; struct cxl_device_reg_map device_map; @@ -149,6 +177,36 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +/* + * cxl_decoder flags that define the type of memory / devices this decoder + * supports as well as configuration lock status. + */ +#define CXL_DECODER_F_RAM BIT(0) +#define CXL_DECODER_F_PMEM BIT(1) +#define CXL_DECODER_F_TYPE2 BIT(2) +#define CXL_DECODER_F_TYPE3 BIT(3) +#define CXL_DECODER_F_LOCK BIT(4) +#define CXL_DECODER_F_ENABLE BIT(5) +#define CXL_DECODER_F_MASK GENMASK(5, 0) + +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + +/** + * struct cxl_attach_region - accelerator region handling + * @attach: invoked at cxl_memdev_attach_region() with endpoint device locked. + * @detach: invoked at endpoint release. 
+ * @data: pointer referencing accelerator data for attach and detach calls. + * @region: initialised with autodiscovered region values linked to memdev. + */ +struct cxl_attach_region { + int (*attach)(void *); + void (*detach)(void *); + void *data; + struct range region; +}; + /** * struct cxl_dev_state - The driver device state * @@ -223,4 +281,66 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ sizeof(drv_struct), mbox); \ }) + +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_region; +struct cxl_endpoint_decoder *cxl_get_committed_decoder(struct cxl_memdev *cxlmd, + struct cxl_region **cxlr); +int cxl_get_region_range(struct cxl_region *region, struct range *range); +void cxl_unregister_region(struct cxl_region *cxlr); +struct cxl_port; +struct cxl_root_decoder *cxl_get_hpa_freespace(struct cxl_memdev *cxlmd, + int interleave_ways, + unsigned long flags, + resource_size_t *max); +void cxl_put_root_decoder(struct cxl_root_decoder *cxlrd); +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_memdev *cxlmd, + enum cxl_partition_mode mode, + resource_size_t alloc); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder **cxled, + int ways); +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); + +#ifdef CONFIG_CXL_REGION +bool cxl_region_contains_soft_reserve(struct resource *res); +#else +static inline bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return false; +} +#endif +struct cxl_component_reg_map; +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map); + +int cxl_regblock_get_bar_info(const struct cxl_register_map 
*map, u8 *bar_index, + resource_size_t *bar_offset); + +#ifdef CONFIG_CXL_BUS + +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); + +#else + +static inline +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ return -EOPNOTSUPP; } + +#endif /* CONFIG_CXL_BUS */ + +/* f951acc: split from media-ready wait */ +struct cxl_dev_state; +int cxl_await_range_active(struct cxl_dev_state *cxlds); + +/* a6a063d: exported reset helpers for VFIO */ +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +int cxl_dev_reset_locked(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); + #endif /* __CXL_CXL_H__ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 4f1308244c82dc9506d24055f96d0dc0f9cc90cb..82b86c7bdf6e06796cefaf8578c9e594124aee4b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2049,6 +2049,9 @@ int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) +void pci_dev_save_and_disable(struct pci_dev *dev); +void pci_dev_restore(struct pci_dev *dev); + /* * PCI domain support. 
Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f7bcadb3df0d93b76a94169dbfeba..d9190930d22f17dbda8e936453c51ad450efc143 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -29,6 +29,8 @@ struct vfio_pci_core_device; struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_attachment; +struct vfio_pci_cxl_state; +struct perm_bits; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -127,6 +129,9 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + bool disable_cxl:1; +#endif struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; @@ -137,6 +142,13 @@ struct vfio_pci_core_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct vfio_pci_cxl_state *cxl; + int (*dvsec_readfn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + int (*dvsec_writefn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6507ebf31916002758a366f99be31115a88284 --- /dev/null +++ b/include/uapi/cxl/cxl_regs.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * CXL Standard defines + * + * Hardware register offsets and bit-field masks for the CXL Component + * Register block, as defined by the CXL Specification r4.0. 
+ */ + +#ifndef _UAPI_CXL_REGS_H_ +#define _UAPI_CXL_REGS_H_ + +#include <asm/bitsperlong.h> /* __BITS_PER_LONG; needed by __GENMASK() */ +#include <linux/const.h> /* _BITUL(), _BITULL() */ +#include <linux/bits.h> /* __GENMASK() */ + +/* CXL 4.0 8.2.3 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE 0x00010000 + +/* CXL 4.0 8.2.4 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK __GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK __GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK __GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK __GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK __GENMASK(31, 20) + +/* CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_POISON_ON_DECODE_ERR _BITUL(10) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_UIO_CAPABLE _BITUL(13) +#define CXL_HDM_DECODER_UIO_COUNT_MASK __GENMASK(19, 16) +#define CXL_HDM_DECODER_MEMDATA_NXM _BITUL(20) +#define CXL_HDM_DECODER_COHERENCY_MODELS_MASK __GENMASK(22, 21) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE _BITUL(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK __GENMASK(7, 4) 
+#define CXL_HDM_DECODER0_CTRL_LOCK _BITUL(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT _BITUL(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED _BITUL(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR _BITUL(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY _BITUL(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* + * CXL r4.0 8.1.3: DVSEC for CXL Devices + * + * Register offsets are relative to the DVSEC capability base address, + * as discovered via PCI_EXT_CAP_ID_DVSEC with DVSEC ID 0x0. + * All registers in this section are 16-bit wide. + */ + +/* DVSEC register offsets */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0x0a +#define CXL_DVSEC_CONTROL_OFFSET 0x0c +#define CXL_DVSEC_STATUS_OFFSET 0x0e +#define CXL_DVSEC_CONTROL2_OFFSET 0x10 +#define CXL_DVSEC_STATUS2_OFFSET 0x12 +#define CXL_DVSEC_LOCK_OFFSET 0x14 +#define CXL_DVSEC_CAPABILITY2_OFFSET 0x16 +#define CXL_DVSEC_RANGE1_SIZE_HIGH_OFFSET 0x18 +#define CXL_DVSEC_RANGE1_SIZE_LOW_OFFSET 0x1c +#define CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET 0x20 +#define CXL_DVSEC_RANGE1_BASE_LOW_OFFSET 0x24 +#define CXL_DVSEC_RANGE2_SIZE_HIGH_OFFSET 0x28 +#define CXL_DVSEC_RANGE2_SIZE_LOW_OFFSET 0x2c +#define CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET 0x30 +#define CXL_DVSEC_RANGE2_BASE_LOW_OFFSET 0x34 +#define CXL_DVSEC_CAPABILITY3_OFFSET 0x38 + +/* DVSEC Range Base Low registers: bits [27:0] are reserved */ +#define CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK __GENMASK(27, 0) + +/* CXL r4.0 8.1.3.1 Table 8-5 DVSEC CXL Capability (offset 0x0A) */ +#define CXL_DVSEC_CAP_CACHE_CAPABLE _BITUL(0) +#define CXL_DVSEC_CAP_IO_CAPABLE _BITUL(1) +#define CXL_DVSEC_CAP_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_CAP_MEM_HW_INIT_MODE _BITUL(3) +#define CXL_DVSEC_CAP_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP_CACHE_WBI_CAPABLE _BITUL(6) +#define 
CXL_DVSEC_CAP_CXL_RESET_CAPABLE _BITUL(7) +#define CXL_DVSEC_CAP_CXL_RESET_TIMEOUT_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CAP_CXL_RESET_MEM_CLR_CAPABLE _BITUL(11) +#define CXL_DVSEC_CAP_TSP_CAPABLE _BITUL(12) +#define CXL_DVSEC_CAP_MLD_CAPABLE _BITUL(13) +#define CXL_DVSEC_CAP_VIRAL_CAPABLE _BITUL(14) +#define CXL_DVSEC_CAP_PM_INIT_REPORTING_CAPABLE _BITUL(15) + +/* CXL r4.0 8.1.3.2 Table 8-6 DVSEC CXL Control (offset 0x0C) */ +#define CXL_DVSEC_CTRL_CACHE_ENABLE _BITUL(0) +#define CXL_DVSEC_CTRL_IO_ENABLE _BITUL(1) +#define CXL_DVSEC_CTRL_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_CTRL_CACHE_SF_COVERAGE_MASK __GENMASK(7, 3) +#define CXL_DVSEC_CTRL_CACHE_SF_GRANULARITY_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CTRL_CACHE_CLEAN_EVICTION _BITUL(11) +#define CXL_DVSEC_CTRL_P2P_MEM_ENABLE _BITUL(12) +/* bit 13: RsvdP */ +#define CXL_DVSEC_CTRL_VIRAL_ENABLE _BITUL(14) +/* bit 15: RsvdP */ + +/* CXL r4.0 8.1.3.3 Table 8-7 DVSEC CXL Status (offset 0x0E) */ +/* bits 13:0 = RsvdZ */ +#define CXL_DVSEC_STATUS_VIRAL_STATUS _BITUL(14) +/* bit 15 = RsvdZ */ + +/* CXL r4.0 8.1.3.4 Table 8-8 DVSEC CXL Control2 (offset 0x10) */ +#define CXL_DVSEC_CTRL2_DISABLE_CACHING _BITUL(0) +#define CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI _BITUL(1) +#define CXL_DVSEC_CTRL2_INITIATE_CXL_RESET _BITUL(2) +#define CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE _BITUL(3) +#define CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM _BITUL(4) +#define CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE _BITUL(5) +/* bits 15:6 = RsvdP */ + +/* CXL r4.0 8.1.3.5 Table 8-9 DVSEC CXL Status2 (offset 0x12) */ +#define CXL_DVSEC_STATUS2_CACHE_INVALID _BITUL(0) +#define CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE _BITUL(1) +#define CXL_DVSEC_STATUS2_CXL_RESET_ERROR _BITUL(2) +/* RW1CS; RsvdZ if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR _BITUL(3) +/* bits 14:4 = RsvdZ */ +#define CXL_DVSEC_STATUS2_PM_INIT_COMPLETION _BITUL(15) + +/* CXL r4.0 8.1.3.6 Table 8-10 DVSEC CXL Lock (offset 0x14) */ +#define 
CXL_DVSEC_LOCK_CONFIG_LOCK _BITUL(0) +/* bits 15:1 = RsvdP */ + +/* CXL r4.0 8.1.3.7 Table 8-11 DVSEC CXL Capability2 (offset 0x16) */ +#define CXL_DVSEC_CAP2_CACHE_SIZE_UNIT_MASK __GENMASK(3, 0) +#define CXL_DVSEC_CAP2_FALLBACK_CAPABILITY_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP2_NO_CLEAN_WRITEBACK _BITUL(7) +#define CXL_DVSEC_CAP2_CACHE_SIZE_MASK __GENMASK(15, 8) + +/* CXL r4.0 8.1.3.14 Table 8-20 DVSEC CXL Capability3 (offset 0x38) */ +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_COLD_RESET _BITUL(0) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_WARM_RESET _BITUL(1) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_HOT_RESET _BITUL(2) +#define CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY _BITUL(3) +#define CXL_DVSEC_CAP3_P2P_MEM_CAPABLE _BITUL(4) +/* bits 15:5 = RsvdP */ + +#endif /* _UAPI_CXL_REGS_H_ */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 14f634ab9350d5442192162225b5e5202dbe2308..a7ac017baa1c1ccbd79330af114931801ba9990e 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1353,6 +1353,19 @@ #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) + +#define PCI_DVSEC_CXL_CTRL_RWL 0x5FED +#define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) +#define PCI_DVSEC_CXL_INIT_CACHE_WBI _BITUL(1) +#define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) +#define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) +#define PCI_DVSEC_CXL_STATUS2 0x12 +#define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) +#define PCI_DVSEC_CXL_RST_DONE _BITUL(1) +#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) +#define PCI_DVSEC_CXL_LOCK 0x14 +#define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h 
index 5de618a3a5eeee0f9062dba8d440201acfb02fb4..fd1f007b76f7269b0310609e4b99ba7ce9029299 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -215,6 +215,16 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +/* + * Vendor-specific CXL device with CXL.mem capability (HDM-D or HDM-DB + * decoder, PCI class code != PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 + * accelerators and non-class-code Type-3 variants. When set, + * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The + * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL + * describing HDM decoders, region indices, decoder layout, and CXL-specific + * options. + */ +#define VFIO_DEVICE_FLAGS_CXL (1 << 9) /* Device supports CXL */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -257,6 +267,70 @@ struct vfio_device_info_cap_pci_atomic_comp { __u32 reserved; }; +/* + * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability + * + * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL + * is set. Describes Host Managed Device Memory (HDM) layout and CXL + * memory options so that userspace (e.g. QEMU) can expose the CXL region + * and component registers correctly to the guest. + * + * The HDM decoder count and HDM decoder block offset within the COMP_REGS + * region are derivable from the COMP_REGS region itself. + * + * To find the HDM decoder block offset (hdm_decoder_offset), traverse the CXL + * Capability Array starting at COMP_REGS region offset 0: + * - Dword 0 bits[31:24] (CXL_CM_CAP_HDR_ARRAY_SIZE_MASK): number of + * capability entries. 
+ * - Each subsequent dword at offset (cap * 4): bits[15:0] = cap ID + * (CXL_CM_CAP_HDR_ID_MASK), bits[31:20] = byte offset from COMP_REGS + * start to that capability's register block (CXL_CM_CAP_PTR_MASK). + * - Locate the entry with cap ID == CXL_CM_CAP_CAP_ID_HDM (0x5); the + * extracted bits[31:20] value is directly the byte offset + * hdm_decoder_offset (no further scaling required). + * + * To find the HDM decoder count, pread the HDM Decoder Capability register + * at hdm_decoder_offset + CXL_HDM_DECODER_CAP_OFFSET within the + * COMP_REGS region; bits[3:0] (CXL_HDM_DECODER_COUNT_MASK) encode the count + * using the formula: count = (field == 0) ? 1 : field * 2. + */ +#define VFIO_DEVICE_INFO_CAP_CXL 6 +struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; + __u8 hdm_regs_bar_index; /* PCI BAR containing HDM registers */ + __u8 reserved[3]; + __u32 flags; +/* Decoder was committed by host firmware/BIOS */ +#define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) +/* + * Device implements an HDM-DB decoder (CXL.cache + CXL.mem). Reflects + * the Cache_Capable bit (bit 0) in the CXL DVSEC Capability register. + * + * When clear: HDM-D decoder (CXL.mem only, no CXL.cache). FLR does not + * require a Write-Back Invalidation (WBI) sequence; the device holds no + * coherent copies of host memory. + * + * When set: HDM-DB decoder (CXL 3.0+). The kernel driver does not + * perform Write-Back Invalidation (WBI) automatically. The VMM must + * issue a WBI sequence before asserting FLR to flush dirty device cache + * lines and prevent coherency violations, and should advertise + * Back-Invalidation support in the virtual CXL topology. + */ +#define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + /* + * Byte offset within the BAR to the CXL.mem register area start + * (= comp_reg_offset + CXL_CM_OFFSET). This is where the CXL + * Capability Array Header lives. + */ + __u64 hdm_regs_offset; + /* + * Region indices for the two CXL VFIO device regions. 
+ * Avoids forcing userspace to scan all regions by type/subtype. + */ + __u32 dpa_region_index; /* VFIO_REGION_SUBTYPE_CXL */ + __u32 comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */ +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -370,6 +444,18 @@ struct vfio_region_info_cap_type { */ #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* 1e98 vendor PCI sub-types (CXL Consortium) */ +/* + * CXL memory region. Use with region type + * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE). + * DPA memory region (fault+zap mmap) + */ +#define VFIO_REGION_SUBTYPE_CXL (1) +/* + * HDM decoder register emulation region (read/write only, no mmap). + */ +#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS (2) + /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 418669927fb00d0590e503e4bebe071c5412def2..6acdf48d2bd3eaab9b286791db63b9299413c1c5 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1041,6 +1041,45 @@ static void default_mock_decoder(struct cxl_decoder *cxld) WARN_ON_ONCE(!cxld_registry_new(cxld)); } +static void size_zero_mock_decoder_ep(struct cxl_decoder *cxld, u64 base) +{ + struct cxl_endpoint_decoder *cxled; + + cxled = to_cxl_endpoint_decoder(&cxld->dev); + cxld->hpa_range = (struct range){ + .start = base, + .end = base - 1, /* Size 0 */ + }; + + cxld->interleave_ways = 2; + cxld->interleave_granularity = 4096; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->flags = CXL_DECODER_F_ENABLE; + cxled->state = CXL_DECODER_STATE_AUTO; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; +} + +static void size_zero_mock_decoder_sw(struct device *dev, u64 base, int i) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + + cxlsd = to_cxl_switch_decoder(dev); + cxld = &cxlsd->cxld; + cxld->flags = CXL_DECODER_F_ENABLE; + 
cxld->target_type = CXL_DECODER_HOSTONLYMEM; + if (i == 0) + cxld->interleave_ways = 2; + else + cxld->interleave_ways = 1; + cxld->interleave_granularity = 4096; + cxld->hpa_range = (struct range) { + .start = base, + .end = base - 1, /* Size 0 */ + }; +} + static int first_decoder(struct device *dev, const void *data) { struct cxl_decoder *cxld; @@ -1053,22 +1092,31 @@ static int first_decoder(struct device *dev, const void *data) return 0; } -/* - * Initialize a decoder during HDM enumeration. - * - * If a saved registry entry exists: - * - enabled decoders are restored from the saved programming - * - disabled decoders are initialized in a clean disabled state - * - * If no registry entry exists the decoder follows the normal mock - * initialization path, including the special auto-region setup for - * the first endpoints under host-bridge0. - * - * Returns true if decoder state was restored from the registry. In - * that case the saved decode configuration (including target mapping) - * has already been applied and the map_targets() is skipped. 
- */ -static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) +static int second_decoder(struct device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 1) + return 1; + return 0; +} + +static int third_decoder(struct device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 2) + return 1; + return 0; +} + +static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) { struct acpi_cedt_cfmws *window = mock_cfmws[0]; struct platform_device *pdev = NULL; @@ -1080,7 +1128,7 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_dport *dport; struct device *dev; bool hb0 = false; - u64 base; + u64 base = window->base_hpa; int i; if (is_endpoint_decoder(&cxld->dev)) { @@ -1122,6 +1170,20 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) return false; } + /* + * Decoders 1 and 2 of the endpoint under host bridge 0 should be enabled as zero-sized. + * It would be even better to make sure that the parent switch uport decoder was + * also enabled before enabling the size zero decoders but there is no harm in doing it + * anyway. + */ + if (hb0 && (cxld->id == 1 || cxld->id == 2)) { + port = to_cxl_port(cxld->dev.parent); + size_zero_mock_decoder_ep(cxld, base); + /* Commit the zero-sized decoder; not a registry restore, so report false */ + port->commit_end = cxld->id; + return false; + } + /* * The first decoder on the first 2 devices on the first switch * attached to host-bridge0 mock a fake / static RAM region. 
All @@ -1142,7 +1204,6 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) return false; } - base = window->base_hpa; if (extended_linear_cache) base += mock_auto_region_size; cxld->hpa_range = (struct range) { @@ -1214,6 +1275,22 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld_registry_update(cxld); put_device(dev); + + /* Enable the next two decoders also and make them zero sized */ + dev = device_find_child(&iter->dev, NULL, second_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 1; + put_device(dev); + } + dev = device_find_child(&iter->dev, NULL, third_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 2; + put_device(dev); + } } return false;