文章目录
- 前言
- 相关链接
- Ref
- 正文
前言
CXL 是一个比较新的技术,所以我研究的内核源码是选了当前比较新的内核版本 linux 6.0。打算将内核关于 CXL 的驱动进行解析一遍,一步一步慢慢来。
在阅读之前,希望读者能有一定的 PCIe 基础知识。精力有限,无法把所有知识点都讲得很详细,需要一定的基础才能理解;同时,希望在学习的过程中,手边能有 PCIe 5.0 Spec 以及 CXL 2.0 Spec,以便随时查看。当然,我也会尽量把重点的部分截图在博文中。
最后,如果有问题请留言讨论。
相关链接
Linux Kernel 6.0 CXL Core Regs.c 详解
Ref
《PCI_Express_Base_5.0r1.0》
《CXL Specification_rev2p0_ver1p0_2020Oct26》
正文
首先,这仍然是一个 PCI 设备驱动模型:根据 pci_device_id 中的 Class 去匹配设备,匹配成功后调用 probe 函数。
/*
 * PCI ID table for this driver. Devices are matched by class code rather
 * than by vendor/device ID; the single entry below is followed by the
 * empty terminator entry.
 */
static const struct pci_device_id cxl_mem_pci_tbl[] = {
/* PCI class code for CXL.mem Type-3 Devices */
{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
{ /* terminate list */ },
};
/* Export the table so the module can be matched against PCI devices. */
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
/*
 * PCI driver description: ties the class-code match table to the probe
 * callback defined in this file.
 */
static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
/* Table used to match devices to this driver */
.id_table = cxl_mem_pci_tbl,
/* Called back when a device matches the table */
.probe = cxl_pci_probe,
.driver = {
/* Ask for asynchronous probing to be preferred */
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
};
以下对 probe 函数进行解析:
/*
 * cxl_pci_probe() - Probe callback for a matched CXL memory device.
 * @pdev: The PCI device that matched cxl_mem_pci_tbl.
 * @id: The matching table entry (not used in the body).
 *
 * Enables the device, creates the cxl_dev_state, locates and maps the
 * memory-device register block, records the component register base,
 * creates DOE mailbox objects, brings up the primary mailbox, queries the
 * device through it and finally registers the memdev (and, when persistent
 * memory is present and CONFIG_CXL_PMEM is enabled, an nvdimm).
 *
 * Return: 0 on success, otherwise the error code of the first step that
 * failed. A missing device DVSEC or missing component registers only
 * produce a warning.
 */
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct cxl_register_map map;
struct cxl_memdev *cxlmd;
struct cxl_dev_state *cxlds;
int rc;
/*
* Double check the anonymous union trickery in struct cxl_regs
* FIXME switch to struct_group()
*/
BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
offsetof(struct cxl_regs, device_regs.memdev));
// Enable the device.
rc = pcim_enable_device(pdev);
if (rc)
return rc;
// Allocate the struct cxl_dev_state and initialise part of its fields.
cxlds = cxl_dev_state_create(&pdev->dev);
if (IS_ERR(cxlds))
return PTR_ERR(cxlds);
// Looks up the PCI_EXT_CAP_ID_DSN and reads the 8 bytes of the Device Serial Number.
// The Device Serial Number is two dwords offset 4 bytes from the capability position
// (located in the PCIe extended capabilities; see the PCIe spec for details).
cxlds->serial = pci_get_dsn(pdev);
// Walk the extended capability area of PCIe config space looking for a
// DVSEC (extended capability ID 0x23) whose DVSEC vendor ID is
// PCI_DVSEC_VENDOR_ID_CXL and whose DVSEC ID is CXL_DVSEC_PCIE_DEVICE (0),
// i.e. the "DVSEC for CXL Device".
// A DVSEC can be thought of as a region holding a set of registers.
// Ref PCIe 5.0 Spec 7.9.6 Designated Vendor-Specific Extended Capability (DVSEC)
// ID assignment: Ref CXL 2.0 Spec 8.1.1 PCI Express Designated Vendor-Specific Extended Capability(DVSEC) ID Assignment
cxlds->cxl_dvsec = pci_find_dvsec_capability(
pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
// A missing DVSEC is only warned about; probing continues below.
if (!cxlds->cxl_dvsec)
dev_warn(&pdev->dev,
"Device DVSEC not present, skip CXL.mem init\n");
// Locate and probe the memory-device register block; this one is mandatory.
rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
if (rc)
return rc;
// Map the registers described by @map into cxlds.
rc = cxl_map_regs(cxlds, &map);
if (rc)
return rc;
/*
* If the component registers can't be found, the cxl_pci driver may
* still be useful for management functions so don't return an error.
*/
cxlds->component_reg_phys = CXL_RESOURCE_NONE;
// Locate the component register block and record its position and size
// in @map (see cxl_setup_regs()).
rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
if (rc)
dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
// Physical base address of the component register block.
// NOTE(review): this assignment also runs when the lookup above failed;
// presumably cxl_regmap_to_base() copes with an unpopulated map - confirm.
cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map);
// Create an object for every DOE mailbox (best effort, no return value).
devm_cxl_pci_create_doe(cxlds);
// Wait for the mailbox to become ready and record payload size etc.
rc = cxl_pci_setup_mailbox(cxlds);
if (rc)
return rc;
// Enumerate commands for a device (implemented in mbox.c).
rc = cxl_enumerate_cmds(cxlds);
if (rc)
return rc;
// Send the IDENTIFY command to the device (implemented in mbox.c).
// It retrieves basic information about the memory device, such as
// total_bytes and volatile_only_bytes.
rc = cxl_dev_state_identify(cxlds);
if (rc)
return rc;
// Build the memory range information (implemented in mbox.c).
rc = cxl_mem_create_range_info(cxlds);
if (rc)
return rc;
// Register the memdev (character device); implemented in memdev.c.
cxlmd = devm_cxl_add_memdev(cxlds);
if (IS_ERR(cxlmd))
return PTR_ERR(cxlmd);
// Only when the persistent-memory resource is non-empty and CONFIG_CXL_PMEM
// is enabled is an nvdimm added; otherwise rc is still 0 from the step above.
if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
return rc;
}
1. CXL Subsystem Component Register Ranges
2. Type:
/*
 * cxl_setup_regs() - Find one CXL register block and probe what it contains.
 * @pdev: PCI device that exposes the register block.
 * @type: Register block identifier to search for.
 * @map: Receives the BAR number, offset, register type and probe results.
 *
 * CXL devices advertise the layout of their CXL register blocks through the
 * Register Locator DVSEC (CXL 2.0 Spec 8.1.9, 8.1.9.1 Register Offset Low).
 * Block identifiers are:
 *   0 - empty
 *   1 - Component Registers
 *   2 - BAR Virtualization ACL Registers
 *   3 - CXL Memory Device Registers
 *
 * The block is mapped only for the duration of the probe and is unmapped
 * again before returning, whatever the probe result.
 *
 * Return: 0 on success, otherwise the error of the step that failed.
 */
static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
			  struct cxl_register_map *map)
{
	int err;

	/* Locate the block by type: records BAR, offset and type in @map. */
	err = cxl_find_regblock(pdev, type, map);

	/* Map the BAR that holds it and remember the block's base address. */
	if (!err)
		err = cxl_map_regblock(pdev, map);
	if (err)
		return err;

	/* Inspect the block, then drop the temporary mapping again. */
	err = cxl_probe_regs(pdev, map);
	cxl_unmap_regblock(pdev, map);

	return err;
}
static int cxl_map_regblock(struct pci_dev *pdev, struct cxl_register_map *map)
{
void __iomem *addr;
int bar = map->barno;
struct device *dev = &pdev->dev;
resource_size_t offset = map->block_offset;
/* Basic sanity check that BAR is big enough */
// 检查 BAR 总大小是否小于偏移,如果是报错
// 寄存器所在的偏移应该是在 BAR 空间内
if (pci_resource_len(pdev, bar) < offset) {
dev_err(dev, "BAR%d: %pr: too small (offset: %pa)\n", bar,
&pdev->resource[bar], &offset);
return -ENXIO;
}
// 映射 BAR 空间,bar 是序号,0表示不检查长度,全部映射
addr = pci_iomap(pdev, bar, 0);
if (!addr) {
dev_err(dev, "failed to map registers\n");
return -ENOMEM;
}
dev_dbg(dev, "Mapped CXL Memory Device resource bar %u @ %pa\n",
bar, &offset);
// 获取寄存器虚拟地址空间的基地址,内核可直接读写访问
map->base = addr + map->block_offset;
return 0;
}
static int cxl_probe_regs(struct pci_dev *pdev, struct cxl_register_map *map)
{
struct cxl_component_reg_map *comp_map;
struct cxl_device_reg_map *dev_map;
struct device *dev = &pdev->dev;
void __iomem *base = map->base;
// 根据不同的寄存器类型,做不同的处理
switch (map->reg_type) {
case CXL_REGLOC_RBI_COMPONENT:
// 如果是组件寄存器
comp_map = &map->component_map;
// 参考另一篇文章 Regs.c
// 记录 HDM Decoder 寄存器块的offset 以及长度
cxl_probe_component_regs(dev, base, comp_map);
if (!comp_map->hdm_decoder.valid) {
dev_err(dev, "HDM decoder registers not found\n");
return -ENXIO;
}
dev_dbg(dev, "Set up component registers\n");
break;
case CXL_REGLOC_RBI_MEMDEV:
// 如果是 CXL 内存设备寄存器
dev_map = &map->device_map;
// 参考另一篇文章 Regs.c
// 记录 CXL Device 寄存器块的offset 以及长度
cxl_probe_device_regs(dev, base, dev_map);
if (!dev_map->status.valid || !dev_map->mbox.valid ||
!dev_map->memdev.valid) {
dev_err(dev, "registers not found: %s%s%s\n",
!dev_map->status.valid ? "status " : "",
!dev_map->mbox.valid ? "mbox " : "",
!dev_map->memdev.valid ? "memdev " : "");
return -ENXIO;
}
dev_dbg(dev, "Probing device registers...\n");
break;
default:
break;
}
return 0;
}
/*
 * devm_cxl_pci_create_doe() - Create a mailbox object per DOE capability.
 * @cxlds: Device state whose doe_mbs XArray collects the mailboxes, keyed
 *         by the config-space offset of the DOE capability.
 *
 * Mailbox creation is best effort. Higher layers must determine if
 * the lack of a mailbox for their protocol is a device failure or not.
 * Failures are therefore only logged and the function returns nothing.
 */
static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
{
	struct device *dev = cxlds->dev;
	struct pci_dev *pdev = to_pci_dev(dev);
	u16 doe_off = 0;

	/* Start from an empty XArray. */
	xa_init(&cxlds->doe_mbs);

	/* Register cxl_pci_destroy_doe as a device-managed action. */
	if (devm_add_action(&pdev->dev, cxl_pci_destroy_doe, &cxlds->doe_mbs)) {
		dev_err(dev, "Failed to create XArray for DOE's\n");
		return;
	}

	/* Visit every DOE capability the device exposes. */
	pci_doe_for_each_off(pdev, doe_off) {
		/* Create a DOE mailbox object (see doe.c). */
		struct pci_doe_mb *mb = pcim_doe_create_mb(pdev, doe_off);

		if (IS_ERR(mb))
			dev_err(dev, "Failed to create MB object for MB @ %x\n",
				doe_off);
		/* Store the entry unless another one is already present. */
		else if (xa_insert(&cxlds->doe_mbs, doe_off, mb, GFP_KERNEL))
			dev_err(dev, "xa_insert failed to insert MB @ %x\n",
				doe_off);
		else
			dev_dbg(dev, "Created DOE mailbox @%x\n", doe_off);
	}
}
3. Mailbox Registers
4. Mailbox Capabilities Register
5. Mailbox Control Register
6. Mailbox Interfaces Ready
Spec 引用(mailbox 命令超时时间):
The mailbox command timeout is 2 seconds. Commands that require a longer execution time shall be completed asynchronously in the background. Only one command can be executed in the background at a time.
/*
 * cxl_doorbell_busy() - Test the doorbell bit of the mailbox control register.
 *
 * Ref CXL 2.0 8.2.8.4.4 Mailbox Control Register (Mailbox Registers
 * Capability Offset + 04h).
 *
 * Bit 0, Doorbell: when 0 the device is ready to accept a new command. The
 * caller sets it to 1 to tell the device that the command input is ready.
 * While set the bit is read-only; the device clears it once the command has
 * completed. A non-zero result therefore means the mailbox is busy.
 */
#define cxl_doorbell_busy(cxlds) \
(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) & \
CXLDEV_MBOX_CTRL_DOORBELL)
/*
 * cxl_pci_mbox_wait_for_doorbell() - Busy-wait until the doorbell clears.
 * @cxlds: Device state whose mailbox control register is polled.
 *
 * Polls the doorbell bit, relaxing the CPU between reads, until it clears or
 * until more than CXL_MAILBOX_TIMEOUT_MS has been added to the starting
 * jiffies value. The CXL 2.0 spec (8.2.8.4 Mailbox Registers) gives the
 * mailbox command timeout as 2 seconds.
 *
 * Return: 0 once the doorbell is clear, -ETIMEDOUT if it stayed set.
 */
static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
	const unsigned long t_start = jiffies;
	unsigned long t_now = t_start;

	for (;;) {
		if (!cxl_doorbell_busy(cxlds))
			break;

		t_now = jiffies;
		if (time_after(t_now, t_start + CXL_MAILBOX_TIMEOUT_MS)) {
			/* Check again in case preempted before timeout test */
			if (!cxl_doorbell_busy(cxlds))
				break;
			return -ETIMEDOUT;
		}

		cpu_relax();
	}

	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
		jiffies_to_msecs(t_now) - jiffies_to_msecs(t_start));

	return 0;
}
/*
 * cxl_pci_setup_mailbox() - Bring the primary mailbox into a usable state.
 * @cxlds: Device state; mbox_send and payload_size are filled in on success.
 *
 * Waits for the device to report its mailbox interface ready, waits once for
 * any in-flight command to finish, installs the send callback and derives
 * the payload size from the mailbox capabilities register.
 *
 * Return: 0 on success, -ETIMEDOUT when the mailbox never became ready or
 * idle, -ENXIO when the advertised payload size is below 256 bytes.
 */
static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
{
	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
	unsigned long deadline = jiffies + mbox_ready_timeout * HZ;
	u64 md_status;

	/*
	 * Poll the Memory Device Status Register (CXL 2.0 Spec 8.2.8.5.1.1):
	 * bit 4 set means the device is ready to accept commands through the
	 * mailbox. Stop early if the 100ms sleep gets interrupted.
	 */
	for (;;) {
		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
		if (time_after(jiffies, deadline))
			break;
	}

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(cxlds->dev, md_status,
			"timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance,
	 * think kexec, do one doorbell wait so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	if (cxl_pci_mbox_wait_for_doorbell(cxlds)) {
		cxl_err(cxlds->dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxlds->mbox_send = cxl_pci_mbox_send;

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * Payload Size: Size of the Command Payload Registers in bytes,
	 * expressed as 2^n. The minimum size is 256 bytes (n=8) and the
	 * maximum size is 1 MB (n=20).
	 */
	cxlds->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm in soft limiting it.
	 */
	cxlds->payload_size = min_t(size_t, cxlds->payload_size, SZ_1M);
	if (cxlds->payload_size < 256) {
		dev_err(cxlds->dev, "Mailbox is too small (%zub)",
			cxlds->payload_size);
		return -ENXIO;
	}

	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
		cxlds->payload_size);

	return 0;
}
最后剩下一个重要的 mbox send 接口,其实理解也很简单,就是根据一系列寄存器,进行数据读写。
/*
 * cxl_pci_mbox_send() - Send one mailbox command under the mailbox mutex.
 * @cxlds: Device state owning the mailbox and its mutex.
 * @cmd: Command to execute; updated in place by the low-level sender.
 *
 * Return: whatever __cxl_pci_mbox_send_cmd() returns.
 */
static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
{
	int ret;

	/* Serialise access: the low-level sender expects mbox_mutex held. */
	mutex_lock_io(&cxlds->mbox_mutex);
	ret = __cxl_pci_mbox_send_cmd(cxlds, cmd);
	mutex_unlock(&cxlds->mbox_mutex);

	return ret;
}
主要函数为 __cxl_pci_mbox_send_cmd,执行一个 mailbox 命令:
CXL 2.0 8.2.8.4 Mailbox Registers
The flow for executing a command is described below. The term “caller” represents the entity submitting the command:
- Caller reads MB Control Register to verify doorbell is clear
- Caller writes Command Register
- Caller writes Command Payload Registers if input payload is non-empty
- Caller writes MB Control Register to set doorbell
- Caller either polls for doorbell to be clear or waits for interrupt if configured
- Caller reads MB Status Register to fetch Return code
- If command successful, Caller reads Command Register to get Payload Length
- If output payload is non-empty, host reads Command Payload Registers
/**
* __cxl_pci_mbox_send_cmd() - Execute a mailbox command
* @cxlds: The device state to communicate with.
* @mbox_cmd: Command to send to the memory device.
*
* Context: Any context. Expects mbox_mutex to be held.
* Return: -ETIMEDOUT if timeout occurred waiting for completion, -EBUSY if
* the doorbell was already set on entry, -EINVAL if an input size was
* given without an input payload buffer. 0 on success.
* Caller should check the return code in @mbox_cmd to make sure it
* succeeded.
*
* On return with 0 and a successful device return code, @mbox_cmd->size_out
* holds the number of bytes actually copied into @mbox_cmd->payload_out.
*
* This is a generic form of the CXL mailbox send command thus only using the
* registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
* devices, and perhaps other types of CXL devices may have further information
* available upon error conditions. Driver facilities wishing to send mailbox
* commands should use the wrapper command.
*
* The CXL spec allows for up to two mailboxes. The intention is for the primary
* mailbox to be OS controlled and the secondary mailbox to be used by system
* firmware. This allows the OS and firmware to communicate with the device and
* not need to coordinate with each other. The driver only uses the primary
* mailbox.
*/
static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
struct cxl_mbox_cmd *mbox_cmd)
{
void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
struct device *dev = cxlds->dev;
u64 cmd_reg, status_reg;
size_t out_len;
int rc;
lockdep_assert_held(&cxlds->mbox_mutex);
/*
* Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
* 1. Caller reads MB Control Register to verify doorbell is clear
* 2. Caller writes Command Register
* 3. Caller writes Command Payload Registers if input payload is non-empty
* 4. Caller writes MB Control Register to set doorbell
* 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
* 6. Caller reads MB Status Register to fetch Return code
* 7. If command successful, Caller reads Command Register to get Payload Length
* 8. If output payload is non-empty, host reads Command Payload Registers
*
* Hardware is free to do whatever it wants before the doorbell is rung,
* and isn't allowed to change anything after it clears the doorbell. As
* such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
* also happen in any order (though some orders might not make sense).
*/
// The code below essentially follows the spec flow step by step.
/* #1 Caller reads MB Control Register to verify doorbell is clear */
// Step 1: make sure the mailbox is idle by checking the doorbell bit.
if (cxl_doorbell_busy(cxlds)) {
u64 md_status =
readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
"mailbox queue busy");
return -EBUSY;
}
// Start building the command register value with the opcode.
cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
mbox_cmd->opcode);
if (mbox_cmd->size_in) {
// A non-zero input size without a buffer is a caller bug.
if (WARN_ON(!mbox_cmd->payload_in))
return -EINVAL;
// 3. Caller writes Command Payload Registers if input payload is non-empty
// (the payload length is also folded into the command register value).
cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
mbox_cmd->size_in);
memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
}
/* #2, #3 */
// 2. Caller writes Command Register
writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
/* #4 */
// 4. Caller writes MB Control Register to set doorbell
// Setting the doorbell to 1 marks the mailbox as busy until the device
// clears it again.
dev_dbg(dev, "Sending command\n");
writel(CXLDEV_MBOX_CTRL_DOORBELL,
cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
/* #5 */
// 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
// Completion may be awaited by polling or by interrupt; this driver polls.
rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
if (rc == -ETIMEDOUT) {
u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
return rc;
}
/* #6 */
// 6. Caller reads MB Status Register to fetch Return code
// The return code is extracted from the status register and handed back
// to the caller through @mbox_cmd.
status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
mbox_cmd->return_code =
FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
dev_dbg(dev, "Mailbox operation had an error: %s\n",
cxl_mbox_cmd_rc2str(mbox_cmd));
return 0; /* completed but caller must check return_code */
}
/* #7 */
// 7. If command successful, Caller reads Command Register to get Payload Length
// On success the command register is read back to learn the output length.
cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
/* #8 */
// 8. If output payload is non-empty, host reads Command Payload Registers
// Data is only copied when the length is non-zero and the caller supplied
// an output buffer; otherwise size_out is reported as 0.
if (out_len && mbox_cmd->payload_out) {
/*
* Sanitize the copy. If hardware misbehaves, out_len per the
* spec can actually be greater than the max allowed size (21
* bits available but spec defined 1M max). The caller also may
* have requested less data than the hardware supplied even
* within spec.
*/
size_t n = min3(mbox_cmd->size_out, cxlds->payload_size, out_len);
memcpy_fromio(mbox_cmd->payload_out, payload, n);
mbox_cmd->size_out = n;
} else {
mbox_cmd->size_out = 0;
}
return 0;
}