接前一篇文章:
QEMU模拟的设备很多都是PCI设备,本节介绍PCI设备的模拟。与所有设备类似,PCI设备的父设备也是TYPE_DEVICE,其定义在QEMU源码根目录/hw/pci/pci.c中,代码如下:
static const TypeInfo pci_device_type_info = {
.name = TYPE_PCI_DEVICE,
.parent = TYPE_DEVICE,
.instance_size = sizeof(PCIDevice),
.abstract = true,
.class_size = sizeof(PCIDeviceClass),
.class_init = pci_device_class_init,
.class_base_init = pci_device_class_base_init,
};
static void pci_register_types(void)
{
type_register_static(&pci_bus_info);
type_register_static(&pcie_bus_info);
type_register_static(&cxl_bus_info);
type_register_static(&conventional_pci_interface_info);
type_register_static(&cxl_interface_info);
type_register_static(&pcie_interface_info);
type_register_static(&pci_device_type_info);
}
type_init(pci_register_types)
其中,TypeInfo的定义在include/qom/object.h中,如下:
typedef struct TypeInfo TypeInfo;
而struct TypeInfo的定义在include/qomobject.h中,代码如下:
/**
* struct TypeInfo:
* @name: The name of the type.
* @parent: The name of the parent type.
* @instance_size: The size of the object (derivative of #Object). If
* @instance_size is 0, then the size of the object will be the size of the
* parent object.
* @instance_align: The required alignment of the object. If @instance_align
* is 0, then normal malloc alignment is sufficient; if non-zero, then we
* must use qemu_memalign for allocation.
* @instance_init: This function is called to initialize an object. The parent
* class will have already been initialized so the type is only responsible
* for initializing its own members.
* @instance_post_init: This function is called to finish initialization of
* an object, after all @instance_init functions were called.
* @instance_finalize: This function is called during object destruction. This
* is called before the parent @instance_finalize function has been called.
* An object should only free the members that are unique to its type in this
* function.
* @abstract: If this field is true, then the class is considered abstract and
* cannot be directly instantiated.
* @class_size: The size of the class object (derivative of #ObjectClass)
* for this object. If @class_size is 0, then the size of the class will be
* assumed to be the size of the parent class. This allows a type to avoid
* implementing an explicit class type if they are not adding additional
* virtual functions.
* @class_init: This function is called after all parent class initialization
* has occurred to allow a class to set its default virtual method pointers.
* This is also the function to use to override virtual methods from a parent
* class.
* @class_base_init: This function is called for all base classes after all
* parent class initialization has occurred, but before the class itself
* is initialized. This is the function to use to undo the effects of
* memcpy from the parent class to the descendants.
* @class_data: Data to pass to the @class_init,
* @class_base_init. This can be useful when building dynamic
* classes.
* @interfaces: The list of interfaces associated with this type. This
* should point to a static array that's terminated with a zero filled
* element.
*/
struct TypeInfo
{
const char *name;
const char *parent;
size_t instance_size;
size_t instance_align;
void (*instance_init)(Object *obj);
void (*instance_post_init)(Object *obj);
void (*instance_finalize)(Object *obj);
bool abstract;
size_t class_size;
void (*class_init)(ObjectClass *klass, void *data);
void (*class_base_init)(ObjectClass *klass, void *data);
void *class_data;
InterfaceInfo *interfaces;
};
这里,对于TypeInfo即struct TypeInfo的对象pci_device_type_info来说,其class_init(函数指针)成员指向了pci_device_class_init函数。该函数也在hw/pci/pci.c中,代码如下:
static void pci_device_class_init(ObjectClass *klass, void *data)
{
DeviceClass *k = DEVICE_CLASS(klass);
k->realize = pci_qdev_realize;
k->unrealize = pci_qdev_unrealize;
k->bus_type = TYPE_PCI_BUS;
device_class_set_props(k, pci_props);
}
PCI类初始化函数中设置了PCIDeviceClass基类对象DeviceClass的realize和unrealize函数;bus_type表示设备挂接到的总线;props表示PCI设备有哪些属性,这些属性都可以在命令行指定。同样的,不存在单独的PCI设备,PCI设备也是一个抽象类。
PCI设备的具现化函数为pci_qdev_realize。该函数同样在hw/pci/pci.c中,代码如下:
static void pci_qdev_realize(DeviceState *qdev, Error **errp)
{
PCIDevice *pci_dev = (PCIDevice *)qdev;
PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev);
ObjectClass *klass = OBJECT_CLASS(pc);
Error *local_err = NULL;
bool is_default_rom;
uint16_t class_id;
/*
* capped by systemd (see: udev-builtin-net_id.c)
* as it's the only known user honor it to avoid users
* misconfigure QEMU and then wonder why acpi-index doesn't work
*/
if (pci_dev->acpi_index > ONBOARD_INDEX_MAX) {
error_setg(errp, "acpi-index should be less or equal to %u",
ONBOARD_INDEX_MAX);
return;
}
/*
* make sure that acpi-index is unique across all present PCI devices
*/
if (pci_dev->acpi_index) {
GSequence *used_indexes = pci_acpi_index_list();
if (g_sequence_lookup(used_indexes,
GINT_TO_POINTER(pci_dev->acpi_index),
g_cmp_uint32, NULL)) {
error_setg(errp, "a PCI device with acpi-index = %" PRIu32
" already exist", pci_dev->acpi_index);
return;
}
g_sequence_insert_sorted(used_indexes,
GINT_TO_POINTER(pci_dev->acpi_index),
g_cmp_uint32, NULL);
}
if (pci_dev->romsize != -1 && !is_power_of_2(pci_dev->romsize)) {
error_setg(errp, "ROM size %u is not a power of two", pci_dev->romsize);
return;
}
/* initialize cap_present for pci_is_express() and pci_config_size(),
* Note that hybrid PCIs are not set automatically and need to manage
* QEMU_PCI_CAP_EXPRESS manually */
if (object_class_dynamic_cast(klass, INTERFACE_PCIE_DEVICE) &&
!object_class_dynamic_cast(klass, INTERFACE_CONVENTIONAL_PCI_DEVICE)) {
pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
}
if (object_class_dynamic_cast(klass, INTERFACE_CXL_DEVICE)) {
pci_dev->cap_present |= QEMU_PCIE_CAP_CXL;
}
pci_dev = do_pci_register_device(pci_dev,
object_get_typename(OBJECT(qdev)),
pci_dev->devfn, errp);
if (pci_dev == NULL)
return;
if (pc->realize) {
pc->realize(pci_dev, &local_err);
if (local_err) {
error_propagate(errp, local_err);
do_pci_unregister_device(pci_dev);
return;
}
}
/*
* A PCIe Downstream Port that do not have ARI Forwarding enabled must
* associate only Device 0 with the device attached to the bus
* representing the Link from the Port (PCIe base spec rev 4.0 ver 0.3,
* sec 7.3.1).
* With ARI, PCI_SLOT() can return non-zero value as the traditional
* 5-bit Device Number and 3-bit Function Number fields in its associated
* Routing IDs, Requester IDs and Completer IDs are interpreted as a
* single 8-bit Function Number. Hence, ignore ARI capable devices.
*/
if (pci_is_express(pci_dev) &&
!pcie_find_capability(pci_dev, PCI_EXT_CAP_ID_ARI) &&
pcie_has_upstream_port(pci_dev) &&
PCI_SLOT(pci_dev->devfn)) {
warn_report("PCI: slot %d is not valid for %s,"
" parent device only allows plugging into slot 0.",
PCI_SLOT(pci_dev->devfn), pci_dev->name);
}
if (pci_dev->failover_pair_id) {
if (!pci_bus_is_express(pci_get_bus(pci_dev))) {
error_setg(errp, "failover primary device must be on "
"PCIExpress bus");
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
error_setg(errp, "failover primary device is not an "
"Ethernet device");
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
if ((pci_dev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION)
|| (PCI_FUNC(pci_dev->devfn) != 0)) {
error_setg(errp, "failover: primary device must be in its own "
"PCI slot");
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
qdev->allow_unplug_during_migration = true;
}
/* rom loading */
is_default_rom = false;
if (pci_dev->romfile == NULL && pc->romfile != NULL) {
pci_dev->romfile = g_strdup(pc->romfile);
is_default_rom = true;
}
pci_add_option_rom(pci_dev, is_default_rom, &local_err);
if (local_err) {
error_propagate(errp, local_err);
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
pci_set_power(pci_dev, true);
pci_dev->msi_trigger = pci_msi_trigger;
}
pci_qdev_realize函数主要包括三个方面的工作:
(1)首先调用do_pci_register_device函数进行注册。
代码片段如下:
pci_dev = do_pci_register_device(pci_dev,
object_get_typename(OBJECT(qdev)),
pci_dev->devfn, errp);
if (pci_dev == NULL)
return;
do_pci_register_device函数同样在hw/pci/pci.c中,代码如下:
/* -1 for devfn means auto assign */
static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
const char *name, int devfn,
Error **errp)
{
PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev);
PCIConfigReadFunc *config_read = pc->config_read;
PCIConfigWriteFunc *config_write = pc->config_write;
Error *local_err = NULL;
DeviceState *dev = DEVICE(pci_dev);
PCIBus *bus = pci_get_bus(pci_dev);
bool is_bridge = IS_PCI_BRIDGE(pci_dev);
/* Only pci bridges can be attached to extra PCI root buses */
if (pci_bus_is_root(bus) && bus->parent_dev && !is_bridge) {
error_setg(errp,
"PCI: Only PCI/PCIe bridges can be plugged into %s",
bus->parent_dev->name);
return NULL;
}
if (devfn < 0) {
for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices);
devfn += PCI_FUNC_MAX) {
if (pci_bus_devfn_available(bus, devfn) &&
!pci_bus_devfn_reserved(bus, devfn)) {
goto found;
}
}
error_setg(errp, "PCI: no slot/function available for %s, all in use "
"or reserved", name);
return NULL;
found: ;
} else if (pci_bus_devfn_reserved(bus, devfn)) {
error_setg(errp, "PCI: slot %d function %d not available for %s,"
" reserved",
PCI_SLOT(devfn), PCI_FUNC(devfn), name);
return NULL;
} else if (!pci_bus_devfn_available(bus, devfn)) {
error_setg(errp, "PCI: slot %d function %d not available for %s,"
" in use by %s,id=%s",
PCI_SLOT(devfn), PCI_FUNC(devfn), name,
bus->devices[devfn]->name, bus->devices[devfn]->qdev.id);
return NULL;
} /*
* Populating function 0 triggers a scan from the guest that
* exposes other non-zero functions. Hence we need to ensure that
* function 0 wasn't added yet.
*/
else if (dev->hotplugged &&
!pci_is_vf(pci_dev) &&
pci_get_function_0(pci_dev)) {
error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
" new func %s cannot be exposed to guest.",
PCI_SLOT(pci_get_function_0(pci_dev)->devfn),
pci_get_function_0(pci_dev)->name,
name);
return NULL;
}
pci_dev->devfn = devfn;
pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev);
pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
memory_region_init(&pci_dev->bus_master_container_region, OBJECT(pci_dev),
"bus master container", UINT64_MAX);
address_space_init(&pci_dev->bus_master_as,
&pci_dev->bus_master_container_region, pci_dev->name);
if (phase_check(PHASE_MACHINE_READY)) {
pci_init_bus_master(pci_dev);
}
pci_dev->irq_state = 0;
pci_config_alloc(pci_dev);
pci_config_set_vendor_id(pci_dev->config, pc->vendor_id);
pci_config_set_device_id(pci_dev->config, pc->device_id);
pci_config_set_revision(pci_dev->config, pc->revision);
pci_config_set_class(pci_dev->config, pc->class_id);
if (!is_bridge) {
if (pc->subsystem_vendor_id || pc->subsystem_id) {
pci_set_word(pci_dev->config + PCI_SUBSYSTEM_VENDOR_ID,
pc->subsystem_vendor_id);
pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID,
pc->subsystem_id);
} else {
pci_set_default_subsystem_id(pci_dev);
}
} else {
/* subsystem_vendor_id/subsystem_id are only for header type 0 */
assert(!pc->subsystem_vendor_id);
assert(!pc->subsystem_id);
}
pci_init_cmask(pci_dev);
pci_init_wmask(pci_dev);
pci_init_w1cmask(pci_dev);
if (is_bridge) {
pci_init_mask_bridge(pci_dev);
}
pci_init_multifunction(bus, pci_dev, &local_err);
if (local_err) {
error_propagate(errp, local_err);
do_pci_unregister_device(pci_dev);
return NULL;
}
if (!config_read)
config_read = pci_default_read_config;
if (!config_write)
config_write = pci_default_write_config;
pci_dev->config_read = config_read;
pci_dev->config_write = config_write;
bus->devices[devfn] = pci_dev;
pci_dev->version_id = 2; /* Current pci device vmstate version */
return pci_dev;
}
do_pci_register_device函数完成设备及其对应PCI总线上的一些初始化工作。
1)如果指定的devfn为-1,表示由总线自己选择插槽,得到插槽之后保存在PCIDevice的devfn(即pci_dev->devfn)中;如果在设备命令行中指定了addr,则addr会作为设备的devfn。代码片段如下:
if (devfn < 0) {
for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices);
devfn += PCI_FUNC_MAX) {
if (pci_bus_devfn_available(bus, devfn) &&
!pci_bus_devfn_reserved(bus, devfn)) {
goto found;
}
}
error_setg(errp, "PCI: no slot/function available for %s, all in use "
"or reserved", name);
return NULL;
found: ;
} else if (pci_bus_devfn_reserved(bus, devfn)) {
error_setg(errp, "PCI: slot %d function %d not available for %s,"
" reserved",
PCI_SLOT(devfn), PCI_FUNC(devfn), name);
return NULL;
} else if (!pci_bus_devfn_available(bus, devfn)) {
error_setg(errp, "PCI: slot %d function %d not available for %s,"
" in use by %s,id=%s",
PCI_SLOT(devfn), PCI_FUNC(devfn), name,
bus->devices[devfn]->name, bus->devices[devfn]->qdev.id);
return NULL;
} /*
* Populating function 0 triggers a scan from the guest that
* exposes other non-zero functions. Hence we need to ensure that
* function 0 wasn't added yet.
*/
else if (dev->hotplugged &&
!pci_is_vf(pci_dev) &&
pci_get_function_0(pci_dev)) {
error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
" new func %s cannot be exposed to guest.",
PCI_SLOT(pci_get_function_0(pci_dev)->devfn),
pci_get_function_0(pci_dev)->name,
name);
return NULL;
}
pci_dev->devfn = devfn;
pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev);
pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
2)接下来设置PCIDevice结构体中的各个域,包括调用pci_init_bus_master函数初始化PCIDevice中的Address成员bus_master_as及其对应的MR。代码片段如下:
memory_region_init(&pci_dev->bus_master_container_region, OBJECT(pci_dev),
"bus master container", UINT64_MAX);
address_space_init(&pci_dev->bus_master_as,
&pci_dev->bus_master_container_region, pci_dev->name);
if (phase_check(PHASE_MACHINE_READY)) {
pci_init_bus_master(pci_dev);
}
3)之后,调用pci_config_alloc函数分配PCI设备的配置空间,cmake用来检测相关的能力,wmask用来控制读写,w1cmask用来实现RW1C。由此完成一些初始化的设置,如vendor_id等。代码片段如下:
pci_config_alloc(pci_dev);
pci_config_set_vendor_id(pci_dev->config, pc->vendor_id);
pci_config_set_device_id(pci_dev->config, pc->device_id);
pci_config_set_revision(pci_dev->config, pc->revision);
pci_config_set_class(pci_dev->config, pc->class_id);
if (!is_bridge) {
if (pc->subsystem_vendor_id || pc->subsystem_id) {
pci_set_word(pci_dev->config + PCI_SUBSYSTEM_VENDOR_ID,
pc->subsystem_vendor_id);
pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID,
pc->subsystem_id);
} else {
pci_set_default_subsystem_id(pci_dev);
}
} else {
/* subsystem_vendor_id/subsystem_id are only for header type 0 */
assert(!pc->subsystem_vendor_id);
assert(!pc->subsystem_id);
}
pci_init_cmask(pci_dev);
pci_init_wmask(pci_dev);
pci_init_w1cmask(pci_dev);
if (is_bridge) {
pci_init_mask_bridge(pci_dev);
}
pci_init_multifunction(bus, pci_dev, &local_err);
if (local_err) {
error_propagate(errp, local_err);
do_pci_unregister_device(pci_dev);
return NULL;
}
4)然后是设置设备的config_read和config_write函数。如果相关的子类自己没有设置,那么就使用默认的pci_default_read/write_config函数。代码片段如下:
if (!config_read)
config_read = pci_default_read_config;
if (!config_write)
config_write = pci_default_write_config;
pci_dev->config_read = config_read;
pci_dev->config_write = config_write;
5)最后,将该device复制到bus->devices数组中。代码片段如下:
bus->devices[devfn] = pci_dev;
pci_dev->version_id = 2; /* Current pci device vmstate version */
至此,pci_qdev_realize函数所做的第一方面工作即所调用的第1个函数do_pci_register_device()就解析完了。
欲知后事如何,且看下回分解。