接前一篇文章:
从QEMU源码全解析 —— virtio(3)到上一回QEMU源码全解析 —— virtio(16)花了十几个回目讲解了virtio设备的初始化,本回开始讲解virtio驱动的加载。
前文书讲了每一个virtio设备都有一个对应的virtio PCI代理设备,本回开始来分析虚拟机内部操作系统是如何加载virtio PCI代理设备和virtio设备的驱动,以及如何与virtio设备通信的。由于virtio PCI代理设备的存在,PCI进行扫描的时候会扫描到这个设备,并且会调用相应驱动的probe函数。
virtio_pci_driver及其probe回调函数在Linux内核源码/drivers/virtio/virtio_pci_common.c中,如下所示:
static struct pci_driver virtio_pci_driver = {
.name = "virtio-pci",
.id_table = virtio_pci_id_table,
.probe = virtio_pci_probe,
.remove = virtio_pci_remove,
#ifdef CONFIG_PM_SLEEP
.driver.pm = &virtio_pci_pm_ops,
#endif
.sriov_configure = virtio_pci_sriov_configure,
};
module_pci_driver(virtio_pci_driver);
static int virtio_pci_probe(struct pci_dev *pci_dev,
const struct pci_device_id *id)
{
struct virtio_pci_device *vp_dev, *reg_dev = NULL;
int rc;
/* allocate our structure and fill it out */
vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
if (!vp_dev)
return -ENOMEM;
pci_set_drvdata(pci_dev, vp_dev);
vp_dev->vdev.dev.parent = &pci_dev->dev;
vp_dev->vdev.dev.release = virtio_pci_release_dev;
vp_dev->pci_dev = pci_dev;
INIT_LIST_HEAD(&vp_dev->virtqueues);
spin_lock_init(&vp_dev->lock);
/* enable the device */
rc = pci_enable_device(pci_dev);
if (rc)
goto err_enable_device;
if (force_legacy) {
rc = virtio_pci_legacy_probe(vp_dev);
/* Also try modern mode if we can't map BAR0 (no IO space). */
if (rc == -ENODEV || rc == -ENOMEM)
rc = virtio_pci_modern_probe(vp_dev);
if (rc)
goto err_probe;
} else {
rc = virtio_pci_modern_probe(vp_dev);
if (rc == -ENODEV)
rc = virtio_pci_legacy_probe(vp_dev);
if (rc)
goto err_probe;
}
pci_set_master(pci_dev);
vp_dev->is_legacy = vp_dev->ldev.ioaddr ? true : false;
rc = register_virtio_device(&vp_dev->vdev);
reg_dev = vp_dev;
if (rc)
goto err_register;
return 0;
err_register:
if (vp_dev->is_legacy)
virtio_pci_legacy_remove(vp_dev);
else
virtio_pci_modern_remove(vp_dev);
err_probe:
pci_disable_device(pci_dev);
err_enable_device:
if (reg_dev)
put_device(&vp_dev->vdev.dev);
else
kfree(vp_dev);
return rc;
}
(1)virtio_pci_probe函数分配一个virtio_pci_device结构体并赋值给vp_dev,用来表示一个virtio PCI代理设备。代码片段如下:
/* allocate our structure and fill it out */
vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
if (!vp_dev)
return -ENOMEM;
struct virtio_pci_device的定义在Linux内核源码/drivers/virtio/virtio_pci_common.h中,代码如下:
/* Our device structure */
struct virtio_pci_device {
struct virtio_device vdev;
struct pci_dev *pci_dev;
struct virtio_pci_legacy_device ldev;
struct virtio_pci_modern_device mdev;
bool is_legacy;
/* Where to read and clear interrupt */
u8 __iomem *isr;
/* a list of queues so we can dispatch IRQs */
spinlock_t lock;
struct list_head virtqueues;
/* array of all queues for house-keeping */
struct virtio_pci_vq_info **vqs;
/* MSI-X support */
int msix_enabled;
int intx_enabled;
cpumask_var_t *msix_affinity_masks;
/* Name strings for interrupts. This size should be enough,
* and I'm too lazy to allocate each name separately. */
char (*msix_names)[256];
/* Number of available vectors */
unsigned int msix_vectors;
/* Vectors allocated, excluding per-vq vectors if any */
unsigned int msix_used_vectors;
/* Whether we have vector per vq */
bool per_vq_vectors;
struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
struct virtio_pci_vq_info *info,
unsigned int idx,
void (*callback)(struct virtqueue *vq),
const char *name,
bool ctx,
u16 msix_vec);
void (*del_vq)(struct virtio_pci_vq_info *info);
u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
};
(2)将vp_dev设置为该pci_dev的私有结构。代码片段如下:
pci_set_drvdata(pci_dev, vp_dev);
pci_set_drvdata函数在Linux内核源码/include/linux/pci.h中,代码如下:
static inline void pci_set_drvdata(struct pci_dev *pdev, void *data)
{
dev_set_drvdata(&pdev->dev, data);
}
dev_set_drvdata函数在Linux内核源码/include\linux\device.h中,代码如下:
static inline void dev_set_drvdata(struct device *dev, void *data)
{
dev->driver_data = data;
}
struct pci_dev的定义在Linux内核源码/include/linux/pci.h中,如下:
/* The pci_dev structure describes PCI devices */
struct pci_dev {
struct list_head bus_list; /* Node in per-bus list */
struct pci_bus *bus; /* Bus this device is on */
struct pci_bus *subordinate; /* Bus this device bridges to */
void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procent; /* Device entry in /proc/bus/pci */
struct pci_slot *slot; /* Physical slot this device is in */
unsigned int devfn; /* Encoded device & function index */
unsigned short vendor;
unsigned short device;
unsigned short subsystem_vendor;
unsigned short subsystem_device;
unsigned int class; /* 3 bytes: (base,sub,prog-if) */
u8 revision; /* PCI revision, low byte of class word */
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
#ifdef CONFIG_PCIEAER
u16 aer_cap; /* AER capability offset */
struct aer_stats *aer_stats; /* AER stats for this device */
#endif
#ifdef CONFIG_PCIEPORTBUS
struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
struct pci_dev *rcec; /* Associated RCEC device */
#endif
u32 devcap; /* PCIe Device Capabilities */
u8 pcie_cap; /* PCIe capability offset */
u8 msi_cap; /* MSI capability offset */
u8 msix_cap; /* MSI-X capability offset */
u8 pcie_mpss:3; /* PCIe Max Payload Size Supported */
u8 rom_base_reg; /* Config register controlling ROM */
u8 pin; /* Interrupt pin this device uses */
u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */
unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */
struct pci_driver *driver; /* Driver bound to this device */
u64 dma_mask; /* Mask of the bits of bus address this
device implements. Normally this is
0xffffffff. You only need to change
this if your device has broken DMA
or supports 64-bit transfers. */
struct device_dma_parameters dma_parms;
pci_power_t current_state; /* Current operating state. In ACPI,
this is D0-D3, D0 being fully
functional, and D3 being off. */
unsigned int imm_ready:1; /* Supports Immediate Readiness */
u8 pm_cap; /* PM capability offset */
unsigned int pme_support:5; /* Bitmask of states from which PME#
can be generated */
unsigned int pme_poll:1; /* Poll device's PME status bit */
unsigned int d1_support:1; /* Low power state D1 is supported */
unsigned int d2_support:1; /* Low power state D2 is supported */
unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
unsigned int no_d3cold:1; /* D3cold is forbidden */
unsigned int bridge_d3:1; /* Allow D3 for bridge */
unsigned int d3cold_allowed:1; /* D3cold is allowed by user */
unsigned int mmio_always_on:1; /* Disallow turning off io/mem
decoding during BAR sizing */
unsigned int wakeup_prepared:1;
unsigned int skip_bus_pm:1; /* Internal: Skip bus-level PM */
unsigned int ignore_hotplug:1; /* Ignore hotplug events */
unsigned int hotplug_user_indicators:1; /* SlotCtl indicators
controlled exclusively by
user sysfs */
unsigned int clear_retrain_link:1; /* Need to clear Retrain Link
bit manually */
unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */
unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */
#ifdef CONFIG_PCIEASPM
struct pcie_link_state *link_state; /* ASPM link state */
unsigned int ltr_path:1; /* Latency Tolerance Reporting
supported from root to here */
u16 l1ss; /* L1SS Capability pointer */
#endif
unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */
unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */
pci_channel_state_t error_state; /* Current connectivity state */
struct device dev; /* Generic device interface */
int cfg_size; /* Size of config space */
/*
* Instead of touching interrupt line and base address registers
* directly, use the values stored here. They might be different!
*/
unsigned int irq;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
bool match_driver; /* Skip attaching driver */
unsigned int transparent:1; /* Subtractive decode bridge */
unsigned int io_window:1; /* Bridge has I/O window */
unsigned int pref_window:1; /* Bridge has pref mem window */
unsigned int pref_64_window:1; /* Pref mem window is 64-bit */
unsigned int multifunction:1; /* Multi-function device */
unsigned int is_busmaster:1; /* Is busmaster */
unsigned int no_msi:1; /* May not use MSI */
unsigned int no_64bit_msi:1; /* May only use 32-bit MSIs */
unsigned int block_cfg_access:1; /* Config space access blocked */
unsigned int broken_parity_status:1; /* Generates false positive parity */
unsigned int irq_reroute_variant:2; /* Needs IRQ rerouting variant */
unsigned int msi_enabled:1;
unsigned int msix_enabled:1;
unsigned int ari_enabled:1; /* ARI forwarding */
unsigned int ats_enabled:1; /* Address Translation Svc */
unsigned int pasid_enabled:1; /* Process Address Space ID */
unsigned int pri_enabled:1; /* Page Request Interface */
unsigned int is_managed:1; /* Managed via devres */
unsigned int is_msi_managed:1; /* MSI release via devres installed */
unsigned int needs_freset:1; /* Requires fundamental reset */
unsigned int state_saved:1;
unsigned int is_physfn:1;
unsigned int is_virtfn:1;
unsigned int is_hotplug_bridge:1;
unsigned int shpc_managed:1; /* SHPC owned by shpchp */
unsigned int is_thunderbolt:1; /* Thunderbolt controller */
/*
* Devices marked being untrusted are the ones that can potentially
* execute DMA attacks and similar. They are typically connected
* through external ports such as Thunderbolt but not limited to
* that. When an IOMMU is enabled they should be getting full
* mappings to make sure they cannot access arbitrary memory.
*/
unsigned int untrusted:1;
/*
* Info from the platform, e.g., ACPI or device tree, may mark a
* device as "external-facing". An external-facing device is
* itself internal but devices downstream from it are external.
*/
unsigned int external_facing:1;
unsigned int broken_intx_masking:1; /* INTx masking can't be used */
unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */
unsigned int irq_managed:1;
unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */
unsigned int is_probed:1; /* Device probing in progress */
unsigned int link_active_reporting:1;/* Device capable of reporting link active */
unsigned int no_vf_scan:1; /* Don't scan for VFs after IOV enablement */
unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */
unsigned int rom_bar_overlap:1; /* ROM BAR disable broken */
pci_dev_flags_t dev_flags;
atomic_t enable_cnt; /* pci_enable_device has been called */
u32 saved_config_space[16]; /* Config space saved at suspend time */
struct hlist_head saved_cap_space;
int rom_attr_enabled; /* Display of ROM attribute enabled? */
struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
#ifdef CONFIG_HOTPLUG_PCI_PCIE
unsigned int broken_cmd_compl:1; /* No compl for some cmds */
#endif
#ifdef CONFIG_PCIE_PTM
u16 ptm_cap; /* PTM Capability */
unsigned int ptm_root:1;
unsigned int ptm_enabled:1;
u8 ptm_granularity;
#endif
#ifdef CONFIG_PCI_MSI
void __iomem *msix_base;
raw_spinlock_t msi_lock;
#endif
struct pci_vpd vpd;
#ifdef CONFIG_PCIE_DPC
u16 dpc_cap;
unsigned int dpc_rp_extensions:1;
u8 dpc_rp_log_size;
#endif
#ifdef CONFIG_PCI_ATS
union {
struct pci_sriov *sriov; /* PF: SR-IOV info */
struct pci_dev *physfn; /* VF: related PF */
};
u16 ats_cap; /* ATS Capability offset */
u8 ats_stu; /* ATS Smallest Translation Unit */
#endif
#ifdef CONFIG_PCI_PRI
u16 pri_cap; /* PRI Capability offset */
u32 pri_reqs_alloc; /* Number of PRI requests allocated */
unsigned int pasid_required:1; /* PRG Response PASID Required */
#endif
#ifdef CONFIG_PCI_PASID
u16 pasid_cap; /* PASID Capability offset */
u16 pasid_features;
#endif
#ifdef CONFIG_PCI_P2PDMA
struct pci_p2pdma __rcu *p2pdma;
#endif
u16 acs_cap; /* ACS Capability offset */
phys_addr_t rom; /* Physical address if not from BAR */
size_t romlen; /* Length if not from BAR */
/*
* Driver name to force a match. Do not set directly, because core
* frees it. Use driver_set_override() to set or clear it.
*/
const char *driver_override;
unsigned long priv_flags; /* Private flags for the PCI driver */
/* These methods index pci_reset_fn_methods[] */
u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
};
(3)初始化vp_dev即virtio_pci_device结构中virtio_device类型的vdev成员相关结构。代码片段如下:
vp_dev->vdev.dev.parent = &pci_dev->dev;
vp_dev->vdev.dev.release = virtio_pci_release_dev;
vp_dev->pci_dev = pci_dev;
INIT_LIST_HEAD(&vp_dev->virtqueues);
spin_lock_init(&vp_dev->lock);
virtio_pci_probe函数其余部分代码的解析,请看下回。