feat: pci passthrough (#114)

* feat: pci passthrough

* feat: guest device management

* feat: addons mounting and kernel modules support

* feat: more pci work

* fix: kernel build squashfs fixes

* fix: e820entry should be available on all platforms
This commit is contained in:
Alex Zenla
2024-04-29 10:02:20 -07:00
committed by GitHub
parent bece7f33c7
commit 84920a88ab
33 changed files with 2294 additions and 1944 deletions

View File

@ -3,23 +3,29 @@ pub mod sys;
use crate::error::{Error, Result};
use crate::sys::{
AddressSize, CreateDomain, DomCtl, DomCtlValue, DomCtlVcpuContext, EvtChnAllocUnbound,
GetDomainInfo, GetPageFrameInfo3, Hypercall, HypercallInit, MaxMem, MaxVcpus, MemoryMap,
MemoryReservation, MmapBatch, MmapResource, MmuExtOp, MultiCallEntry, VcpuGuestContext,
VcpuGuestContextAny, XenCapabilitiesInfo, HYPERVISOR_DOMCTL, HYPERVISOR_EVENT_CHANNEL_OP,
HYPERVISOR_MEMORY_OP, HYPERVISOR_MMUEXT_OP, HYPERVISOR_MULTICALL, HYPERVISOR_XEN_VERSION,
XENVER_CAPABILITIES, XEN_DOMCTL_CREATEDOMAIN, XEN_DOMCTL_DESTROYDOMAIN,
XEN_DOMCTL_GETDOMAININFO, XEN_DOMCTL_GETPAGEFRAMEINFO3, XEN_DOMCTL_GETVCPUCONTEXT,
XEN_DOMCTL_HYPERCALL_INIT, XEN_DOMCTL_MAX_MEM, XEN_DOMCTL_MAX_VCPUS, XEN_DOMCTL_PAUSEDOMAIN,
XEN_DOMCTL_SETVCPUCONTEXT, XEN_DOMCTL_SET_ADDRESS_SIZE, XEN_DOMCTL_UNPAUSEDOMAIN,
XEN_MEM_CLAIM_PAGES, XEN_MEM_MEMORY_MAP, XEN_MEM_POPULATE_PHYSMAP,
AddressSize, AssignDevice, CreateDomain, DomCtl, DomCtlValue, DomCtlVcpuContext,
EvtChnAllocUnbound, GetDomainInfo, GetPageFrameInfo3, Hypercall, HypercallInit,
IoMemPermission, IoPortPermission, IrqPermission, MaxMem, MaxVcpus, MemoryMap,
MemoryReservation, MmapBatch, MmapResource, MmuExtOp, MultiCallEntry, PciAssignDevice,
VcpuGuestContext, VcpuGuestContextAny, XenCapabilitiesInfo, DOMCTL_DEV_PCI, HYPERVISOR_DOMCTL,
HYPERVISOR_EVENT_CHANNEL_OP, HYPERVISOR_MEMORY_OP, HYPERVISOR_MMUEXT_OP, HYPERVISOR_MULTICALL,
HYPERVISOR_XEN_VERSION, XENVER_CAPABILITIES, XEN_DOMCTL_ASSIGN_DEVICE, XEN_DOMCTL_CREATEDOMAIN,
XEN_DOMCTL_DESTROYDOMAIN, XEN_DOMCTL_GETDOMAININFO, XEN_DOMCTL_GETPAGEFRAMEINFO3,
XEN_DOMCTL_GETVCPUCONTEXT, XEN_DOMCTL_HYPERCALL_INIT, XEN_DOMCTL_IOMEM_PERMISSION,
XEN_DOMCTL_IOPORT_PERMISSION, XEN_DOMCTL_IRQ_PERMISSION, XEN_DOMCTL_MAX_MEM,
XEN_DOMCTL_MAX_VCPUS, XEN_DOMCTL_PAUSEDOMAIN, XEN_DOMCTL_SETVCPUCONTEXT,
XEN_DOMCTL_SET_ADDRESS_SIZE, XEN_DOMCTL_UNPAUSEDOMAIN, XEN_MEM_CLAIM_PAGES, XEN_MEM_MEMORY_MAP,
XEN_MEM_POPULATE_PHYSMAP,
};
use libc::{c_int, mmap, usleep, MAP_FAILED, MAP_SHARED, PROT_READ, PROT_WRITE};
use log::trace;
use nix::errno::Errno;
use std::ffi::{c_long, c_uint, c_ulong, c_void};
use std::sync::Arc;
use sys::{XEN_DOMCTL_MAX_INTERFACE_VERSION, XEN_DOMCTL_MIN_INTERFACE_VERSION};
use sys::{
E820Entry, ForeignMemoryMap, PhysdevMapPirq, HYPERVISOR_PHYSDEV_OP, PHYSDEVOP_MAP_PIRQ,
XEN_DOMCTL_MAX_INTERFACE_VERSION, XEN_DOMCTL_MIN_INTERFACE_VERSION, XEN_MEM_SET_MEMORY_MAP,
};
use tokio::sync::Semaphore;
use std::fs::{File, OpenOptions};
@ -569,26 +575,42 @@ impl XenCall {
Ok(())
}
pub async fn get_memory_map(&self, size_of_entry: usize) -> Result<Vec<u8>> {
pub async fn get_memory_map(&self, max_entries: u32) -> Result<Vec<E820Entry>> {
let mut memory_map = MemoryMap {
count: 0,
count: max_entries,
buffer: 0,
};
let mut entries = vec![E820Entry::default(); max_entries as usize];
memory_map.buffer = entries.as_mut_ptr() as c_ulong;
self.hypercall2(
HYPERVISOR_MEMORY_OP,
XEN_MEM_MEMORY_MAP as c_ulong,
addr_of_mut!(memory_map) as c_ulong,
)
.await?;
entries.truncate(memory_map.count as usize);
Ok(entries)
}
pub async fn set_memory_map(
&self,
domid: u32,
entries: Vec<E820Entry>,
) -> Result<Vec<E820Entry>> {
let mut memory_map = ForeignMemoryMap {
domid: domid as u16,
map: MemoryMap {
count: entries.len() as u32,
buffer: entries.as_ptr() as u64,
},
};
self.hypercall2(
HYPERVISOR_MEMORY_OP,
XEN_MEM_MEMORY_MAP as c_ulong,
XEN_MEM_SET_MEMORY_MAP as c_ulong,
addr_of_mut!(memory_map) as c_ulong,
)
.await?;
let mut buffer = vec![0u8; memory_map.count as usize * size_of_entry];
memory_map.buffer = buffer.as_mut_ptr() as c_ulong;
self.hypercall2(
HYPERVISOR_MEMORY_OP,
XEN_MEM_MEMORY_MAP as c_ulong,
addr_of_mut!(memory_map) as c_ulong,
)
.await?;
Ok(buffer)
Ok(entries)
}
pub async fn populate_physmap(
@ -671,4 +693,140 @@ impl XenCall {
.await
.map(|_| ())
}
/// Issues the `XEN_DOMCTL_IOMEM_PERMISSION` domctl for `domid`.
///
/// `first_mfn` and `nr_mfns` describe the frame range placed in the request;
/// `allow` is encoded as `1` (true) or `0` (false).
///
/// # Errors
/// Propagates any error returned by the underlying hypercall.
pub async fn iomem_permission(
    &self,
    domid: u32,
    first_mfn: u64,
    nr_mfns: u64,
    allow: bool,
) -> Result<()> {
    trace!(
        "domctl fd={} iomem_permission domid={} first_mfn={:#x}, nr_mfns={:#x} allow={}",
        self.handle.as_raw_fd(),
        domid,
        first_mfn,
        nr_mfns,
        allow,
    );

    let permission = IoMemPermission {
        first_mfn,
        nr_mfns,
        allow: u8::from(allow),
    };
    let mut request = DomCtl {
        cmd: XEN_DOMCTL_IOMEM_PERMISSION,
        interface_version: self.domctl_interface_version,
        domid,
        value: DomCtlValue {
            iomem_permission: permission,
        },
    };
    self.hypercall1(HYPERVISOR_DOMCTL, addr_of_mut!(request) as c_ulong)
        .await?;
    Ok(())
}
/// Issues the `XEN_DOMCTL_IOPORT_PERMISSION` domctl for `domid`.
///
/// `first_port` and `nr_ports` describe the port range placed in the request;
/// `allow` is encoded as `1` (true) or `0` (false).
///
/// # Errors
/// Propagates any error returned by the underlying hypercall.
pub async fn ioport_permission(
    &self,
    domid: u32,
    first_port: u32,
    nr_ports: u32,
    allow: bool,
) -> Result<()> {
    trace!(
        "domctl fd={} ioport_permission domid={} first_port={:#x}, nr_ports={:#x} allow={}",
        self.handle.as_raw_fd(),
        domid,
        first_port,
        nr_ports,
        allow,
    );

    let permission = IoPortPermission {
        first_port,
        nr_ports,
        allow: u8::from(allow),
    };
    let mut request = DomCtl {
        cmd: XEN_DOMCTL_IOPORT_PERMISSION,
        interface_version: self.domctl_interface_version,
        domid,
        value: DomCtlValue {
            ioport_permission: permission,
        },
    };
    self.hypercall1(HYPERVISOR_DOMCTL, addr_of_mut!(request) as c_ulong)
        .await?;
    Ok(())
}
/// Issues the `XEN_DOMCTL_IRQ_PERMISSION` domctl for `domid`.
///
/// `irq` is stored in the request's `pirq` field and `allow` is encoded as
/// `1` (true) or `0` (false); the padding bytes are zeroed.
///
/// # Errors
/// Propagates any error returned by the underlying hypercall.
pub async fn irq_permission(&self, domid: u32, irq: u32, allow: bool) -> Result<()> {
    trace!(
        "domctl fd={} irq_permission domid={} irq={} allow={}",
        self.handle.as_raw_fd(),
        domid,
        irq,
        allow,
    );

    let permission = IrqPermission {
        pirq: irq,
        allow: u8::from(allow),
        pad: [0; 3],
    };
    let mut request = DomCtl {
        cmd: XEN_DOMCTL_IRQ_PERMISSION,
        interface_version: self.domctl_interface_version,
        domid,
        value: DomCtlValue {
            irq_permission: permission,
        },
    };
    self.hypercall1(HYPERVISOR_DOMCTL, addr_of_mut!(request) as c_ulong)
        .await?;
    Ok(())
}
pub async fn map_pirq(&self, domid: u32, index: isize, pirq: Option<u32>) -> Result<u32> {
trace!(
"physdev fd={} map_pirq domid={} index={} pirq={:?}",
self.handle.as_raw_fd(),
domid,
index,
pirq,
);
let mut physdev = PhysdevMapPirq::default();
physdev.domid = domid as u16;
physdev.typ = 0x1;
physdev.index = index as c_int;
physdev.pirq = pirq.map(|x| x as c_int).unwrap_or(index as c_int);
self.hypercall2(
HYPERVISOR_PHYSDEV_OP,
PHYSDEVOP_MAP_PIRQ,
addr_of_mut!(physdev) as c_ulong,
)
.await?;
Ok(physdev.pirq as u32)
}
/// Issues the `XEN_DOMCTL_ASSIGN_DEVICE` domctl for a PCI device.
///
/// The request is always tagged `DOMCTL_DEV_PCI`; `sbdf` identifies the
/// device and `flags` is passed through unchanged.
///
/// # Errors
/// Propagates any error returned by the underlying hypercall.
pub async fn assign_device(&self, domid: u32, sbdf: u32, flags: u32) -> Result<()> {
    trace!(
        "domctl fd={} assign_device domid={} sbdf={} flags={}",
        self.handle.as_raw_fd(),
        domid,
        sbdf,
        flags,
    );

    let assignment = AssignDevice {
        device: DOMCTL_DEV_PCI,
        flags,
        pci_assign_device: PciAssignDevice { sbdf, padding: 0 },
    };
    let mut request = DomCtl {
        cmd: XEN_DOMCTL_ASSIGN_DEVICE,
        interface_version: self.domctl_interface_version,
        domid,
        value: DomCtlValue {
            assign_device: assignment,
        },
    };
    self.hypercall1(HYPERVISOR_DOMCTL, addr_of_mut!(request) as c_ulong)
        .await?;
    Ok(())
}
}

View File

@ -104,6 +104,7 @@ pub const XEN_DOMCTL_CDF_HAP: u32 = 1u32 << 1;
// Single-bit flag values; the CDF_* constants are assigned to `CreateDomain::flags`
// by the domain creation code.
pub const XEN_DOMCTL_CDF_S3_INTEGRITY: u32 = 1u32 << 2;
pub const XEN_DOMCTL_CDF_OOS_OFF: u32 = 1u32 << 3;
pub const XEN_DOMCTL_CDF_XS_DOMAIN: u32 = 1u32 << 4;
/// Bit 5 of the create-domain flags.
/// NOTE(review): presumably requests IOMMU support for the new domain - confirm against Xen's domctl.h.
pub const XEN_DOMCTL_CDF_IOMMU: u32 = 1u32 << 5;
pub const XEN_X86_EMU_LAPIC: u32 = 1 << 0;
pub const XEN_X86_EMU_HPET: u32 = 1 << 1;
@ -237,6 +238,10 @@ pub union DomCtlValue {
pub vcpu_context: DomCtlVcpuContext,
pub address_size: AddressSize,
pub get_page_frame_info: GetPageFrameInfo3,
pub ioport_permission: IoPortPermission,
pub iomem_permission: IoMemPermission,
pub irq_permission: IrqPermission,
pub assign_device: AssignDevice,
pub pad: [u8; 128],
}
@ -309,6 +314,30 @@ pub struct GetPageFrameInfo3 {
pub array: c_ulong,
}
/// Payload of the `ioport_permission` member of `DomCtlValue`.
///
/// `allow` is a byte flag: callers in this crate write `1` to allow and `0` to deny.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct IoPortPermission {
pub first_port: u32,
pub nr_ports: u32,
pub allow: u8,
}
/// Payload of the `iomem_permission` member of `DomCtlValue`.
///
/// `allow` is a byte flag: callers in this crate write `1` to allow and `0` to deny.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct IoMemPermission {
pub first_mfn: u64,
pub nr_mfns: u64,
pub allow: u8,
}
/// Payload of the `irq_permission` member of `DomCtlValue`.
///
/// `allow` is a byte flag (`1` allow, `0` deny as written by this crate);
/// `pad` is explicit padding that callers zero.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct IrqPermission {
pub pirq: u32,
pub allow: u8,
pub pad: [u8; 3],
}
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[cfg(target_arch = "x86_64")]
@ -378,7 +407,8 @@ pub struct MultiCallEntry {
}
pub const XEN_MEM_POPULATE_PHYSMAP: u32 = 6;
pub const XEN_MEM_MEMORY_MAP: u32 = 9;
pub const XEN_MEM_MEMORY_MAP: u32 = 10;
pub const XEN_MEM_SET_MEMORY_MAP: u32 = 13;
pub const XEN_MEM_CLAIM_PAGES: u32 = 24;
#[repr(C)]
@ -388,6 +418,13 @@ pub struct MemoryMap {
pub buffer: c_ulong,
}
/// A `MemoryMap` request paired with the id of the domain it targets.
///
/// Used as the argument of the `XEN_MEM_SET_MEMORY_MAP` memory operation.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ForeignMemoryMap {
pub domid: u16,
pub map: MemoryMap,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct VcpuGuestContextFpuCtx {
@ -582,3 +619,60 @@ pub struct EvtChnAllocUnbound {
pub remote_dom: u16,
pub port: u32,
}
/// One entry of an E820-style memory map: a region start, its size and a type code.
///
/// The layout is packed, so fields must be read by value; taking a reference
/// to a field is not allowed.
#[repr(C, packed)]
#[derive(Debug, Copy, Clone, Default)]
pub struct E820Entry {
pub addr: u64,
pub size: u64,
pub typ: u32,
}
/// Upper bound on the number of entries requested for / built into a memory map (x86_64 only).
#[cfg(target_arch = "x86_64")]
pub const E820_MAX: u32 = 1024;
// Values stored in `E820Entry::typ` (x86_64 only).
#[cfg(target_arch = "x86_64")]
pub const E820_RAM: u32 = 1;
#[cfg(target_arch = "x86_64")]
pub const E820_RESERVED: u32 = 2;
#[cfg(target_arch = "x86_64")]
pub const E820_ACPI: u32 = 3;
#[cfg(target_arch = "x86_64")]
pub const E820_NVS: u32 = 4;
#[cfg(target_arch = "x86_64")]
pub const E820_UNUSABLE: u32 = 5;
/// Sub-operation number passed with `HYPERVISOR_PHYSDEV_OP` to map a pirq.
pub const PHYSDEVOP_MAP_PIRQ: u64 = 13;
/// Argument structure for `PHYSDEVOP_MAP_PIRQ`.
///
/// The caller fills `domid`, `typ`, `index` and `pirq`; `pirq` is read back
/// after the hypercall as the result.
/// NOTE(review): field widths (notably `entry_nr`) should be checked against
/// Xen's `struct physdev_map_pirq` - confirm.
#[repr(C)]
#[derive(Default, Clone, Copy, Debug)]
pub struct PhysdevMapPirq {
pub domid: u16,
pub typ: c_int,
pub index: c_int,
pub pirq: c_int,
pub bus: c_int,
pub devfn: c_int,
pub entry_nr: u16,
pub table_base: u64,
}
/// Flag value for `AssignDevice::flags`; passed when a device's RDM reserve policy is relaxed.
pub const DOMCTL_DEV_RDM_RELAXED: u32 = 1;
// Values for `AssignDevice::device`, selecting the kind of device being assigned.
pub const DOMCTL_DEV_PCI: u32 = 0;
// NOTE(review): presumably "device tree" - confirm against Xen's domctl.h.
pub const DOMCTL_DEV_DT: u32 = 1;
/// PCI-specific part of an `AssignDevice` request.
///
/// `sbdf` is the encoded device address; `padding` is written as zero by callers.
/// NOTE(review): `padding` presumably sizes this struct to match the union in
/// Xen's `xen_domctl_assign_device` - confirm.
#[repr(C)]
#[derive(Default, Clone, Copy, Debug)]
pub struct PciAssignDevice {
pub sbdf: u32,
pub padding: u64,
}
/// Payload of the `assign_device` member of `DomCtlValue`.
///
/// `device` selects the device kind (e.g. `DOMCTL_DEV_PCI`), `flags` carries
/// assignment flags such as `DOMCTL_DEV_RDM_RELAXED`.
#[repr(C)]
#[derive(Default, Clone, Copy, Debug)]
pub struct AssignDevice {
pub device: u32,
pub flags: u32,
pub pci_assign_device: PciAssignDevice,
}
// NOTE(review): presumably Xen's reserved "I/O" pseudo-domain id - confirm against xen.h.
pub const DOMID_IO: u32 = 0x7FF1;

View File

@ -18,6 +18,7 @@ krata-xencall = { path = "../xencall", version = "^0.0.10" }
krata-xenstore = { path = "../xenstore", version = "^0.0.10" }
memchr = { workspace = true }
nix = { workspace = true }
regex = { workspace = true }
slice-copy = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
@ -34,3 +35,7 @@ name = "xenclient"
[[example]]
name = "xenclient-boot"
path = "examples/boot.rs"
[[example]]
name = "xenclient-pci"
path = "examples/pci.rs"

View File

@ -27,6 +27,7 @@ async fn main() -> Result<()> {
disks: vec![],
channels: vec![],
vifs: vec![],
pcis: vec![],
filesystems: vec![],
extra_keys: vec![],
extra_rw_paths: vec![],

View File

@ -0,0 +1,32 @@
use xenclient::pci::*;
use xenclient::error::Result;
#[tokio::main]
async fn main() -> Result<()> {
let backend = XenPciBackend::new();
if !backend.is_loaded().await? {
return Err(xenclient::error::Error::GenericError(
"xen-pciback module not loaded".to_string(),
));
}
println!("assignable devices:");
for device in backend.list_devices().await? {
let is_assigned = backend.is_assigned(&device).await?;
let has_slot = backend.has_slot(&device).await?;
println!("{} slot={} assigned={}", device, has_slot, is_assigned);
let resources = backend.read_resources(&device).await?;
for resource in resources {
println!(
" resource start={:#x} end={:#x} flags={:#x} bar-io={}",
resource.start,
resource.end,
resource.flags,
resource.is_bar_io()
);
}
}
Ok(())
}

View File

@ -1,5 +1,7 @@
use std::io;
use crate::pci::PciBdf;
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("io issue encountered: {0}")]
@ -34,6 +36,16 @@ pub enum Error {
ElfInvalidImage,
#[error("provided elf image does not contain xen support")]
ElfXenSupportMissing,
#[error("regex error: {0}")]
RegexError(#[from] regex::Error),
#[error("error: {0}")]
GenericError(String),
#[error("failed to parse int: {0}")]
ParseIntError(#[from] std::num::ParseIntError),
#[error("invalid pci bdf string")]
InvalidPciBdfString,
#[error("pci device {0} is not assignable")]
PciDeviceNotAssignable(PciBdf),
}
pub type Result<T> = std::result::Result<T, Error>;

View File

@ -21,18 +21,26 @@ use crate::elfloader::ElfImageLoader;
use crate::error::{Error, Result};
use boot::BootState;
use log::{debug, trace, warn};
use pci::{PciBdf, XenPciBackend};
use sys::XEN_PAGE_SHIFT;
use tokio::time::timeout;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use uuid::Uuid;
use xencall::sys::{CreateDomain, XEN_DOMCTL_CDF_HAP, XEN_DOMCTL_CDF_HVM_GUEST};
use xencall::sys::{
CreateDomain, DOMCTL_DEV_RDM_RELAXED, XEN_DOMCTL_CDF_HAP, XEN_DOMCTL_CDF_HVM_GUEST,
XEN_DOMCTL_CDF_IOMMU,
};
use xencall::XenCall;
use xenstore::{
XsPermission, XsdClient, XsdInterface, XS_PERM_NONE, XS_PERM_READ, XS_PERM_READ_WRITE,
};
pub mod pci;
#[derive(Clone)]
pub struct XenClient {
pub store: XsdClient,
@ -78,6 +86,33 @@ pub struct DomainEventChannel {
pub name: String,
}
/// RDM reserve policy for a passed-through PCI device. Defaults to `Strict`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub enum DomainPciRdmReservePolicy {
    Invalid,
    #[default]
    Strict,
    Relaxed,
}

impl DomainPciRdmReservePolicy {
    /// Returns the numeric string used for this policy in the device options
    /// (`"-1"` invalid, `"0"` strict, `"1"` relaxed).
    pub fn to_option_str(&self) -> &str {
        match self {
            Self::Strict => "0",
            Self::Relaxed => "1",
            Self::Invalid => "-1",
        }
    }
}
/// A PCI device to pass through to a domain.
///
/// `bdf` identifies the device. The three booleans are written as `1`/`0`
/// into the device's option string (`permissive`, `msitranslate`,
/// `power_mgmt`), and `rdm_reserve_policy` is written as `rdm_policy` and
/// also selects the flags used when assigning the device.
#[derive(Clone, Debug)]
pub struct DomainPciDevice {
pub bdf: PciBdf,
pub permissive: bool,
pub msi_translate: bool,
pub power_management: bool,
pub rdm_reserve_policy: DomainPciRdmReservePolicy,
}
#[derive(Clone, Debug)]
pub struct DomainConfig {
pub backend_domid: u32,
@ -93,6 +128,7 @@ pub struct DomainConfig {
pub vifs: Vec<DomainNetworkInterface>,
pub filesystems: Vec<DomainFilesystem>,
pub event_channels: Vec<DomainEventChannel>,
pub pcis: Vec<DomainPciDevice>,
pub extra_keys: Vec<(String, String)>,
pub extra_rw_paths: Vec<String>,
}
@ -118,12 +154,14 @@ impl XenClient {
pub async fn create(&self, config: &DomainConfig) -> Result<CreatedDomain> {
let mut domain = CreateDomain {
max_vcpus: config.max_vcpus,
..Default::default()
};
domain.max_vcpus = config.max_vcpus;
if cfg!(target_arch = "aarch64") {
domain.flags = XEN_DOMCTL_CDF_HVM_GUEST | XEN_DOMCTL_CDF_HAP;
} else {
domain.flags = XEN_DOMCTL_CDF_IOMMU;
}
let domid = self.call.create_domain(domain).await?;
@ -411,6 +449,19 @@ impl XenClient {
.await?;
}
for (index, pci) in config.pcis.iter().enumerate() {
self.pci_device_add(
&dom_path,
&backend_dom_path,
config.backend_domid,
domid,
index,
config.pcis.len(),
pci,
)
.await?;
}
for channel in &config.event_channels {
let id = self
.call
@ -645,6 +696,129 @@ impl XenClient {
Ok(())
}
/// Grants a domain access to one PCI device and records it in the store.
///
/// Steps, in order:
/// 1. Fail with `Error::PciDeviceNotAssignable` unless the device is bound to
///    the PCI backend driver.
/// 2. For every resource of the device, grant I/O port permission (I/O BARs)
///    or I/O memory permission (everything else, in whole pages).
/// 3. Assign the device to the domain, passing `DOMCTL_DEV_RDM_RELAXED` when
///    the device's RDM policy is relaxed.
/// 4. For the first device (`index == 0`) create the shared `pci` device
///    entry, advertising `device_count` devices.
/// 5. Write the per-device `key-N`, `dev-N`, optional `vdefn-N` and `opts-N`
///    keys under the backend path in a single transaction.
///
/// # Errors
/// Propagates errors from the backend sysfs reads, the hypercalls and the
/// store transaction.
// Every argument is needed to locate both ends of the device in the store.
#[allow(clippy::too_many_arguments)]
async fn pci_device_add(
    &self,
    dom_path: &str,
    backend_dom_path: &str,
    backend_domid: u32,
    domid: u32,
    index: usize,
    device_count: usize,
    device: &DomainPciDevice,
) -> Result<()> {
    let backend = XenPciBackend::new();
    if !backend.is_assigned(&device.bdf).await? {
        return Err(Error::PciDeviceNotAssignable(device.bdf));
    }

    // Rounding up to whole pages needs (page size - 1), not (page shift - 1);
    // the latter silently drops a partially covered trailing page.
    let page_round = (1u64 << XEN_PAGE_SHIFT) - 1;

    let resources = backend.read_resources(&device.bdf).await?;
    for resource in resources {
        if resource.is_bar_io() {
            self.call
                .ioport_permission(domid, resource.start as u32, resource.size() as u32, true)
                .await?;
        } else {
            self.call
                .iomem_permission(
                    domid,
                    resource.start >> XEN_PAGE_SHIFT,
                    (resource.size() + page_round) >> XEN_PAGE_SHIFT,
                    true,
                )
                .await?;
        }
    }

    // Resetting the device through the backend is currently disabled:
    // backend.reset(&device.bdf).await?;

    let assign_flags = if device.rdm_reserve_policy == DomainPciRdmReservePolicy::Relaxed {
        DOMCTL_DEV_RDM_RELAXED
    } else {
        0
    };
    self.call
        .assign_device(domid, device.bdf.encode(), assign_flags)
        .await?;

    // All PCI devices of a domain share this one device id in the store.
    let id = 60;

    if index == 0 {
        let backend_items: Vec<(&str, String)> = vec![
            ("frontend-id", domid.to_string()),
            ("online", "1".to_string()),
            ("state", "1".to_string()),
            ("num_devs", device_count.to_string()),
        ];

        let frontend_items: Vec<(&str, String)> = vec![
            ("backend-id", backend_domid.to_string()),
            ("state", "1".to_string()),
        ];

        self.device_add(
            "pci",
            id,
            dom_path,
            backend_dom_path,
            backend_domid,
            domid,
            frontend_items,
            backend_items,
        )
        .await?;
    }

    let backend_path = format!("{}/backend/{}/{}/{}", backend_dom_path, "pci", domid, id);

    let transaction = self.store.transaction().await?;

    transaction
        .write_string(
            format!("{}/key-{}", backend_path, index),
            &device.bdf.to_string(),
        )
        .await?;
    transaction
        .write_string(
            format!("{}/dev-{}", backend_path, index),
            &device.bdf.to_string(),
        )
        .await?;

    if let Some(vdefn) = device.bdf.vdefn {
        // NOTE(review): Xen tooling appears to name this key `vdevfn-N` - confirm
        // before relying on it being read by the backend.
        transaction
            .write_string(
                format!("{}/vdefn-{}", backend_path, index),
                &format!("{:#x}", vdefn),
            )
            .await?;
    }

    let mut options = HashMap::new();
    options.insert("permissive", if device.permissive { "1" } else { "0" });
    options.insert("rdm_policy", device.rdm_reserve_policy.to_option_str());
    options.insert("msitranslate", if device.msi_translate { "1" } else { "0" });
    options.insert(
        "power_mgmt",
        if device.power_management { "1" } else { "0" },
    );
    let options = options
        .into_iter()
        .map(|(key, value)| format!("{}={}", key, value))
        .collect::<Vec<_>>()
        .join(",");

    transaction
        .write_string(format!("{}/opts-{}", backend_path, index), &options)
        .await?;

    transaction.commit().await?;

    Ok(())
}
#[allow(clippy::too_many_arguments)]
async fn device_add(
&self,
@ -809,21 +983,4 @@ impl XenClient {
tx.commit().await?;
Ok(())
}
pub async fn get_console_path(&self, domid: u32) -> Result<String> {
let dom_path = self.store.get_domain_path(domid).await?;
let console_tty_path = format!("{}/console/tty", dom_path);
let mut tty: Option<String> = None;
for _ in 0..5 {
tty = self.store.read_string(&console_tty_path).await?;
if tty.is_some() {
break;
}
tokio::time::sleep(Duration::from_millis(200)).await;
}
let Some(tty) = tty else {
return Err(Error::TtyNotFound);
};
Ok(tty)
}
}

View File

@ -0,0 +1,305 @@
use regex::Regex;
use std::{fmt::Display, path::PathBuf, str::FromStr};
use tokio::fs;
use crate::error::{Error, Result};
/// sysfs directory of the PCI backend driver.
const PCIBACK_SYSFS_PATH: &str = "/sys/bus/pci/drivers/pciback";
// Accepted textual PCI address forms (lowercase hex). The `.` separating the
// device and function is escaped so it only matches a literal dot.
/// `dddd:bb:dd.f`
const PCI_BDF_REGEX: &str = r"^([0-9a-f]{4}):([0-9a-f]{2}):([0-9a-f]{2})\.([0-9a-f]{1})$";
/// `bb:dd.f` (no domain)
const PCI_BDF_SHORT_REGEX: &str = r"^([0-9a-f]{2}):([0-9a-f]{2})\.([0-9a-f]{1})$";
/// `dddd:bb:dd.f@vv` (with a trailing `@` value)
const PCI_BDF_VDEFN_REGEX: &str =
    r"^([0-9a-f]{4}):([0-9a-f]{2}):([0-9a-f]{2})\.([0-9a-f]{1})@([0-9a-f]{2})$";
/// Bit tested by `PciMemoryResource::is_bar_io` in a resource's flags.
const FLAG_PCI_BAR_IO: u64 = 0x1;
/// Accessor for the PCI backend driver's sysfs directory.
#[derive(Clone)]
pub struct XenPciBackend {
    path: PathBuf,
}

impl Default for XenPciBackend {
    fn default() -> Self {
        Self::new()
    }
}

impl XenPciBackend {
    /// Creates a backend rooted at `PCIBACK_SYSFS_PATH`.
    pub fn new() -> Self {
        Self {
            path: PathBuf::from(PCIBACK_SYSFS_PATH),
        }
    }

    /// Returns whether the driver directory exists.
    pub async fn is_loaded(&self) -> Result<bool> {
        Ok(fs::try_exists(&self.path).await?)
    }

    /// Lists every entry of the driver directory whose name parses as a PCI
    /// address; other entries are ignored.
    pub async fn list_devices(&self) -> Result<Vec<PciBdf>> {
        let mut devices = Vec::new();
        let mut dir = fs::read_dir(&self.path).await?;
        while let Some(entry) = dir.next_entry().await? {
            let file_name = entry.file_name();
            if let Ok(bdf) = PciBdf::from_str(&file_name.to_string_lossy()) {
                devices.push(bdf);
            }
        }
        Ok(devices)
    }

    /// Returns whether an entry for `bdf` exists in the driver directory.
    pub async fn is_assigned(&self, bdf: &PciBdf) -> Result<bool> {
        let path = self.path.join(bdf.to_string());
        Ok(fs::try_exists(path).await?)
    }

    /// Reads the device's `irq` file.
    ///
    /// Returns `Ok(None)` when the file does not exist or its content is not
    /// a decimal `u32`.
    pub async fn read_irq(&self, bdf: &PciBdf) -> Result<Option<u32>> {
        let path = self.path.join(bdf.to_string()).join("irq");
        // Async existence check, consistent with `is_loaded`/`is_assigned`;
        // a blocking `Path::exists` would stall the executor thread.
        if !fs::try_exists(&path).await? {
            return Ok(None);
        }
        let content = fs::read_to_string(&path).await?;
        Ok(u32::from_str(content.trim()).ok())
    }

    /// Parses the device's `resource` file.
    ///
    /// Each usable line has exactly three space-separated `0x`-prefixed hex
    /// fields (start, end, flags). Malformed lines and lines whose start is
    /// zero are skipped.
    pub async fn read_resources(&self, bdf: &PciBdf) -> Result<Vec<PciMemoryResource>> {
        /// Parses a `0x`-prefixed hexadecimal field.
        fn parse_hex(field: &str) -> Option<u64> {
            let digits = field.strip_prefix("0x")?;
            u64::from_str_radix(digits, 16).ok()
        }

        let path = self.path.join(bdf.to_string()).join("resource");
        let content = fs::read_to_string(&path).await?;

        let mut resources = Vec::new();
        for line in content.lines() {
            let mut fields = line.split(' ');
            let (Some(start), Some(end), Some(flags), None) =
                (fields.next(), fields.next(), fields.next(), fields.next())
            else {
                continue;
            };
            let (Some(start), Some(end), Some(flags)) =
                (parse_hex(start), parse_hex(end), parse_hex(flags))
            else {
                continue;
            };
            if start > 0 {
                resources.push(PciMemoryResource::new(start, end, flags));
            }
        }
        Ok(resources)
    }

    /// Returns whether any line of the driver's `slots` file parses to `bdf`.
    pub async fn has_slot(&self, bdf: &PciBdf) -> Result<bool> {
        let slots_path = self.path.join("slots");
        let content = fs::read_to_string(&slots_path).await?;
        let found = content
            .lines()
            .any(|line| matches!(PciBdf::from_str(line), Ok(slot) if slot == *bdf));
        Ok(found)
    }

    /// Writes the device address to the driver's `do_flr` file.
    pub async fn reset(&self, bdf: &PciBdf) -> Result<()> {
        let path = self.path.join("do_flr");
        fs::write(&path, bdf.to_string()).await?;
        Ok(())
    }
}
/// A PCI device address: optional domain, bus, device and function, plus an
/// optional `vdefn` value parsed from / printed as an `@vv` suffix.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct PciBdf {
    pub domain: Option<u32>,
    pub bus: u16,
    pub device: u16,
    pub function: u16,
    pub vdefn: Option<u16>,
}

impl PciBdf {
    /// Builds an address from its parts.
    pub fn new(
        domain: Option<u32>,
        bus: u16,
        device: u16,
        function: u16,
        vdefn: Option<u16>,
    ) -> Self {
        Self {
            domain,
            bus,
            device,
            function,
            vdefn,
        }
    }

    /// Returns a copy of this address with the domain set to `domain`.
    pub fn with_domain(&self, domain: u32) -> PciBdf {
        PciBdf {
            domain: Some(domain),
            ..*self
        }
    }

    /// Packs the address into a `u32`: domain (0 when absent) shifted left by
    /// 16, the low 8 bits of the bus at bits 8..16, the low 5 bits of the
    /// device at bits 3..8 and the low 3 bits of the function at bits 0..3.
    pub fn encode(&self) -> u32 {
        let domain_bits = self.domain.unwrap_or(0) << 16u32;
        let bus_bits = (u32::from(self.bus) & 0xff) << 8u32;
        let device_bits = (u32::from(self.device) & 0x1f) << 3u32;
        let function_bits = u32::from(self.function) & 0x7;
        domain_bits | bus_bits | device_bits | function_bits
    }
}
impl FromStr for PciBdf {
type Err = Error;
fn from_str(s: &str) -> Result<Self> {
let pci_bdf_regex = Regex::from_str(PCI_BDF_REGEX)?;
let pci_bdf_vdefn_regex = Regex::from_str(PCI_BDF_VDEFN_REGEX)?;
let pci_bdf_short_regex = Regex::from_str(PCI_BDF_SHORT_REGEX)?;
if let Some(pci_bdf_captures) = pci_bdf_regex.captures(s) {
let domain = pci_bdf_captures
.get(1)
.ok_or_else(|| Error::GenericError("capture group 1 did not exist".to_string()))?;
let bus = pci_bdf_captures
.get(2)
.ok_or_else(|| Error::GenericError("capture group 2 did not exist".to_string()))?;
let device = pci_bdf_captures
.get(3)
.ok_or_else(|| Error::GenericError("capture group 3 did not exist".to_string()))?;
let function = pci_bdf_captures
.get(4)
.ok_or_else(|| Error::GenericError("capture group 4 did not exist".to_string()))?;
let domain = u32::from_str_radix(domain.as_str(), 16)?;
let bus = u16::from_str_radix(bus.as_str(), 16)?;
let device = u16::from_str_radix(device.as_str(), 16)?;
let function = u16::from_str_radix(function.as_str(), 16)?;
Ok(PciBdf::new(Some(domain), bus, device, function, None))
} else if let Some(pci_bdf_vdefn_captures) = pci_bdf_vdefn_regex.captures(s) {
let domain = pci_bdf_vdefn_captures
.get(1)
.ok_or_else(|| Error::GenericError("capture group 1 did not exist".to_string()))?;
let bus = pci_bdf_vdefn_captures
.get(2)
.ok_or_else(|| Error::GenericError("capture group 2 did not exist".to_string()))?;
let device = pci_bdf_vdefn_captures
.get(3)
.ok_or_else(|| Error::GenericError("capture group 3 did not exist".to_string()))?;
let function = pci_bdf_vdefn_captures
.get(4)
.ok_or_else(|| Error::GenericError("capture group 4 did not exist".to_string()))?;
let vdefn = pci_bdf_vdefn_captures
.get(5)
.ok_or_else(|| Error::GenericError("capture group 5 did not exist".to_string()))?;
let domain = u32::from_str_radix(domain.as_str(), 16)?;
let bus = u16::from_str_radix(bus.as_str(), 16)?;
let device = u16::from_str_radix(device.as_str(), 16)?;
let function = u16::from_str_radix(function.as_str(), 16)?;
let vdefn = u16::from_str_radix(vdefn.as_str(), 16)?;
Ok(PciBdf::new(
Some(domain),
bus,
device,
function,
Some(vdefn),
))
} else if let Some(pci_bdf_short_captures) = pci_bdf_short_regex.captures(s) {
let bus = pci_bdf_short_captures
.get(1)
.ok_or_else(|| Error::GenericError("capture group 1 did not exist".to_string()))?;
let device = pci_bdf_short_captures
.get(2)
.ok_or_else(|| Error::GenericError("capture group 2 did not exist".to_string()))?;
let function = pci_bdf_short_captures
.get(3)
.ok_or_else(|| Error::GenericError("capture group 3 did not exist".to_string()))?;
let bus = u16::from_str_radix(bus.as_str(), 16)?;
let device = u16::from_str_radix(device.as_str(), 16)?;
let function = u16::from_str_radix(function.as_str(), 16)?;
Ok(PciBdf::new(None, bus, device, function, None))
} else {
Err(Error::InvalidPciBdfString)
}
}
}
/// Formats the address as `dddd:bb:dd.f`, with an `@vv` suffix when both a
/// domain and a `vdefn` are present, or as `bb:dd.f` when there is no domain
/// (in which case `vdefn` is not printed).
impl Display for PciBdf {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match (self.domain, self.vdefn) {
            (Some(domain), Some(vdefn)) => write!(
                f,
                "{:04x}:{:02x}:{:02x}.{:01x}@{:02x}",
                domain, self.bus, self.device, self.function, vdefn
            ),
            (Some(domain), None) => write!(
                f,
                "{:04x}:{:02x}:{:02x}.{:01x}",
                domain, self.bus, self.device, self.function
            ),
            (None, _) => write!(
                f,
                "{:02x}:{:02x}.{:01x}",
                self.bus, self.device, self.function
            ),
        }
    }
}
/// One line of a device's sysfs `resource` file: an inclusive address range
/// and its flags.
#[derive(Debug, Clone, Copy)]
pub struct PciMemoryResource {
    pub start: u64,
    pub end: u64,
    pub flags: u64,
}

impl PciMemoryResource {
    /// Builds a resource from its inclusive `start`/`end` and `flags`.
    pub fn new(start: u64, end: u64, flags: u64) -> PciMemoryResource {
        PciMemoryResource { start, end, flags }
    }

    /// Returns whether the `FLAG_PCI_BAR_IO` bit is set in `flags`.
    pub fn is_bar_io(&self) -> bool {
        (self.flags & FLAG_PCI_BAR_IO) != 0
    }

    /// Returns the number of addresses in the inclusive range `start..=end`.
    ///
    /// The values come from parsed text, so this never panics: an inverted
    /// range (`end < start`) yields 0 and a range covering the whole address
    /// space saturates at `u64::MAX`.
    pub fn size(&self) -> u64 {
        match self.end.checked_sub(self.start) {
            Some(span) => span.saturating_add(1),
            None => 0,
        }
    }
}

View File

@ -11,7 +11,9 @@ use slice_copy::copy;
use std::cmp::{max, min};
use std::mem::size_of;
use std::slice;
use xencall::sys::{VcpuGuestContext, MMUEXT_PIN_L4_TABLE};
use xencall::sys::{
E820Entry, VcpuGuestContext, E820_MAX, E820_RAM, E820_UNUSABLE, MMUEXT_PIN_L4_TABLE,
};
pub const X86_PAGE_SHIFT: u64 = 12;
pub const X86_PAGE_SIZE: u64 = 1 << X86_PAGE_SHIFT;
@ -273,6 +275,154 @@ impl X86BootSetup {
self.table.mappings[m] = map;
Ok(m)
}
fn e820_sanitize(
&self,
mut source: Vec<E820Entry>,
map_limit_kb: u64,
balloon_kb: u64,
) -> Result<Vec<E820Entry>> {
let mut e820 = vec![E820Entry::default(); E820_MAX as usize];
for entry in &mut source {
if entry.addr > 0x100000 {
continue;
}
// entries under 1MB should be removed.
entry.typ = 0;
entry.size = 0;
entry.addr = u64::MAX;
}
let mut lowest = u64::MAX;
let mut highest = 0;
for entry in &source {
if entry.typ == E820_RAM || entry.typ == E820_UNUSABLE || entry.typ == 0 {
continue;
}
lowest = if entry.addr < lowest {
entry.addr
} else {
lowest
};
highest = if entry.addr + entry.size > highest {
entry.addr + entry.size
} else {
highest
}
}
let start_kb = if lowest > 1024 { lowest >> 10 } else { 0 };
let mut idx: usize = 0;
e820[idx].addr = 0;
e820[idx].size = map_limit_kb << 10;
e820[idx].typ = E820_RAM;
let mut delta_kb = 0u64;
if start_kb > 0 && map_limit_kb > start_kb {
delta_kb = map_limit_kb - start_kb;
if delta_kb > 0 {
e820[idx].size -= delta_kb << 10;
}
}
let ram_end = source[0].addr + source[0].size;
idx += 1;
for src in &mut source {
let end = src.addr + src.size;
if src.typ == E820_UNUSABLE || end < ram_end {
src.typ = 0;
continue;
}
if src.typ != E820_RAM {
continue;
}
if src.addr >= (1 << 32) {
continue;
}
if src.addr < ram_end {
let delta = ram_end - src.addr;
src.typ = E820_UNUSABLE;
if src.size < delta {
src.typ = 0;
} else {
src.size -= delta;
src.addr = ram_end;
}
if src.addr + src.size != end {
src.typ = 0;
}
}
if end > ram_end {
src.typ = E820_UNUSABLE;
}
}
if lowest > ram_end {
let mut add_unusable = true;
for src in &mut source {
if !add_unusable {
break;
}
if src.typ != E820_UNUSABLE {
continue;
}
if ram_end != src.addr {
continue;
}
if lowest != src.addr + src.size {
src.size = lowest - src.addr;
}
add_unusable = false;
}
if add_unusable {
e820[1].typ = E820_UNUSABLE;
e820[1].addr = ram_end;
e820[1].size = lowest - ram_end;
}
}
for src in &source {
if src.typ == E820_RAM || src.typ == 0 {
continue;
}
e820[idx].typ = src.typ;
e820[idx].addr = src.addr;
e820[idx].size = src.size;
idx += 1;
}
if balloon_kb > 0 || delta_kb > 0 {
e820[idx].typ = E820_RAM;
e820[idx].addr = if (1u64 << 32u64) > highest {
1u64 << 32u64
} else {
highest
};
e820[idx].size = (delta_kb << 10) + (balloon_kb << 10);
}
Ok(e820)
}
}
#[async_trait::async_trait]
@ -615,6 +765,13 @@ impl ArchBootSetup for X86BootSetup {
let pg_mfn = setup.phys.p2m[pg_pfn as usize];
setup.phys.unmap(pg_pfn)?;
setup.phys.unmap(p2m_segment.pfn)?;
let map = setup.call.get_memory_map(E820_MAX).await?;
let mem_mb = setup.total_pages >> (20 - self.page_shift());
let mem_kb = mem_mb * 1024;
let e820 = self.e820_sanitize(map, mem_kb, 0)?;
setup.call.set_memory_map(setup.domid, e820).await?;
setup
.call
.mmuext(setup.domid, MMUEXT_PIN_L4_TABLE, pg_mfn, 0)