diff --git a/.gitignore b/.gitignore
index ea8c4bf7f3..45e9459eaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,4 @@
 /target
+
+# Our CI script wants this in our project directory, but this is a compiled binary file
+hermit-loader*
diff --git a/Cargo.lock b/Cargo.lock
index 4a76d5cd1a..ca9212aa43 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -68,9 +68,9 @@ checksum = "4ac6c08a67736554282858203cd9b7ff53cf55f54c34e85689962748a350cbf0"
 
 [[package]]
 name = "allocator-api2"
-version = "0.2.21"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+checksum = "c583acf993cf4245c4acb0a2cc2ab1f9cc097de73411bb6d3647ff6af2b1013d"
 
 [[package]]
 name = "android-tzdata"
@@ -197,9 +197,9 @@ checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
 
 [[package]]
 name = "bitfield-struct"
-version = "0.9.5"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2869c63ccf4f8bf0d485070b880e60e097fb7aeea80ee82a0a94a957e372a0b"
+checksum = "d3ca019570363e800b05ad4fd890734f28ac7b72f563ad8a35079efb793616f8"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1609,9 +1609,8 @@ dependencies = [
 
 [[package]]
 name = "talc"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fcad3be1cfe36eb7d716a04791eba36a197da9d9b6ea1e28e64ac569da3701d"
+version = "4.4.3"
+source = "git+https://github.com/hcsch/talc.git?branch=try-without-oom-handler#be81838265fb2319aaaba01f5c01dd46457255f4"
 dependencies = [
  "lock_api",
 ]
@@ -1778,9 +1777,9 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
 
 [[package]]
 name = "virtio-spec"
-version = "0.2.0"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49c4d9057347dd68d5c3f9ac5ec5d76adfe59d7378c4ae44f022795aa42953e6"
+checksum = "cd2f32d50a7e480738d6e43ea2048cd1c34cc33362e55d87d1eebd02bd2f563f"
 dependencies = [
  "allocator-api2",
  "bitfield-struct",
diff --git a/Cargo.toml b/Cargo.toml
index 31c424978b..eceeb075f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,6 +69,8 @@ trace = []
 udp = ["smoltcp", "smoltcp/socket-udp"]
 vga = []
 vsock = ["pci"]
+balloon = ["pci"]
+allocation-stats = ["talc/counters"]
 
 [lints.rust]
 rust_2018_idioms = "warn"
@@ -99,7 +101,7 @@ unreadable_literal = "warn"
 
 [dependencies]
 hermit-macro = { version = "=0.1.0", path = "hermit-macro" }
-virtio = { package = "virtio-spec", version = "0.2", features = ["alloc", "mmio", "nightly", "zerocopy"] }
+virtio = { package = "virtio-spec", version = "0.3.2", features = ["alloc", "mmio", "nightly", "zerocopy"] }
 ahash = { version = "0.8", default-features = false }
 align-address = "0.3"
 anstyle = { version = "1", default-features = false }
@@ -130,7 +132,7 @@ shell-words = { version = "1.1", default-features = false }
 simple-shell = { version = "0.0.1", optional = true }
 smallvec = { version = "1", features = ["const_new"] }
 take-static = "0.1"
-talc = { version = "4" }
+talc = { git = "https://github.com/hcsch/talc.git", version = "4.4.3", branch = "try-without-oom-handler" }
 time = { version = "0.3", default-features = false }
 volatile = "0.6"
 zerocopy = { version = "0.8", default-features = false }
@@ -204,3 +206,6 @@ members = [
 exclude = [
 	"hermit-builtins",
 ]
+
+[profile.release]
+debug = true
diff --git a/hermit-builtins/Cargo.toml b/hermit-builtins/Cargo.toml
index 5c664be635..015d0faf11 100644
--- a/hermit-builtins/Cargo.toml
+++ b/hermit-builtins/Cargo.toml
@@ -10,3 +10,6 @@ libm = "0.2"
 crate-type = ["staticlib"]
 
 [workspace]
+
+[profile.release]
+debug = true
diff --git a/src/arch/x86_64/kernel/processor.rs b/src/arch/x86_64/kernel/processor.rs
index 7d2b143c6a..cb1a0d1fed 100644
--- a/src/arch/x86_64/kernel/processor.rs
+++ b/src/arch/x86_64/kernel/processor.rs
@@ -1089,6 +1089,8 @@ fn triple_fault() -> ! {
 }
 
 fn qemu_exit(success: bool) {
+	// Write the exit code into the isa-debug-exit port.
+	// For a value `e` written into the port, QEMU will exit with `(e << 1) | 1`.
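+	// E.g. on success the driver writes 3 >> 1 = 1, so QEMU exits with status
+	// (1 << 1) | 1 = 3; on failure it writes 0, yielding exit status 1.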
 	let code = if success { 3 >> 1 } else { 0 };
 	unsafe {
 		Port::<u32>::new(0xf4).write(code);
diff --git a/src/config.rs b/src/config.rs
index 6f9581a068..cc07824449 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -7,7 +7,8 @@ pub(crate) const USER_STACK_SIZE: usize = 0x0010_0000;
 #[cfg(any(
 	all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 	feature = "fuse",
-	feature = "vsock"
+	feature = "vsock",
+	feature = "balloon"
 ))]
 pub(crate) const VIRTIO_MAX_QUEUE_SIZE: u16 = if cfg!(feature = "pci") { 2048 } else { 1024 };
diff --git a/src/drivers/balloon/mod.rs b/src/drivers/balloon/mod.rs
new file mode 100644
index 0000000000..0ea72d9f4d
--- /dev/null
+++ b/src/drivers/balloon/mod.rs
@@ -0,0 +1,1222 @@
+use alloc::vec::Vec;
+use core::alloc::Layout;
+use core::fmt::Debug;
+use core::num::{NonZeroU32, NonZeroUsize};
+use core::ptr::NonNull;
+use core::time::Duration;
+
+use memory_addresses::VirtAddr;
+use pci_types::InterruptLine;
+use smallvec::{SmallVec, smallvec};
+use talc::Talc;
+use virtio::FeatureBits;
+use virtio::balloon::{ConfigVolatileFieldAccess as _, F};
+use volatile::VolatileRef;
+
+use super::Driver;
+use super::virtio::virtqueue::error::VirtqError;
+use super::virtio::virtqueue::split::SplitVq;
+use super::virtio::virtqueue::{
+	AvailBufferToken, BufferElem, BufferType, VirtQueue, Virtq as _, VqIndex, VqSize,
+};
+use crate::VIRTIO_MAX_QUEUE_SIZE;
+#[cfg(not(feature = "pci"))]
+use crate::drivers::virtio::transport::mmio::{ComCfg, IsrStatus, NotifCfg};
+#[cfg(feature = "pci")]
+use crate::drivers::virtio::transport::pci::{ComCfg, IsrStatus, NotifCfg};
+use crate::mm::allocator::HermitOomHandler;
+use crate::mm::device_alloc::DeviceAlloc;
+use crate::mm::{ALLOCATOR, virtual_to_physical};
+
+#[cfg(feature = "pci")]
+pub mod oom;
+#[cfg(feature = "pci")]
+mod pci;
+
+const KIBI: u32 = 1024;
+const MEBI: u32 = 1024 * KIBI;
+const GIBI: u32 = 1024 * MEBI;
+
+/// Fixed size of pages as handled by the basic balloon device interface.
+/// The basic interface only deals with 4 KiB pages. Optional features can support
+/// larger page sizes, e.g. [`F::PAGE_REPORTING`].
+const BALLOON_PAGE_SIZE: usize = 4 * KIBI as usize;
+
+/// Minimum interval between voluntary inflation attempts in microseconds.
+/// The actual interval may be longer, as inflation is only attempted in
+/// [`VirtioBalloonDriver::poll_events`]. This is called by the balloon executor
+/// task, which is cooperatively scheduled, so it may miss the exact interval while
+/// other tasks are executing.
+const VOLUNTARY_INFLATE_INTERVAL_MICROS: u64 = 1_000_000;
+
+/// Maximum number of 4 KiB pages voluntarily inflated per voluntary inflation
+/// attempt, i.e. per call of [`VirtioBalloonDriver::poll_events`].
+const VOLUNTARY_INFLATE_MAX_NUM_PAGES: u32 = 2 * GIBI / BALLOON_PAGE_SIZE as u32;
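+// With 4 KiB balloon pages this evaluates to 2 GiB / 4 KiB = 524288 pages,
+// i.e. at most 2 GiB are voluntarily inflated per attempt.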
+
+// TODO: prevent possible deflate of not yet acknowledged inflated pages. See VIRTIO v1.2 5.5.6.1
+
+/// A wrapper struct for the raw configuration structure.
+/// It handles the correct access to the fields, as some are read-only
+/// for the driver.
+#[derive(Debug)]
+struct BalloonDevCfg {
+	pub raw: VolatileRef<'static, virtio::balloon::Config>,
+	pub dev_id: u16,
+	pub features: virtio::balloon::F,
+}
+
+impl BalloonDevCfg {
+	fn num_pages(&self) -> u32 {
+		self.raw.as_ptr().num_pages().read().into()
+	}
+
+	fn actual(&mut self) -> u32 {
+		self.raw.as_ptr().actual().read().into()
+	}
+
+	fn set_actual(&mut self, num_pages: u32) {
+		self.raw.as_mut_ptr().actual().write(num_pages.into());
+	}
+}
+
+/// Virtio traditional memory balloon driver.
+///
+/// Supports host-requested inflation and voluntary inflation (beyond what the
+/// host has requested). When the host decreases the requested balloon size again
+/// (i.e. permits the guest to use more memory again), the driver does not
+/// proactively deflate the balloon.
+///
+/// Voluntary inflation occurs when [`VirtioBalloonDriver::poll_events`] is called,
+/// but at most every [`VOLUNTARY_INFLATE_INTERVAL_MICROS`] microseconds.
+///
+/// The balloon is deflated again (making memory available to other Hermit tasks)
+/// when an out-of-memory event occurs and the allocator's out-of-memory handler
+/// calls [`VirtioBalloonDriver::deflate_for_oom`]. This way, memory previously
+/// returned to the host can be reused to ensure system stability. See also
+/// [`oom::DeflateBalloonOnOom`].
+pub(crate) struct VirtioBalloonDriver {
+	dev_cfg: BalloonDevCfg,
+	com_cfg: ComCfg,
+	isr_stat: IsrStatus,
+	notif_cfg: NotifCfg,
+	irq: InterruptLine,
+
+	inflateq: BalloonVq,
+	deflateq: BalloonVq,
+
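+	// Counter semantics: `num_in_balloon` counts pages whose inflation the host
+	// has already acknowledged; `num_pending_inflation`/`num_pending_deflation`
+	// count pages sent into the inflateq/deflateq but not yet acknowledged;
+	// `num_targeted` mirrors the last `num_pages` target read from the device
+	// config (see `num_pages_changed`).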
+	num_in_balloon: u32,
+	num_pending_inflation: u32,
+	num_pending_deflation: u32,
+	num_targeted: u32,
+
+	balloon_storage: BalloonStorage,
+	last_voluntary_inflate: u64,
+}
+
+impl VirtioBalloonDriver {
+	/// Negotiates a subset of features, understood and wanted by both the OS
+	/// and the device.
+	fn negotiate_features(
+		&mut self,
+		driver_features: virtio::balloon::F,
+	) -> Result<(), VirtioBalloonError> {
+		let device_features = virtio::balloon::F::from(self.com_cfg.dev_features());
+
+		if driver_features.requirements_satisfied() {
+			debug!(
+				" Feature set requested by the device driver is in conformance with the specification."
+			);
+		} else {
+			return Err(VirtioBalloonError::FeatureRequirementsNotMet { driver_features });
+		}
+
+		if device_features.contains(driver_features) {
+			// If the device supports a superset of our driver's current target feature set,
+			// write this feature set to the common config.
+			self.com_cfg.set_drv_features(driver_features.into());
+			Ok(())
+		} else {
+			Err(VirtioBalloonError::IncompatibleFeatureSets {
+				driver_features,
+				device_features,
+			})
+		}
+	}
+
+	/// Initializes the device in adherence to the specification.
+	///
+	/// See Virtio specification v1.2. - 3.1.1
+	/// and v1.2. - 5.5.5
+	pub fn init_dev(&mut self) -> Result<(), VirtioBalloonError> {
+		// Reset
+		self.com_cfg.reset_dev();
+
+		// Indicate to the device that the OS has noticed it.
+		self.com_cfg.ack_dev();
+
+		// Indicate to the device that the driver is able to handle it.
+		self.com_cfg.set_drv();
+
+		// TODO: add support for free page hinting and reporting
+
+		let features = F::VERSION_1;
+		self.negotiate_features(features)?;
+
+		// Indicate to the device that the current feature set is final for the driver
+		// and will not be changed.
+		self.com_cfg.features_ok();
+
+		// Check if the device has accepted the final set. This finishes feature negotiation.
+		if self.com_cfg.check_features() {
+			info!(
+				" Features have been negotiated between device {:x} and driver: {features:?}",
+				self.dev_cfg.dev_id
+			);
+			// Set the feature set in the device config for future use.
+			self.dev_cfg.features = features;
+		} else {
+			return Err(VirtioBalloonError::FeatureNegotiationFailed {
+				device_id: self.dev_cfg.dev_id,
+			});
+		}
+
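+		// Queue indices follow VIRTIO v1.2 5.5.2: virtqueue 0 is the inflateq,
+		// virtqueue 1 is the deflateq.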
+		self.inflateq.init(VirtQueue::Split(
+			SplitVq::new(
+				&mut self.com_cfg,
+				&self.notif_cfg,
+				VqSize::from(VIRTIO_MAX_QUEUE_SIZE),
+				VqIndex::from(0u16),
+				self.dev_cfg.features.into(),
+			)
+			.expect("Failed to create SplitVq for inflateq due to invalid parameters (bug)"),
+		));
+
+		self.deflateq.init(VirtQueue::Split(
+			SplitVq::new(
+				&mut self.com_cfg,
+				&self.notif_cfg,
+				VqSize::from(VIRTIO_MAX_QUEUE_SIZE),
+				VqIndex::from(1u16),
+				self.dev_cfg.features.into(),
+			)
+			.expect("Failed to create SplitVq for deflateq due to invalid parameters (bug)"),
+		));
+
+		// At this point the device is "live".
+		self.com_cfg.drv_ok();
+
+		info!(" Finished initialization");
+
+		self.adjust_balloon_size();
+
+		Ok(())
+	}
+
+	fn num_pages_changed(&mut self) -> Option<u32> {
+		let new_num_pages = self.dev_cfg.num_pages();
+
+		if new_num_pages == self.num_targeted {
+			None
+		} else {
+			self.num_targeted = new_num_pages;
+			Some(new_num_pages)
+		}
+	}
+
+	pub(crate) fn poll_events(&mut self) {
+		trace!(" Driver is being polled...");
+
+		trace!(" Processing acknowledgements for inflation/deflation");
+
+		let mut changed = false;
+
+		{
+			let num_new_acknowledged_deflated = self.deflateq.discard_new_used();
+
+			if num_new_acknowledged_deflated > 0 {
+				debug!(" Deflation acknowledged for {num_new_acknowledged_deflated} pages");
+
+				self.num_pending_deflation -= num_new_acknowledged_deflated as u32;
+				self.num_in_balloon -= num_new_acknowledged_deflated as u32;
+				changed = true;
+			}
+		}
+
+		{
+			let num_new_acknowledged_inflated = self.inflateq.discard_new_used();
+
+			if num_new_acknowledged_inflated > 0 {
+				debug!(" Inflation acknowledged for {num_new_acknowledged_inflated} pages");
+
+				self.num_pending_inflation -= num_new_acknowledged_inflated as u32;
+				self.num_in_balloon += num_new_acknowledged_inflated as u32;
+				changed = true;
+			}
+		}
+
+		if changed {
+			debug!(
+				" Setting new actual balloon size of {} pages",
+				self.num_in_balloon
+			);
+			self.dev_cfg.set_actual(self.num_in_balloon);
+		}
+
+		self.adjust_balloon_size();
+	}
+
+	/// Deflate the balloon by the given number of pages.
+	///
+	/// # Panics
+	/// When `num_pages_to_deflate` is larger than the number of pages currently
+	/// deflatable in the balloon, that is, all pages currently in the balloon
+	/// minus the number of pages already queued for deflation.
+	///
+	/// # Safety
+	/// Must be called with the same instance of [`Talc`] that was provided to
+	/// [`Self::inflate`] to inflate the balloon.
+	unsafe fn deflate(&mut self, talc: &mut Talc<HermitOomHandler>, num_pages_to_deflate: u32) {
+		assert!(
+			num_pages_to_deflate <= self.num_in_balloon - self.num_pending_deflation,
+			"Can't deflate more pages than there are in the balloon"
+		);
+
+		trace!(" Attempting to deflate by {num_pages_to_deflate} pages");
+
+		let page_indices = self
+			.balloon_storage
+			.mark_pages_for_deflation(num_pages_to_deflate);
+
+		trace!(
+			" Marked {} pages for deflation, sending them into the deflateq: {page_indices:?}",
+			page_indices.len()
+		);
+
+		for chunk_page_indices in &page_indices {
+			// SAFETY: We ensure with our balloon storage that we only deflate pages
+			// that we have previously inflated into the balloon.
+			// Deflating also does not give the host ownership over
+			// additional memory of ours. Merely sending the indices into
+			// the queue does not yet deallocate the pages on our side.
+			unsafe {
+				self.deflateq
+					.send_pages(chunk_page_indices.iter().copied(), false)
+					.expect("Failed to send pages into the deflateq");
+			}
+		}
+
+		// SAFETY: For now we don't have [`F::MUST_TELL_HOST`] support, so
+		// we can deallocate all pages immediately once we have sent
+		// them into the deflateq. See VIRTIO v1.2 5.5.6 3.
+		// We pass on the upholding of the requirements on the `Talc`
+		// instance used to our caller.
+		unsafe {
+			self.balloon_storage.shrink_chunks(talc, page_indices);
+		}
+
+		self.num_pending_deflation += num_pages_to_deflate;
+	}
+
+	fn inflate(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		num_pages_to_inflate: u32,
+		voluntary: bool,
+	) -> usize {
+		trace!(" Attempting to inflate by up to {num_pages_to_inflate} pages");
+
+		let page_indices =
+			self.balloon_storage
+				.allocate_chunks(talc, num_pages_to_inflate, voluntary);
+		let num_pages_inflated = page_indices.len();
+
+		trace!(" Sending page indices into inflateq: {page_indices:?}");
+
+		// SAFETY: We ensure with our balloon storage that we only inflate pages
+		// that we have allocated via the global allocator. Inflating
+		// a page hands ownership over to the host, but we ensure that
+		// the contents of the page are not used until the page has
+		// been deflated again by keeping our allocation in the balloon storage.
+		unsafe {
+			self.inflateq
+				.send_pages(page_indices, false)
+				.expect("Failed to send pages into the inflateq");
+		}
+
+		self.num_pending_inflation += num_pages_inflated as u32;
+
+		num_pages_inflated
+	}
+
+	fn adjust_balloon_size(&mut self) {
+		trace!(" Adjusting balloon size");
+
+		if let Some(new_target_num_pages) = self.num_pages_changed() {
+			if new_target_num_pages < self.num_in_balloon - self.num_pending_deflation {
+				let num_to_deflate =
+					(self.num_in_balloon - self.num_pending_deflation) - new_target_num_pages;
+
+				debug!(
+					" Size change requested: deflate of {num_to_deflate}, from {} (with pending: inflation={} deflation={}) to {new_target_num_pages}",
+					self.num_in_balloon, self.num_pending_inflation, self.num_pending_deflation
+				);
+
+				trace!(" Ignoring, we only deflate on OOM");
+			} else if new_target_num_pages > self.num_in_balloon + self.num_pending_inflation {
+				let num_to_inflate =
+					new_target_num_pages - (self.num_in_balloon + self.num_pending_inflation);
+
+				debug!(
+					" Size change requested: inflate of {num_to_inflate}, from {} (with pending: inflation={} deflation={}) to {new_target_num_pages}",
+					self.num_in_balloon, self.num_pending_inflation, self.num_pending_deflation
+				);
+
+				self.inflate(&mut ALLOCATOR.inner().lock(), num_to_inflate, false);
+				trace!(" Done inflating");
+			}
+		};
+
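+		// Convert the interval to timer ticks: `get_timestamp()` counts CPU timer
+		// ticks and `get_frequency()` is assumed to return the CPU frequency in
+		// MHz, i.e. ticks per microsecond.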
+		let now = crate::arch::processor::get_timestamp();
+
+		if now
+			>= self.last_voluntary_inflate
+				+ u64::from(crate::arch::processor::get_frequency())
+					* VOLUNTARY_INFLATE_INTERVAL_MICROS
+		{
+			debug!(" Voluntarily inflating balloon as much as we can");
+			let num_inflated = self.inflate(
+				&mut ALLOCATOR.inner().lock(),
+				VOLUNTARY_INFLATE_MAX_NUM_PAGES,
+				true,
+			);
+			debug!(
+				" Voluntarily inflated {num_inflated} pages. Next voluntary inflate in {:?}",
+				Duration::from_micros(VOLUNTARY_INFLATE_INTERVAL_MICROS)
+			);
+			self.last_voluntary_inflate = now;
+		}
+	}
+
+	pub fn disable_interrupts(&mut self) {
+		self.inflateq.disable_notifs();
+		self.deflateq.disable_notifs();
+	}
+
+	pub fn enable_interrupts(&mut self) {
+		self.inflateq.enable_notifs();
+		self.deflateq.enable_notifs();
+	}
+
+	pub fn num_deflatable_for_oom(&self) -> u32 {
+		self.num_in_balloon
+			.saturating_sub(self.dev_cfg.num_pages())
+			.saturating_sub(self.num_pending_deflation)
+	}
+
+	/// Deflate the balloon in case of an out-of-memory (OOM) event.
+	/// This is meant to be called from a [`talc::OomHandler`] registered to Hermit's
+	/// global instance of [`Talc`].
+	///
+	/// # Safety
+	/// May only be called with the one [`Talc`] instance registered as the global
+	/// allocator for Hermit.
+	pub unsafe fn deflate_for_oom(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		failed_alloc_num_pages: u32,
+	) -> Result<(), ()> {
+		// We don't really know how much space Talc has left.
+		// The allocation might have failed only by a short margin, or by a lot.
+
+		let num_deflatable = self.num_deflatable_for_oom();
+
+		if num_deflatable > 0 {
+			// Deflate as many pages as we can, up to the amount needed for the allocation.
+			// We don't have to wait for host acknowledgement, because for now
+			// we don't support [`F::MUST_TELL_HOST`].
+
+			let num_to_deflate = num_deflatable.min(failed_alloc_num_pages);
+
+			info!(
+				" Deflating {num_to_deflate} pages in an attempt to recover from an OOM condition"
+			);
+
+			// SAFETY: We pass on the requirement of using the correct `Talc`
+			// instance to our caller.
+			unsafe {
+				self.deflate(talc, num_to_deflate);
+			}
+			Ok(())
+		} else {
+			error!(" Unable to deflate balloon further");
+			// Nothing more we can do
+			Err(())
+		}
+	}
+}
+
+impl Driver for VirtioBalloonDriver {
+	fn get_interrupt_number(&self) -> InterruptLine {
+		self.irq
+	}
+
+	fn get_name(&self) -> &'static str {
+		"virtio-balloon"
+	}
+}
+
+struct BalloonVq {
+	vq: Option<VirtQueue>,
+}
+
+impl BalloonVq {
+	pub fn new() -> Self {
+		Self { vq: None }
+	}
+
+	fn init(&mut self, vq: VirtQueue) {
+		self.vq = Some(vq);
+	}
+
+	pub fn enable_notifs(&mut self) {
+		let Some(vq) = &mut self.vq else {
+			debug!(" BalloonVq::enable_notifs called on uninitialized vq");
+			return;
+		};
+
+		vq.enable_notifs();
+	}
+
+	pub fn disable_notifs(&mut self) {
+		let Some(vq) = &mut self.vq else {
+			debug!(" BalloonVq::disable_notifs called on uninitialized vq");
+			return;
+		};
+
+		vq.disable_notifs();
+	}
+
+	fn is_empty(&self) -> bool {
+		let Some(vq) = &self.vq else {
+			debug!(" BalloonVq::is_empty called on uninitialized vq");
+			return true;
+		};
+
+		vq.is_empty()
+	}
+
+	fn used_send_buff_to_page_indices(
+		used_send_buff: SmallVec<[BufferElem; 2]>,
+	) -> impl Iterator<Item = u32> {
+		used_send_buff.into_iter().flat_map(|buffer_elem| {
+			match buffer_elem {
+				BufferElem::Sized(_any) =>
+					panic!("Unexpected used `BufferElem::Sized` encountered, BalloonVq should only have sent `BufferElem::Vector`s"),
+				BufferElem::Vector(items) => {
+					assert!(items.len() % 4 == 0, "Unexpected size of used `BufferElem::Vector`, BalloonVq should only have sent lengths that are multiples of 4");
+
+					items
+						.into_iter()
+						.array_chunks()
+						.map(|bytes: [u8; 4]| u32::from_le_bytes(bytes))
+				},
+			}
+		})
+	}
+
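+	// `used_send_buff_to_page_indices` example: an 8-byte used buffer
+	// [0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00] decodes to the
+	// little-endian page indices [1, 2].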
+	/// Receive all new page indices marked used by the host.
+	/// These are the page indices we have previously sent into the queue in available buffers.
+	pub fn recv_new_used(&mut self) -> impl Iterator<Item = u32> + '_ {
+		let Some(vq) = &mut self.vq else {
+			debug!(" BalloonVq::recv_new_used called on uninitialized vq");
+			panic!("BalloonVq must be initialized before calling recv_new_used");
+		};
+
+		let mut current_used_page_indices_iter = None;
+
+		core::iter::from_fn(move || {
+			match current_used_page_indices_iter.as_mut() {
+				// Must appear in the code before `current_used_page_indices_iter.next()` for an existing iterator (see below).
+				// Otherwise Rust is unable to infer the contents of the `Option` (and the type can't be named explicitly).
+				// If this inference failure gets fixed, this match can be converted to an `if let Some(iter) = ...`.
+				None => match vq.try_recv() {
+					Ok(new_used) => {
+						let mut new_used_page_indices_iter =
+							Self::used_send_buff_to_page_indices(new_used.send_buff);
+
+						let used = new_used_page_indices_iter.next()?;
+
+						current_used_page_indices_iter = Some(new_used_page_indices_iter);
+
+						Some(used)
+					}
+
+					Err(VirtqError::NoNewUsed) => None,
+
+					Err(error) => {
+						panic!(
+							"Failed to receive new used virtqueue descriptors with unexpected error: {error:?}"
+						)
+					}
+				},
+
+				Some(current_used_page_indices_iter) => current_used_page_indices_iter.next(),
+			}
+		})
+	}
+
+	/// Discard all new page indices marked used by the host.
+	/// These are the page indices we have previously sent into the queue in available buffers.
+	pub fn discard_new_used(&mut self) -> usize {
+		let Some(vq) = &mut self.vq else {
+			debug!(" BalloonVq::discard_new_used called on uninitialized vq");
+			panic!("BalloonVq must be initialized before calling discard_new_used");
+		};
+
+		let mut num_discarded = 0;
+
+		loop {
+			match vq.try_recv() {
+				Ok(new_used) => {
+					let num_page_indices =
+						Self::used_send_buff_to_page_indices(new_used.send_buff).count();
+					trace!(
+						" Discarded used buffer received from host with {num_page_indices} page indices"
+					);
+					num_discarded += num_page_indices;
+				}
+
+				Err(VirtqError::NoNewUsed) => break,
+
+				Err(error) => {
+					panic!(
+						"Failed to receive new used virtqueue descriptors with unexpected error: {error:?}"
+					)
+				}
+			}
+		}
+
+		num_discarded
+	}
+
+	pub fn discard_blocking_until_empty(&mut self) -> usize {
+		self.disable_notifs();
+
+		trace!(" Trying to empty the virtqueue, blocking until all elements have been discarded");
+
+		let mut num_discarded = 0;
+		while !self.is_empty() {
+			num_discarded += self.discard_new_used();
+		}
+
+		trace!(" Done emptying the virtqueue");
+
+		self.enable_notifs();
+
+		num_discarded
+	}
+
+	/// Send the specified pages into the balloon virtqueue.
+	///
+	/// To ensure that there is enough space in the queue, call [`Self::recv_new_used`]
+	/// or [`Self::discard_new_used`] before sending.
+	///
+	/// The page indices are of 4096 B (4K) pages and are submitted as `u32`s,
+	/// i.e. only pages up to (2³² - 1) * 4096 B = 16 TiB into our physical memory
+	/// can be submitted here.
+	///
+	/// # Safety
+	/// The caller must ensure that the pages of which the indices are sent into
+	/// the inflate queue are not used by the kernel or the application until they
+	/// have been deflated again via the deflate queue
+	/// (with or without acknowledgement by the host depending on [`F::MUST_TELL_HOST`]).
+	pub unsafe fn send_pages<I: IntoIterator<Item = u32>>(
+		&mut self,
+		page_indices: I,
+		notif: bool,
+	) -> Result<(), VirtqError> {
+		trace!(" Sending page indices into queue");
+
+		let Some(vq) = &mut self.vq else {
+			error!(" BalloonVq::send_pages called on uninitialized vq");
+			panic!("BalloonVq must be initialized before calling send_pages");
+		};
+
+		trace!(" Allocating new Vec (DeviceAlloc) for page indices");
+
+		let mut page_indices_bytes = Vec::new_in(DeviceAlloc);
+		page_indices
+			.into_iter()
+			// Not specified as little-endian by the spec? Linux does it little-endian for VIRTIO 1.0.
+			.flat_map(|index| index.to_le_bytes())
+			.collect_into(&mut page_indices_bytes);
+
+		if page_indices_bytes.is_empty() {
+			debug!(" Vec of page indices is empty, doing nothing");
+			return Ok(());
+		}
+
+		let buff_tkn = AvailBufferToken::new(
+			smallvec![BufferElem::Vector(page_indices_bytes)],
+			smallvec![],
+		)
+		.expect("We have specified a send_buff so AvailBufferToken::new should succeed");
+
+		trace!(" Dispatching buffer to the queue");
+
+		vq.dispatch(buff_tkn, notif, BufferType::Direct)?;
+
+		Ok(())
+	}
+}
+
+/// Errors that can occur during the lifetime and initialization of the [`VirtioBalloonDriver`].
+#[derive(Debug, Copy, Clone)]
+pub enum VirtioBalloonError {
+	#[cfg(feature = "pci")]
+	NoDevCfg { device_id: u16 },
+	/// The device did not accept the negotiated features at the last step of negotiation.
+	FeatureNegotiationFailed { device_id: u16 },
+	/// The set of features requested by the driver does not adhere to the requirements
+	/// that the specification places on the indicated features.
+	FeatureRequirementsNotMet { driver_features: virtio::balloon::F },
+	/// The feature set wanted by the driver is incompatible with the feature set
+	/// offered by the device.
+	IncompatibleFeatureSets {
+		driver_features: virtio::balloon::F,
+		device_features: virtio::balloon::F,
+	},
+}
+
+#[derive(Debug)]
+struct BalloonStorage {
+	/// A stack of chunks of pages allocated for the balloon.
+	chunks: Vec<BalloonAllocation, DeviceAlloc>,
+}
+
+impl BalloonStorage {
+	pub fn new() -> Self {
+		Self {
+			chunks: Vec::new_in(DeviceAlloc),
+		}
+	}
+
+	fn allocate_chunk(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		num_pages: NonZeroU32,
+	) -> Result<impl Iterator<Item = u32> + '_, ()> {
+		let page = BalloonAllocation::try_allocate(talc, num_pages)?;
+
+		self.chunks.push(page);
+
+		// Only now get the iterator over physical indices, so that it lives as long
+		// as `self.chunks`, instead of referencing the now-moved `page` variable.
+		let mut page_indices = self
+			.chunks
+			.last()
+			.expect("We just pushed one chunk")
+			.phys_page_indices()
+			.peekable();
+		let first_page_index = *page_indices
+			.peek()
+			.expect("If the allocation didn't fail, we should have at least one page index");
+
+		trace!(
+			" Allocated balloon page chunk starting at page index {first_page_index} with {num_pages} pages"
+		);
+
+		Ok(page_indices)
+	}
+
+	pub fn allocate_chunks(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		target_num_pages: u32,
+		voluntary: bool,
+	) -> Vec<u32, DeviceAlloc> {
+		let mut page_indices = Vec::new_in(DeviceAlloc);
+		let mut current_exponent = target_num_pages.ilog2();
+		let mut num_remaining = target_num_pages;
+
+		trace!(" Attempting to allocate {target_num_pages} pages");
+
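+		// Power-of-two chunking: e.g. a request for 13 pages is served by chunks
+		// of at most 8, then 4, then 1 pages (assuming the allocations succeed),
+		// halving the attempted chunk size whenever an allocation fails.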
+		while num_remaining > 0 {
+			// Clamp the chunk size to what actually remains to be allocated, so that
+			// `num_remaining` cannot underflow for targets that are not powers of two.
+			current_exponent = current_exponent.min(num_remaining.ilog2());
+
+			trace!(
+				" Attempting to allocate chunk of {} pages (pages remaining: {num_remaining})",
+				1u32 << current_exponent
+			);
+			match self.allocate_chunk(
+				talc,
+				NonZeroU32::new(1 << current_exponent)
+					.expect("One shifted left by any number is always at least one"),
+			) {
+				Ok(chunk_page_indices) => {
+					num_remaining -= 1 << current_exponent;
+					page_indices.extend(chunk_page_indices);
+				}
+				Err(()) => {
+					if current_exponent == 0 {
+						log!(
+							if voluntary {
+								log::Level::Debug
+							} else {
+								log::Level::Warn
+							},
+							" Failed to allocate as many pages as requested to fill the balloon with, continuing with as many as possible ({})",
+							target_num_pages - num_remaining
+						);
+						break;
+					}
+
+					let old_exponent = current_exponent;
+					current_exponent -= 1;
+					trace!(
+						" Failed to allocate new chunk of 2^{old_exponent} ({}) pages to fill the balloon with, reducing chunk size to 2^{current_exponent} ({})",
+						1u32 << old_exponent,
+						1u32 << current_exponent,
+					);
+
+					continue;
+				}
+			}
+		}
+
+		trace!(" Done allocating, got {} pages", page_indices.len());
+
+		page_indices
+	}
+
+	pub fn mark_pages_for_deflation(
+		&mut self,
+		target_num_pages: u32,
+	) -> Vec<Vec<u32, DeviceAlloc>, DeviceAlloc> {
+		trace!(" Attempting to mark {target_num_pages} pages as queued for deflation");
+
+		let mut num_remaining = target_num_pages;
+		let mut per_chunk_page_indices = Vec::new_in(DeviceAlloc);
+
+		// Go through chunks from small/recent to large/old, marking as many pages
+		// as requested where possible. Collect the page indices of marked pages
+		// for submission to the deflate queue.
+		for chunk in self.chunks.iter_mut().rev() {
+			let num_to_mark = chunk.num_available_for_deflation().min(num_remaining);
+
+			let mut page_indices = Vec::new_in(DeviceAlloc);
+			chunk
+				.mark_queued_for_deflation(num_to_mark)
+				.collect_into(&mut page_indices);
+
+			per_chunk_page_indices.push(page_indices);
+
+			num_remaining -= num_to_mark;
+
+			if num_remaining == 0 {
+				break;
+			}
+		}
+
+		if num_remaining > 0 {
+			warn!(
+				" Attempted to deflate more pages than were in the balloon: no more allocation chunks left to deflate"
+			);
+		}
+
+		per_chunk_page_indices
+	}
+
+	/// Shrink chunks previously marked, partially or fully, as queued for deflation.
+	/// The chunks will be shrunk only by the pages whose indices are provided
+	/// in `acknowledged_deflated_pages`. The indices should be provided in the
+	/// groups and order in which they were returned by [`Self::mark_pages_for_deflation`].
+	///
+	/// # Safety
+	/// Must be called with the same instance of [`Talc`] that was provided to
+	/// [`Self::allocate_chunks`] to allocate the chunks. This should be the same
+	/// [`Talc`] instance for all chunks.
+	///
+	/// Must not be called with page indices that the host still has ownership of.
+	/// That is, only page indices of pages that are already deflated may be passed
+	/// to this function. Otherwise pages still owned by the host may be freed,
+	/// leading to unsound future allocations.
+	pub unsafe fn shrink_chunks(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		acknowledged_deflated_pages: Vec<Vec<u32, DeviceAlloc>, DeviceAlloc>,
+	) {
+		let mut next_chunk_index = self.chunks.len().checked_sub(1);
+
+		for chunk_deflated_pages in acknowledged_deflated_pages.into_iter() {
+			let Some(mut current_chunk_index) = next_chunk_index else {
+				error!(
+					" Was unable to use all page indices acknowledged for deflation to shrink allocation chunks"
+				);
+				return;
+			};
+
+			loop {
+				if self.chunks[current_chunk_index].can_shrink_by_pages(&chunk_deflated_pages) {
+					break;
+				}
+
+				trace!(
+					" Skipped one chunk, because it cannot be shrunk by the current block of deflated pages"
+				);
+
+				let Some(new_chunk_index) = current_chunk_index.checked_sub(1) else {
+					error!(
+						" Was unable to use all page indices acknowledged for deflation to shrink allocation chunks"
+					);
+					return;
+				};
+
+				current_chunk_index = new_chunk_index;
+			}
+
+			// SAFETY: We pass on the upholding of the requirements on the `Talc`
+			// instance passed and the page indices provided to our caller.
+			let shrink_res =
+				unsafe { self.chunks[current_chunk_index].shrink(talc, chunk_deflated_pages) };
+
+			match shrink_res {
+				ShrinkResult::PagesRemain => (),
+				ShrinkResult::Deallocated => {
+					self.chunks.remove(current_chunk_index);
+				}
+			}
+
+			next_chunk_index = current_chunk_index.checked_sub(1);
+		}
+	}
+}
+
+/// Represents a chunk of consecutive 4K pages allocated for the balloon.
+///
+/// This ensures via encapsulation that inflated pages, i.e. pages released to the host,
+/// are not read from or written to while they are in the balloon.
+///
+/// The allocation represented by this type must be manually deallocated via [`Self::shrink`].
+/// If the type is dropped, the allocation is leaked.
+/// This is not unsafe, but undesirable.
+#[derive(Debug)]
+struct BalloonAllocation {
+	/// Pointer to the allocation or `None` if fully deallocated.
+	allocation_ptr: Option<NonNull<u8>>,
+	/// Indices of the pages currently allocated and owned by this struct.
+	page_indices: Vec<u32, DeviceAlloc>,
+	/// Index of the first page index that is queued for deflation, with all following
+	/// ones also being queued for deflation.
+	/// This is an index into [`Self::page_indices`].
+	/// When there are no pages queued for deflation, this index is the one after
+	/// the last element of [`Self::page_indices`], i.e. the length of [`Self::page_indices`].
+	queued_for_deflation_start: usize,
+}
+
+// SAFETY: `BalloonAllocation` does not implement `Clone` (or any other cloning mechanism)
+// and implies exclusive ownership of an allocation, with the exception of host interactions.
+// Sending it across threads cannot create a situation where we can access
+// mutable state across two threads. The host interactions are guarded by
+// unsafe functions and in general we don't dereference pointers into our allocation.
+unsafe impl Send for BalloonAllocation {}
+
+// SAFETY: We don't allow for any interior mutability as `allocation_ptr` is never
+// dereferenced by us and is not exposed outside of our type. Other than that we
+// only have plain integer types that are `Sync` themselves.
+unsafe impl Sync for BalloonAllocation {}
+
+impl BalloonAllocation {
+	/// Get the memory layout for an allocation of `num_pages` 4K pages.
+	fn layout(num_pages: NonZeroUsize) -> Layout {
+		Layout::from_size_align(num_pages.get() * BALLOON_PAGE_SIZE, BALLOON_PAGE_SIZE).expect(
+			"Layout of a non-zero amount of 4K pages aligned to 4K page boundaries should be valid",
+		)
+	}
+
+	/// The current layout of our allocation if we have any pages allocated,
+	/// `None` otherwise.
+	fn current_layout(&self) -> Option<Layout> {
+		self.num_pages_allocated().map(Self::layout)
+	}
+
+	/// The total number of pages allocated for this chunk.
+	/// This also includes pages marked for deflation that haven't been shrunk away yet.
+	fn num_pages_allocated(&self) -> Option<NonZeroUsize> {
+		NonZeroUsize::new(self.page_indices.len())
+	}
+
+	/// The number of pages of this chunk that can still be queued for deflation.
+	fn num_available_for_deflation(&self) -> u32 {
+		(0..self.queued_for_deflation_start)
+			.len()
+			.try_into()
+			.expect(
+				"We only deal with 32-bit indexed pages, so our number of pages has to fit in a u32",
+			)
+	}
+
+	pub fn is_empty(&self) -> bool {
+		self.allocation_ptr.is_none()
+	}
+
+	pub fn phys_page_indices(&self) -> impl Iterator<Item = u32> + '_ {
+		self.page_indices.iter().copied()
+	}
+
+	#[must_use = "this returns an object representing the allocation; unless stored, the allocation is leaked"]
+	pub fn try_allocate(
+		talc: &mut Talc<HermitOomHandler>,
+		num_pages: NonZeroU32,
+	) -> Result<Self, ()> {
+		// SAFETY: We require a non-zero number of pages, from which we construct
+		// a non-zero-sized layout of this many 4K pages.
+		let allocation_ptr = unsafe {
+			talc.malloc_without_oom_handler(Self::layout(num_pages.try_into().expect(
+				"We don't support 16-bit or narrower platforms so a u32 should fit into a usize",
+			)))
+		}?;
+
+		let num_pages = num_pages.get() as usize;
+
+		let mut page_indices = Vec::with_capacity_in(num_pages, DeviceAlloc);
+		(0..num_pages)
+			.map(|offset| VirtAddr::from_ptr(allocation_ptr.as_ptr()) + offset * BALLOON_PAGE_SIZE)
+			.map(|virt_addr| {
+				virtual_to_physical(virt_addr)
+					.expect("We only deal with virtual addresses that are mapped")
+			})
+			.map(|phys_addr| {
+				u32::try_from(phys_addr.as_u64() / BALLOON_PAGE_SIZE as u64)
+					.expect("Balloon cannot handle physical pages above 16TiB")
+			})
+			.collect_into(&mut page_indices);
+
+		Ok(Self {
+			allocation_ptr: Some(allocation_ptr),
+			page_indices,
+			queued_for_deflation_start: num_pages,
+		})
+	}
+
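+	/// The tail of [`Self::page_indices`] that is already marked as queued for deflation.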
+	fn pages_queued_for_deflation(&self) -> &[u32] {
+		&self.page_indices[self.queued_for_deflation_start..]
+	}
+
+	pub fn mark_queued_for_deflation(
+		&mut self,
+		num_pages_to_mark: u32,
+	) -> impl Iterator<Item = u32> + '_ {
+		let num_previously_marked = self.pages_queued_for_deflation().len();
+
+		assert!(
+			num_pages_to_mark as usize <= self.page_indices.len() - num_previously_marked,
+			"Cannot mark more pages for deflation than are still contained and unmarked in the chunk"
+		);
+
+		let num_allocated = self.page_indices.len();
+
+		trace!(
+			" Marking {num_pages_to_mark} pages for chunk: {num_allocated} (of that {num_previously_marked} marked for deflation) -> {num_allocated} (of that {} marked for deflation)",
+			num_previously_marked + num_pages_to_mark as usize
+		);
+
+		self.queued_for_deflation_start -= num_pages_to_mark as usize;
+
+		self.pages_queued_for_deflation()[..num_pages_to_mark as usize]
+			.iter()
+			.copied()
+	}
+
+	pub fn can_shrink_by_pages(&self, page_indices: &[u32]) -> bool {
+		self.pages_queued_for_deflation()
+			.iter()
+			.rev()
+			.zip(page_indices.iter().rev())
+			.all(|(marked, deflated)| *marked == *deflated)
+	}
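+	// `can_shrink_by_pages` example: with the marked-for-deflation tail [7, 8, 9],
+	// the suffix [8, 9] is accepted, while [7, 8] is rejected.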
+
+	/// Shrinks the allocated chunk by `pages_to_shrink`.
+	/// If pages remain in the chunk after shrinking, [`ShrinkResult::PagesRemain`]
+	/// is returned. Otherwise the whole chunk has been deallocated,
+	/// [`ShrinkResult::Deallocated`] is returned, and the chunk must no longer be used.
+	///
+	/// `pages_to_shrink` should be a list of page indices previously returned by
+	/// [`Self::mark_queued_for_deflation`]. They should be submitted in the order
+	/// in which they were returned by [`Self::mark_queued_for_deflation`], both within
+	/// such a list and across multiple calls of this function with different lists.
+	/// This ensures we can actually shrink our allocation.
+	///
+	/// # Safety
+	/// Must be called with the same instance of [`Talc`] that was provided to
+	/// [`Self::try_allocate`] to create this instance of [`BalloonAllocation`].
+	///
+	/// Must not be called while the host still has ownership of any of the pages
+	/// that are a part of the allocation represented by this struct.
+	/// I.e. deallocation may only take place once the host has returned ownership
+	/// back to us for all pages of this allocation.
+	///
+	/// # Panics
+	/// If `pages_to_shrink` contains page indices of pages not marked as queued for deflation.
+	#[must_use = "the result indicates whether the chunk has been fully deallocated and must be removed; ignoring it risks leaking the allocation"]
+	pub unsafe fn shrink(
+		&mut self,
+		talc: &mut Talc<HermitOomHandler>,
+		pages_to_shrink: Vec<u32, DeviceAlloc>,
+	) -> ShrinkResult {
+		let num_previously_marked = self.pages_queued_for_deflation().len();
+		assert!(
+			pages_to_shrink.len() <= num_previously_marked,
+			"Must mark the amount of the allocation chunk to be shrunk for deflation before shrinking"
+		);
+
+		if self.is_empty() {
+			warn!(" Attempted to shrink already empty balloon allocation chunk");
+			return ShrinkResult::Deallocated;
+		}
+
+		if pages_to_shrink.is_empty() {
+			return ShrinkResult::PagesRemain;
+		}
+
+		trace!(
+			" Shrinking chunk by {} pages: {} (of that {} marked for deflation) -> {} (of that {} marked for deflation)",
+			pages_to_shrink.len(),
+			self.page_indices.len(),
+			num_previously_marked,
+			self.page_indices.len() - pages_to_shrink.len(),
+			num_previously_marked - pages_to_shrink.len(),
+		);
+
+		let old_layout = self
+			.current_layout()
+			.expect("We checked above that we have at least one page still allocated");
+
+		// Find the position in `self.page_indices` from which we want to start shrinking.
+		// Only look through the sub-slice of it that is actually marked as queued for
+		// deflation to find the index.
+		let Some(first_to_shrink) = self
+			.pages_queued_for_deflation()
+			.iter()
+			.position(|page_index| *page_index == pages_to_shrink[0])
+			.map(|index| self.queued_for_deflation_start + index)
+		else {
+			error!(
+				" First page to shrink ({}) was not found inside balloon allocation chunk, can't shrink",
+				pages_to_shrink[0]
+			);
+			panic!("Attempted to shrink balloon allocation chunk by page not inside the chunk")
+		};
+
+		if !self
+			.pages_queued_for_deflation()
+			.iter()
+			.last()
+			.is_some_and(|page_index| {
+				page_index
+					== pages_to_shrink
+						.last()
+						.expect("We checked for non-emptiness above")
+			}) {
+			error!(
+				" Last page to shrink {} was not found inside balloon allocation chunk, can't shrink",
+				pages_to_shrink
+					.last()
+					.expect("We checked for non-emptiness above")
+			);
+			panic!(
+				"Attempted to shrink balloon allocation chunk by pages not consecutively at the end of the chunk"
+			)
+		}
+
+		for (page_index_to_shrink, page_index_marked) in pages_to_shrink
+			.into_iter()
+			.zip(self.page_indices.drain(first_to_shrink..))
+		{
+			assert!(
+				page_index_to_shrink == page_index_marked,
+				"Attempted to shrink balloon allocation chunk by page not inside the chunk"
+			);
+		}
+
+		let new_num_pages = self.page_indices.len();
+
+		let res = if new_num_pages == 0 {
+			trace!(
+				" Deallocating balloon chunk as all its pages were shrunk away after acknowledged deflation"
+			);
+
+			trace!(
+				" Freeing ptr={:x?}, layout={old_layout:?}",
+				self.allocation_ptr
+			);
+			// SAFETY: We require that our caller ensures that the same `Talc`
+			// instance is passed here as the one used to allocate our
+			// `BalloonAllocation`. As we don't expose our pointer, or
+			// allow other modification from outside, it must have been
+			// allocated with the given `Talc` instance.
+			// We track the size of our allocation beginning with the initial
+			// allocation and also during shrinking operations. Our alignment
+			// is always to 4K page boundaries. We thus ensure the correct
+			// layout is passed here.
+			unsafe {
+				talc.free(
+					self.allocation_ptr
+						.take()
+						.expect("We checked above that we still have at least one page allocated"),
+					old_layout,
+				);
+			}
+
+			ShrinkResult::Deallocated
+		} else {
+			trace!(
+				" Shrinking chunk with {} pages still remaining of which {} pages marked queued for deflation",
+				self.page_indices.len(),
+				self.pages_queued_for_deflation().len()
+			);
+
+			trace!(
+				" shrinking ptr={:x?}, old_layout={old_layout:?}, len={new_num_pages}",
+				self.allocation_ptr
+			);
+			// SAFETY: We require that our caller ensures that the same `Talc`
+			// instance is passed here as the one used to allocate our
+			// `BalloonAllocation`. As we don't expose our pointer, or
+			// allow other modification from outside, it must have been
+			// allocated with the given `Talc` instance.
+			// We track the size of our allocation beginning with the initial
+			// allocation and also during shrinking operations. Our alignment
+			// is always to 4K page boundaries. We thus ensure the correct
+			// old layout is passed here.
+			// This branch cannot be reached if the new size is zero.
+			// The size can also not be larger than the old size, as we
+			// take a non-negative amount to shrink by as our parameter,
+			// not a new size.
+			unsafe {
+				talc.shrink(
+					self.allocation_ptr
+						.expect("We checked above that we still have at least one page allocated"),
+					old_layout,
+					new_num_pages * BALLOON_PAGE_SIZE,
+				);
+			}
+
+			ShrinkResult::PagesRemain
+		};
+
+		trace!(" Done shrinking");
+
+		res
+	}
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum ShrinkResult {
+	PagesRemain,
+	Deallocated,
+}
diff --git a/src/drivers/balloon/oom.rs b/src/drivers/balloon/oom.rs
new file mode 100644
index 0000000000..ad7702036b
--- /dev/null
+++ b/src/drivers/balloon/oom.rs
@@ -0,0 +1,61 @@
+use core::alloc::Layout;
+
+use talc::{OomHandler, Talc};
+
+use crate::drivers::pci::get_balloon_driver;
+
+/// [`Talc`] out-of-memory handler that attempts to recover memory previously
+/// returned to the host via the VIRTIO Traditional Memory Balloon device.
+///
+/// It attempts to deflate the balloon (re-acquiring memory from the host, and
+/// freeing the allocations made by the balloon driver in the host's stead) by
+/// the amount required for the allocation that would have failed. If the balloon
+/// is filled with fewer pages than would be required to cover the allocation's
+/// size, this handler still attempts to recover as many as possible.
+///
+/// Memory freed across chunks of pages allocated for the balloon may not be
+/// contiguous. This means that even if we free as many bytes as required for the
+/// allocation, we may not have freed enough _contiguous_ memory for it. This is
+/// OK, however, and [`Talc`] will simply call our handler again until we've either
+/// exhausted the memory available for recovery from the host, or the allocation
+/// succeeds.
+pub struct DeflateBalloonOnOom {
+	/// Dummy field to prevent construction of the struct except through [`Self::new`],
+	/// which is marked `unsafe` and documents our requirements for safety.
+	#[doc(hidden)]
+	_private: (),
+}
+
+impl DeflateBalloonOnOom {
+	/// Construct a new instance of the balloon-deflating [`OomHandler`] for [`Talc`].
+	///
+	/// # Safety
+	/// May only be used with the one instance of [`Talc`] registered as Hermit's
+	/// global allocator.
+	pub const unsafe fn new() -> Self {
+		Self { _private: () }
+	}
+}
+
+impl OomHandler for DeflateBalloonOnOom {
+	fn handle_oom(talc: &mut Talc<Self>, layout: Layout) -> Result<(), ()> {
+		warn!(" Encountered OOM, attempting to deflate balloon to recover...");
+
+		let Some(balloon_driver) = get_balloon_driver() else {
+			return Err(());
+		};
+
+		let Some(mut balloon_driver_guard) = balloon_driver.try_lock() else {
+			error!(
+				" Driver was locked while attempting to allocate more than available. Unable to deflate balloon"
+			);
+			return Err(());
+		};
+
+		// For Talc's tag adjacent to the allocation, just always free one page more.
+		// Divide rounding up so the allocation always fits even if its size is not a multiple of 4K pages.
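+		// E.g. for a (hypothetical) failed 10000-byte allocation:
+		// 10000.div_ceil(4096) = 3, plus one spare page -> 4 pages are deflated.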
+		unsafe {
+			balloon_driver_guard.deflate_for_oom(talc, (layout.size().div_ceil(4096)) as u32 + 1)
+		}
+	}
+}
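+
+// A minimal sketch of how this handler is meant to be hooked up (illustrative
+// only; the actual global-allocator wiring lives in Hermit's mm code and may
+// differ):
+//
+//     use talc::Talc;
+//     // SAFETY: this is the one `Talc` instance used as the global allocator.
+//     let talc = Talc::new(unsafe { DeflateBalloonOnOom::new() });
+//
+// `Talc::new` takes the OOM handler by value, so the handler chosen here
+// determines what happens when an allocation fails.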
diff --git a/src/drivers/balloon/pci.rs b/src/drivers/balloon/pci.rs
new file mode 100644
index 0000000000..e156a22198
--- /dev/null
+++ b/src/drivers/balloon/pci.rs
@@ -0,0 +1,117 @@
+use virtio::pci::IsrStatus;
+use volatile::VolatileRef;
+
+use super::{BalloonDevCfg, BalloonStorage, BalloonVq, VirtioBalloonDriver, VirtioBalloonError};
+use crate::drivers::pci::PciDevice;
+use crate::drivers::virtio::error::VirtioError;
+use crate::drivers::virtio::transport::pci::{self as virtio_pci, PciCap, UniCapsColl};
+use crate::pci::PciConfigRegion;
+
+impl VirtioBalloonDriver {
+	pub fn get_dev_id(&self) -> u16 {
+		self.dev_cfg.dev_id
+	}
+
+	pub fn set_failed(&mut self) {
+		self.com_cfg.set_failed();
+	}
+
+	fn map_cfg(cap: &PciCap) -> Option<BalloonDevCfg> {
+		let dev_cfg = virtio_pci::map_dev_cfg::<virtio::balloon::Config>(cap)?;
+
+		let dev_cfg = VolatileRef::from_mut_ref(dev_cfg);
+
+		Some(BalloonDevCfg {
+			raw: dev_cfg,
+			dev_id: cap.dev_id(),
+			features: virtio::balloon::F::empty(),
+		})
+	}
+
+	/// Instantiates a new [`VirtioBalloonDriver`] struct by checking the available
+	/// configuration structures and moving them into the struct.
+	fn new(
+		caps_coll: UniCapsColl,
+		device: &PciDevice<PciConfigRegion>,
+	) -> Result<Self, VirtioBalloonError> {
+		let device_id = device.device_id();
+
+		let UniCapsColl {
+			com_cfg,
+			notif_cfg,
+			isr_cfg,
+			dev_cfg_list,
+			..
+		} = caps_coll;
+
+		let Some(dev_cfg) = dev_cfg_list.iter().find_map(VirtioBalloonDriver::map_cfg) else {
+			error!(" No dev config. Aborting!");
+			return Err(VirtioBalloonError::NoDevCfg { device_id });
+		};
+
+		Ok(VirtioBalloonDriver {
+			dev_cfg,
+			com_cfg,
+			isr_stat: isr_cfg,
+			notif_cfg,
+			irq: device.get_irq().unwrap(),
+
+			inflateq: BalloonVq::new(),
+			deflateq: BalloonVq::new(),
+
+			num_in_balloon: 0,
+			num_pending_inflation: 0,
+			num_pending_deflation: 0,
+			num_targeted: 0,
+
+			balloon_storage: BalloonStorage::new(),
+			last_voluntary_inflate: 0,
+		})
+	}
+
+	/// Initialize a new VIRTIO Traditional Memory Balloon device based on the given PCI device.
+	pub fn from_pci_device(device: &PciDevice<PciConfigRegion>) -> Result<Self, VirtioError> {
+		let caps = virtio_pci::map_caps(device).inspect_err(|_| {
+			error!(" Mapping capabilities failed. Aborting!");
+		})?;
+
+		let mut driver = VirtioBalloonDriver::new(caps, device)
+			.inspect_err(|_| {
+				error!(" Initializing new driver failed. Aborting!");
+			})
+			.map_err(VirtioError::BalloonDriver)?;
+
+		driver
+			.init_dev()
+			.inspect_err(|_| driver.set_failed())
+			.map_err(VirtioError::BalloonDriver)?;
+
+		info!(
+			" device with id {:x} has been initialized by the driver!",
+			driver.get_dev_id()
+		);
+
+		Ok(driver)
+	}
+
+	pub fn handle_interrupt(&mut self) {
+		let status = self.isr_stat.is_queue_interrupt();
+
+		if status.contains(IsrStatus::DEVICE_CONFIGURATION_INTERRUPT) {
+			debug!(
+				" Received config interrupt, new config: {:?}",
+				self.dev_cfg
+			);
+		}
+
+		if status.contains(IsrStatus::QUEUE_INTERRUPT) {
+			debug!(" Received queue interrupt");
+		}
+
+		// TODO: wake tasks via wakers once introduced (currently every task just gets polled round-robin, always)
+
+		self.isr_stat.acknowledge();
+	}
+}
diff --git a/src/drivers/mod.rs b/src/drivers/mod.rs
index 86629f1779..f1249bcace 100644
--- a/src/drivers/mod.rs
+++ b/src/drivers/mod.rs
@@ -1,5 +1,7 @@
 //! A module containing hermit-rs driver, hermit-rs driver trait and driver specific errors.
 
+#[cfg(feature = "balloon")]
+pub mod balloon;
 #[cfg(feature = "fuse")]
 pub mod fs;
 #[cfg(not(feature = "pci"))]
@@ -11,7 +13,8 @@ pub mod pci;
 #[cfg(any(
 	all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 	feature = "fuse",
-	feature = "vsock"
+	feature = "vsock",
+	feature = "balloon"
 ))]
 pub mod virtio;
 #[cfg(feature = "vsock")]
@@ -39,7 +42,8 @@ pub mod error {
 	#[cfg(any(
 		all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 		feature = "fuse",
-		feature = "vsock"
+		feature = "vsock",
+		feature = "balloon"
 	))]
 	use crate::drivers::virtio::error::VirtioError;
 
@@ -48,7 +52,8 @@ pub mod error {
 	#[cfg(any(
 		all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 		feature = "fuse",
-		feature = "vsock"
+		feature = "vsock",
+		feature = "balloon"
 	))]
 	InitVirtioDevFail(VirtioError),
 	#[cfg(all(target_arch = "x86_64", feature = "rtl8139"))]
@@ -60,7 +65,8 @@ pub mod error {
 	#[cfg(any(
 		all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 		feature = "fuse",
-		feature = "vsock"
+		feature = "vsock",
+		feature = "balloon"
 	))]
 	impl From<VirtioError> for DriverError {
 		fn from(err: VirtioError) -> Self {
@@ -89,7 +95,8 @@ pub mod error {
 	#[cfg(any(
 		all(any(feature = "tcp", feature = "udp"), not(feature = "rtl8139")),
 		feature = "fuse",
-		feature = "vsock"
+		feature = "vsock",
+		feature = "balloon"
 	))]
 	DriverError::InitVirtioDevFail(ref err) => {
 		write!(f, "Virtio driver failed: {err:?}")
diff --git a/src/drivers/pci.rs b/src/drivers/pci.rs
index ed28afe116..f47af6b35b 100644
--- a/src/drivers/pci.rs
+++ b/src/drivers/pci.rs
@@ -6,7 +6,13 @@ use core::fmt;
 use ahash::RandomState;
 use hashbrown::HashMap;
-#[cfg(any(feature = "tcp", feature = "udp", feature = "fuse", feature = "vsock"))]
+#[cfg(any(
+	feature = "tcp",
+	feature = "udp",
+	feature = "fuse",
+	feature = "vsock",
+	feature = "balloon"
+))]
 use hermit_sync::InterruptTicketMutex;
 use hermit_sync::without_interrupts;
 use memory_addresses::{PhysAddr, VirtAddr};
@@ -17,6 +23,8 @@ use pci_types::{
 };
 
 use crate::arch::pci::PciConfigRegion;
+#[cfg(feature = "balloon")]
+use crate::drivers::balloon::VirtioBalloonDriver;
 #[cfg(feature = "fuse")]
 use crate::drivers::fs::virtio_fs::VirtioFsDriver;
 #[cfg(any(feature = "tcp", feature = "udp"))]
@@ -34,7 +42,8 @@ use crate::drivers::net::virtio::VirtioNetDriver;
 		not(all(target_arch = "x86_64", feature = "rtl8139"))
 	),
 	feature = "fuse",
-	feature = "vsock"
+	feature = "vsock",
+	feature = "balloon"
 ))]
 use crate::drivers::virtio::transport::pci as pci_virtio;
 #[cfg(any(
@@ -43,7 +52,8 @@ use crate::drivers::virtio::transport::pci as pci_virtio;
 		not(all(target_arch = "x86_64", feature = "rtl8139"))
 	),
 	feature = "fuse",
-	feature = "vsock"
+	feature = "vsock",
+	feature = "balloon"
 ))]
 use crate::drivers::virtio::transport::pci::VirtioDriver;
 #[cfg(feature = "vsock")]
@@ -344,6 +354,8 @@ pub(crate) enum PciDriver {
 		any(feature = "tcp", feature = "udp")
 	))]
 	RTL8139Net(InterruptTicketMutex<RTL8139Driver>),
+	#[cfg(feature = "balloon")]
+	VirtioBalloon(InterruptTicketMutex<VirtioBalloonDriver>),
 }
 
 impl PciDriver {
@@ -390,6 +402,15 @@ impl PciDriver {
 		}
 	}
 
+	#[cfg(feature = "balloon")]
+	fn get_balloon_driver(&self) -> Option<&InterruptTicketMutex<VirtioBalloonDriver>> {
+		#[allow(unreachable_patterns)]
+		match self {
+			Self::VirtioBalloon(drv) => Some(drv),
+			_ => None,
+		}
+	}
+
 	fn get_interrupt_handler(&self) -> (InterruptLine, fn()) {
 		#[allow(unreachable_patterns)]
 		match self {
@@ -444,6 +465,18 @@ impl PciDriver {
 				(irq_number, fuse_handler)
 			}
+			#[cfg(feature = "balloon")]
+			Self::VirtioBalloon(drv) => {
+				fn balloon_handler() {
+					if let Some(driver) = get_balloon_driver() {
+						driver.lock().handle_interrupt();
+					}
+				}
+
+				let irq_number = drv.lock().get_interrupt_number();
+
+				(irq_number, balloon_handler)
+			}
 			_ => todo!(),
 		}
 	}
@@ -512,6 +545,14 @@ pub(crate) fn get_filesystem_driver() -> Option<&'static InterruptTicketMutex<VirtioFsDriver>> {
+#[cfg(feature = "balloon")]
+pub(crate) fn get_balloon_driver() -> Option<&'static InterruptTicketMutex<VirtioBalloonDriver>> {
+	PCI_DRIVERS
+		.get()?
+		.iter()
+		.find_map(|drv| drv.get_balloon_driver())
+}
+
 pub(crate) fn init() {
 	// virtio: 4.1.2 PCI Device Discovery
 	without_interrupts(|| {
@@ -530,7 +571,8 @@ pub(crate) fn init() {
 			not(all(target_arch = "x86_64", feature = "rtl8139"))
 		),
 		feature = "fuse",
-		feature = "vsock"
+		feature = "vsock",
+		feature = "balloon"
 	))]
 	match pci_virtio::init_device(adapter) {
 		#[cfg(all(
@@ -548,6 +590,10 @@ pub(crate) fn init() {
 		Ok(VirtioDriver::FileSystem(drv)) => {
 			register_driver(PciDriver::VirtioFs(InterruptTicketMutex::new(drv)));
 		}
+		#[cfg(feature = "balloon")]
+		Ok(VirtioDriver::Balloon(drv)) => {
+			register_driver(PciDriver::VirtioBalloon(InterruptTicketMutex::new(drv)));
+		}
 		_ => {}
 	}
 }
diff --git a/src/drivers/virtio/mod.rs b/src/drivers/virtio/mod.rs
index ec0d275363..52740fcbdb 100644
--- a/src/drivers/virtio/mod.rs
+++ b/src/drivers/virtio/mod.rs
@@ -7,6 +7,8 @@ pub mod virtqueue;
 pub mod error {
 	use core::fmt;
 
+	#[cfg(feature = "balloon")]
+	use crate::drivers::balloon::VirtioBalloonError;
 	#[cfg(feature = "fuse")]
 	pub use crate::drivers::fs::virtio_fs::error::VirtioFsError;
 	#[cfg(all(
@@ -40,6 +42,8 @@
 		FsDriver(VirtioFsError),
 		#[cfg(feature = "vsock")]
 		VsockDriver(VirtioVsockError),
+		#[cfg(feature = "balloon")]
+		BalloonDriver(VirtioBalloonError),
 		#[cfg(not(feature = "pci"))]
 		Unknown,
 	}
@@ -174,6 +178,31 @@
 					)
 				}
 			},
+			#[cfg(feature = "balloon")]
+			VirtioError::BalloonDriver(balloon_error) => match balloon_error {
+				#[cfg(feature = "pci")]
+				VirtioBalloonError::NoDevCfg { device_id } => write!(
+					f,
+					"Virtio traditional memory balloon device driver failed, for device {device_id:x}, due to a missing or malformed device config!"
+				),
+				VirtioBalloonError::FeatureNegotiationFailed { device_id } => write!(
+					f,
+					"Virtio traditional memory balloon device driver failed, for device {device_id:x}: the device did not acknowledge the negotiated feature set!"
+				),
+				VirtioBalloonError::FeatureRequirementsNotMet { driver_features } => write!(
+					f,
+					"Virtio traditional memory balloon device driver tried to set a feature bit without setting its dependency feature. Feature set: {driver_features:?}"
+				),
+				VirtioBalloonError::IncompatibleFeatureSets {
+					driver_features,
+					device_features,
+				} => {
+					write!(
+						f,
+						"Feature set {driver_features:?} is incompatible with the device features {device_features:?}"
+					)
+				}
+			},
 		}
 	}
 }
diff --git a/src/drivers/virtio/transport/pci.rs b/src/drivers/virtio/transport/pci.rs
index fec67ae4e4..bb122a70b3 100644
--- a/src/drivers/virtio/transport/pci.rs
+++ b/src/drivers/virtio/transport/pci.rs
@@ -20,6 +20,8 @@ use volatile::{VolatilePtr, VolatileRef};
 
 use crate::arch::memory_barrier;
 use crate::arch::pci::PciConfigRegion;
+#[cfg(feature = "balloon")]
+use crate::drivers::balloon::VirtioBalloonDriver;
 use crate::drivers::error::DriverError;
 #[cfg(feature = "fuse")]
 use crate::drivers::fs::virtio_fs::VirtioFsDriver;
@@ -861,6 +863,24 @@ pub(crate) fn init_device(
 			}
 		}
 	}
+		#[cfg(feature = "balloon")]
+		virtio::Id::Balloon => match VirtioBalloonDriver::from_pci_device(device) {
+			Ok(virtio_balloon_driver) => {
+				info!("Virtio traditional memory balloon driver initialized.");
+
+				let irq = device.get_irq().unwrap();
+				crate::arch::interrupts::add_irq_name(irq, "virtio-balloon");
+				info!("Virtio balloon interrupt handler at line {irq}");
+
+				Ok(VirtioDriver::Balloon(virtio_balloon_driver))
+			}
+			Err(virtio_error) => {
+				error!(
+					"Virtio traditional memory balloon driver could not be initialized with device id {device_id:x}: {virtio_error}"
+				);
+				Err(DriverError::InitVirtioDevFail(virtio_error))
+			}
+		},
 		id => {
 			warn!("Virtio device {id:?} is not supported, skipping!");
@@ -882,4 +902,6 @@ pub(crate) enum VirtioDriver {
 	Vsock(Box<VirtioVsockDriver>),
 	#[cfg(feature = "fuse")]
 	FileSystem(VirtioFsDriver),
+	#[cfg(feature = "balloon")]
+	Balloon(VirtioBalloonDriver),
 }
diff --git a/src/drivers/virtio/virtqueue/mod.rs b/src/drivers/virtio/virtqueue/mod.rs
index 1df59a091b..49892336a3 100644
--- a/src/drivers/virtio/virtqueue/mod.rs
+++ b/src/drivers/virtio/virtqueue/mod.rs
@@ -148,6 +148,9 @@ pub trait Virtq: Send {
 	/// Disables interrupts for this virtqueue upon receiving a transfer
 	fn disable_notifs(&mut self);
 
+	/// Check if there are no more descriptors left in the queue.
+	fn is_empty(&self) -> bool;
+
 	/// Checks if new used descriptors have been written by the device.
 	/// This activates the queue and polls the descriptor ring of the queue.
 	fn try_recv(&mut self) -> Result<UsedBufferToken, VirtqError>;
@@ -569,6 +572,14 @@ impl MemPool {
 			limit: size,
 		}
 	}
+
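+	// `all_used`: every descriptor has been handed out (the pool is exhausted);
+	// `all_available`: every descriptor has been returned, i.e. the queue is empty.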
diff --git a/src/drivers/virtio/virtqueue/packed.rs b/src/drivers/virtio/virtqueue/packed.rs
index 20ebba75a3..2af99f657b 100644
--- a/src/drivers/virtio/virtqueue/packed.rs
+++ b/src/drivers/virtio/virtqueue/packed.rs
@@ -532,6 +532,10 @@ impl Virtq for PackedVq {
 		self.drv_event.disable_notif();
 	}
 
+	fn is_empty(&self) -> bool {
+		todo!()
+	}
+
 	fn try_recv(&mut self) -> Result<UsedBufferToken, VirtqError> {
 		self.descr_ring.try_recv()
 	}
diff --git a/src/drivers/virtio/virtqueue/split.rs b/src/drivers/virtio/virtqueue/split.rs
index a7fafa6853..f472b112ac 100644
--- a/src/drivers/virtio/virtqueue/split.rs
+++ b/src/drivers/virtio/virtqueue/split.rs
@@ -28,7 +28,7 @@ use crate::mm::device_alloc::DeviceAlloc;
 
 struct DescrRing {
 	read_idx: u16,
-	token_ring: Box<[Option<Box<TransferToken>>]>,
+	token_ring: Box<[Option<TransferToken>]>,
 	mem_pool: MemPool,
 
 	/// Descriptor Tables
@@ -56,22 +56,38 @@ impl DescrRing {
 		unsafe { &*self.used_ring_cell.get() }
 	}
 
+	fn is_empty(&self) -> bool {
+		self.mem_pool.all_available()
+	}
+
 	fn push(&mut self, tkn: TransferToken) -> Result<u16, VirtqError> {
 		let mut index;
 		if let Some(ctrl_desc) = tkn.ctrl_desc.as_ref() {
+			trace!(" Creating indirect descriptor");
 			let descriptor = SplitVq::indirect_desc(ctrl_desc.as_ref());
 
+			trace!(" Attempting to assign descriptor to free slot in table");
+
 			index = self.mem_pool.pool.pop().ok_or(VirtqError::NoDescrAvail)?.0;
+			trace!(" Assigned one descriptor (indirect)");
 			self.descr_table_mut()[usize::from(index)] = MaybeUninit::new(descriptor);
 		} else {
+			trace!(" Creating direct descriptor iterator");
 			let mut rev_all_desc_iter = SplitVq::descriptor_iter(&tkn.buff_tkn)?.rev();
 
+			trace!(
+				" Attempting to assign descriptors to free slots in table in reverse order"
+			);
+
+			let mut num_descriptors_assigned = 0;
+
 			// We need to handle the last descriptor (the first for the reversed iterator) specially to not set the next flag.
 			{
 				// If the [AvailBufferToken] is empty, we panic
 				let descriptor = rev_all_desc_iter.next().unwrap();
 
 				index = self.mem_pool.pool.pop().ok_or(VirtqError::NoDescrAvail)?.0;
+				num_descriptors_assigned += 1;
 				self.descr_table_mut()[usize::from(index)] = MaybeUninit::new(descriptor);
 			}
 			for mut descriptor in rev_all_desc_iter {
@@ -79,13 +95,19 @@ impl DescrRing {
 				descriptor.next = le16::from(index);
 
 				index = self.mem_pool.pool.pop().ok_or(VirtqError::NoDescrAvail)?.0;
+				num_descriptors_assigned += 1;
 				self.descr_table_mut()[usize::from(index)] = MaybeUninit::new(descriptor);
 			}
 			// At this point, `index` is the index of the last element of the reversed iterator,
 			// thus the head of the descriptor chain.
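+			// The chain is assembled back to front because each descriptor
+			// stores its successor's table index in `next`: a descriptor can
+			// only be written once its successor's slot is known.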
+			trace!(" Assigned {num_descriptors_assigned} descriptors (direct)");
 		}
 
-		self.token_ring[usize::from(index)] = Some(Box::new(tkn));
+		trace!(" Inserting transfer token into token ring at index {index}");
+
+		self.token_ring[usize::from(index)] = Some(tkn);
+
+		trace!(" Updating available ring");
 
 		let len = self.token_ring.len();
 		let idx = self.avail_ring_mut().idx.to_ne();
@@ -111,9 +133,11 @@ impl DescrRing {
 			"The buff_id is incorrect or the reference to the TransferToken was misplaced.",
 		);
 
+		let mut num_descriptors_freed = 0;
 		// We return the indices of the now freed ring slots back to `mem_pool`.
 		let mut id_ret_idx = u16::try_from(used_elem.id.to_ne()).unwrap();
 		loop {
+			num_descriptors_freed += 1;
 			self.mem_pool.ret_id(super::MemDescrId(id_ret_idx));
 			let cur_chain_elem =
 				unsafe { self.descr_table_mut()[usize::from(id_ret_idx)].assume_init() };
@@ -123,6 +147,7 @@ impl DescrRing {
 				break;
 			}
 		}
+		trace!(" Freed {num_descriptors_freed} descriptors");
 
 		memory_barrier();
 		self.read_idx = self.read_idx.wrapping_add(1);
@@ -171,6 +196,10 @@ impl Virtq for SplitVq {
 		self.ring.try_recv()
 	}
 
+	fn is_empty(&self) -> bool {
+		self.ring.is_empty()
+	}
+
 	fn dispatch_batch(
 		&mut self,
 		_tkns: Vec<(AvailBufferToken, BufferType)>,
@@ -193,7 +222,15 @@ impl Virtq for SplitVq {
 		notif: bool,
 		buffer_type: BufferType,
 	) -> Result<(), VirtqError> {
+		// IMPORTANT: This function must not allocate from GlobalAlloc when
+		// balloon device support is enabled: the inflate/deflate operations
+		// run with the global allocator locked and need to send descriptors
+		// into their respective queues, so allocating with the global
+		// allocator here would deadlock.
+
+		trace!(" Creating transfer token");
 		let transfer_tkn = Self::transfer_token_from_buffer_token(buffer_tkn, buffer_type);
+		trace!(" Pushing transfer token to descriptor ring");
 		let next_idx = self.ring.push(transfer_tkn)?;
 
 		if notif {
diff --git a/src/executor/alloc_stats.rs b/src/executor/alloc_stats.rs
new file mode 100644
index 0000000000..63cfea2c6e
--- /dev/null
+++ b/src/executor/alloc_stats.rs
@@ -0,0 +1,20 @@
+use core::task::Poll;
+
+use crate::executor::spawn;
+use crate::mm::ALLOCATOR;
+
+async fn print_alloc_stats() {
+	core::future::poll_fn::<(), _>(|_cx| {
+		let talc = ALLOCATOR.inner().lock();
+
+		debug!("\n{}", talc.get_counters());
+
+		Poll::Pending
+	})
+	.await;
+}
+
+pub(crate) fn init() {
+	info!("Spawning allocation stats printing task");
+	spawn(print_alloc_stats());
+}
diff --git a/src/executor/balloon.rs b/src/executor/balloon.rs
new file mode 100644
index 0000000000..fc017b2710
--- /dev/null
+++ b/src/executor/balloon.rs
@@ -0,0 +1,35 @@
+use core::future;
+use core::task::Poll;
+
+use crate::drivers::pci;
+use crate::executor::spawn;
+
+async fn balloon_run() {
+	future::poll_fn(|_cx| {
+		if let Some(driver) = pci::get_balloon_driver() {
+			let Some(mut driver_guard) = driver.try_lock() else {
+				debug!(
+					"Balloon driver was polled while the driver was locked elsewhere, doing nothing"
+				);
+				return Poll::Pending;
+			};
+
+			driver_guard.poll_events();
+
+			Poll::Pending
+		} else {
+			Poll::Ready(())
+		}
+	})
+	.await;
+}
+
+pub(crate) fn init() {
+	info!("Try to initialize balloon interface!");
+
+	if let Some(driver) = pci::get_balloon_driver() {
+		driver.lock().enable_interrupts();
+	}
+
+	spawn(balloon_run());
+}
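(Both new executor tasks use the same idiom: a `poll_fn` future that returns `Poll::Pending` without registering a waker, relying on Hermit's executor to re-poll pending tasks cooperatively. A minimal standalone sketch of the pattern; the task body is hypothetical.)

	use core::future;
	use core::task::Poll;

	// A background task that does a bounded amount of non-blocking work on
	// every executor pass and never completes. This is only sound on an
	// executor that re-polls pending tasks without an explicit wake-up.
	async fn background_tick() {
		future::poll_fn(|_cx| {
			// ... bounded, non-blocking work goes here ...
			Poll::<()>::Pending
		})
		.await;
	}
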
diff --git a/src/executor/mod.rs b/src/executor/mod.rs
index 5c5e0e8d1e..8c18ebb3f5 100644
--- a/src/executor/mod.rs
+++ b/src/executor/mod.rs
@@ -1,5 +1,9 @@
 #![allow(dead_code)]
 
+#[cfg(feature = "allocation-stats")]
+mod alloc_stats;
+#[cfg(feature = "balloon")]
+mod balloon;
 #[cfg(any(feature = "tcp", feature = "udp"))]
 pub(crate) mod device;
 #[cfg(any(feature = "tcp", feature = "udp"))]
@@ -127,6 +131,10 @@ pub fn init() {
 	crate::executor::network::init();
 	#[cfg(feature = "vsock")]
 	crate::executor::vsock::init();
+	#[cfg(feature = "allocation-stats")]
+	crate::executor::alloc_stats::init();
+	#[cfg(feature = "balloon")]
+	crate::executor::balloon::init();
 }
 
 /// Blocks the current thread on `f`, running the executor when idling.
diff --git a/src/lib.rs b/src/lib.rs
index 3b1e059f43..0798a7fe3a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -31,6 +31,7 @@
 	reexport_test_harness_main = "test_main"
 )]
 #![cfg_attr(all(target_os = "none", test), no_main)]
+#![cfg_attr(feature = "balloon", feature(iter_collect_into, iter_array_chunks))]
 
 // EXTERNAL CRATES
 #[macro_use]
diff --git a/src/mm/allocator.rs b/src/mm/allocator.rs
index 941fff33b6..7c2adf1faa 100644
--- a/src/mm/allocator.rs
+++ b/src/mm/allocator.rs
@@ -4,15 +4,36 @@
 use core::alloc::{GlobalAlloc, Layout};
 
 use hermit_sync::RawInterruptTicketMutex;
-use talc::{ErrOnOom, Span, Talc, Talck};
+#[cfg(not(feature = "balloon"))]
+use talc::ErrOnOom;
+use talc::{Span, Talc, Talck};
 
-pub struct LockedAllocator(Talck<RawInterruptTicketMutex, ErrOnOom>);
+#[cfg(feature = "balloon")]
+use crate::drivers::balloon::oom::DeflateBalloonOnOom;
+
+#[cfg(not(feature = "balloon"))]
+pub(crate) type HermitOomHandler = ErrOnOom;
+#[cfg(feature = "balloon")]
+pub(crate) type HermitOomHandler = DeflateBalloonOnOom;
+
+pub struct LockedAllocator(Talck<RawInterruptTicketMutex, HermitOomHandler>);
 
 impl LockedAllocator {
+	#[cfg(not(feature = "balloon"))]
 	pub const fn new() -> Self {
 		Self(Talc::new(ErrOnOom).lock())
 	}
 
+	/// # Safety
+	/// May only be used to construct the allocator that will be used as the
+	/// global allocator.
+	#[cfg(feature = "balloon")]
+	pub const unsafe fn new() -> Self {
+		// SAFETY: We pass the requirement that this only be used for the one
+		// global allocator on to our caller.
+		Self(Talc::new(unsafe { DeflateBalloonOnOom::new() }).lock())
+	}
+
 	#[inline]
 	fn align_layout(layout: Layout) -> Layout {
 		let align = layout
@@ -27,6 +48,10 @@ impl LockedAllocator {
 			self.0.lock().claim(arena).unwrap();
 		}
 	}
+
+	pub(crate) fn inner(&self) -> &Talck<RawInterruptTicketMutex, HermitOomHandler> {
+		&self.0
+	}
 }
 
 /// To avoid false sharing, the global memory allocator align
diff --git a/src/mm/mod.rs b/src/mm/mod.rs
index ac4a8edc22..63ff673306 100644
--- a/src/mm/mod.rs
+++ b/src/mm/mod.rs
@@ -17,10 +17,18 @@
 pub use crate::arch::mm::paging::virtual_to_physical;
 use crate::arch::mm::paging::{BasePageSize, LargePageSize, PageSize};
 use crate::{arch, env};
 
-#[cfg(target_os = "none")]
+#[cfg(all(target_os = "none", not(feature = "balloon")))]
 #[global_allocator]
 pub(crate) static ALLOCATOR: LockedAllocator = LockedAllocator::new();
 
+#[cfg(all(target_os = "none", feature = "balloon"))]
+#[global_allocator]
+pub(crate) static ALLOCATOR: LockedAllocator = {
+	// SAFETY: We are constructing this `LockedAllocator` to be Hermit's global
+	// allocator.
+	unsafe { LockedAllocator::new() }
+};
+
 /// Physical and virtual address range of the 2 MiB pages that map the kernel.
 static KERNEL_ADDR_RANGE: Lazy> = Lazy::new(|| {
 	if cfg!(target_os = "none") {