Skip to content

Commit d1c8744

Browse files
committed
reuse a buffer for read/write-through-propolis-heap
this seems to have some effect, but not a lot; nothing when I/Os are all the same size, on the order of 5% better throughput when I/Os vary between 4 KiB, 32 KiB, 256 KiB, and in-between
1 parent 21b3da8 commit d1c8744

2 files changed

Lines changed: 86 additions & 24 deletions

File tree

lib/propolis/src/block/file.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,24 @@ impl SharedState {
5959
}
6060

6161
fn processing_loop(&self, wctx: SyncWorkerCtx) {
62+
// Conspire with gross hack in `preadv()` and `pwritev()`: there, we
63+
// want to give the OS a buffer backed by Propolis heap, rather than a
64+
// VMM segment, to avoid a slow fallback to lock pages in memory during
65+
// the I/O. We could either allocate buffers on-demand sized for the
66+
// I/O, or up front in the worker thread, here. On-demand allocation
67+
// works, but when I/O sizes are mixed the high malloc/free rates
68+
// (X00k/sec/process) get a bit busy on locks in libumem. So instead,
69+
// retain a buffer across I/Os, which is passed into `process_request()`
70+
// and on.
71+
//
72+
// This buffer is grown on-demand when handling an I/O because some
73+
// guests' maximum I/O size seems to be smaller than maximums we
74+
// indicate (for example, Linux seems to send NVMe I/Os 256 KiB or
75+
// smaller, even though we report we'll tolerate up to 1 MiB). In those
76+
// cases, sizing against an expected maximum will grossly oversize
77+
// buffers and increase memory pressure for no good reason.
78+
let mut scratch = Vec::new();
79+
6280
while let Some(dreq) = wctx.block_for_req() {
6381
let req = dreq.req();
6482
if self.info.read_only && req.op.is_write() {
@@ -74,7 +92,8 @@ impl SharedState {
7492
dreq.complete(block::Result::Failure);
7593
continue;
7694
};
77-
let res = match self.process_request(&req, &mem) {
95+
let res = match self.process_request(&req, &mem, Some(&mut scratch))
96+
{
7897
Ok(_) => block::Result::Success,
7998
Err(_) => block::Result::Failure,
8099
};
@@ -86,13 +105,14 @@ impl SharedState {
86105
&self,
87106
req: &block::Request,
88107
mem: &MemCtx,
108+
scratch: Option<&mut Vec<u8>>,
89109
) -> std::result::Result<(), &'static str> {
90110
match req.op {
91111
block::Operation::Read(off, len) => {
92112
let maps = req.mappings(mem).ok_or("mapping unavailable")?;
93113

94114
let nbytes = maps
95-
.preadv(self.fp.as_raw_fd(), off as i64)
115+
.preadv(self.fp.as_raw_fd(), off as i64, scratch)
96116
.map_err(|_| "io error")?;
97117
if nbytes != len {
98118
return Err("bad read length");
@@ -102,7 +122,7 @@ impl SharedState {
102122
let maps = req.mappings(mem).ok_or("bad guest region")?;
103123

104124
let nbytes = maps
105-
.pwritev(self.fp.as_raw_fd(), off as i64)
125+
.pwritev(self.fp.as_raw_fd(), off as i64, scratch)
106126
.map_err(|_| "io error")?;
107127
if nbytes != len {
108128
return Err("bad write length");

lib/propolis/src/vmm/mem.rs

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -781,10 +781,20 @@ unsafe impl Sync for SubMapping<'_> {}
781781

782782
pub trait MappingExt {
783783
/// preadv from `file` into multiple mappings
784-
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize>;
784+
fn preadv(
785+
&self,
786+
fd: RawFd,
787+
offset: i64,
788+
scratch: Option<&mut Vec<u8>>,
789+
) -> Result<usize>;
785790

786791
/// pwritev from multiple mappings to `file`
787-
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
792+
fn pwritev(
793+
&self,
794+
fd: RawFd,
795+
offset: i64,
796+
scratch: Option<&mut Vec<u8>>,
797+
) -> Result<usize>;
788798
}
789799

790800
// Gross hack alert: since the mappings below are memory regions backed by
@@ -825,8 +835,45 @@ fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
825835
})
826836
}
827837

838+
/// Ensure the maybe-provided buffer is ready for I/O of `desired_size`.
839+
///
840+
/// Returns `Some` with a slice with length at least as large as `desired_size`,
841+
/// if the desired size is in the range we're willing to use the I/O buffer
842+
/// proxying workaround.
843+
///
844+
/// If the scratch buffer is provided, was too small, but the desired size is
845+
/// acceptable, this function will grow the scratch buffer.
846+
fn prepare_scratch_buffer(
847+
scratch: Option<&mut Vec<u8>>,
848+
desired_size: usize,
849+
) -> Option<&mut [u8]> {
850+
scratch.and_then(|scratch| {
851+
if desired_size > MAPPING_IO_LIMIT_BYTES {
852+
// Even though the caller provided a buffer, we're not willing to
853+
// grow it to handle this I/O.
854+
return None;
855+
}
856+
857+
if scratch.len() < desired_size {
858+
// The provided buffer is smaller than necessary, but the needed
859+
// size is acceptable, so grow the buffer.
860+
//
861+
// Buffer growth is done lazily because experience shows that guests
862+
// may submit I/Os much smaller than the largest permitted by MDTS.
863+
scratch.resize(desired_size, 0);
864+
}
865+
866+
Some(scratch.as_mut_slice())
867+
})
868+
}
869+
828870
impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
829-
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
871+
fn preadv(
872+
&self,
873+
fd: RawFd,
874+
offset: i64,
875+
scratch: Option<&mut Vec<u8>>,
876+
) -> Result<usize> {
830877
if !self
831878
.as_ref()
832879
.iter()
@@ -841,16 +888,11 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
841888
let total_capacity = total_mapping_size(self.as_ref())?;
842889

843890
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
844-
if total_capacity <= MAPPING_IO_LIMIT_BYTES {
845-
// If we're motivated to avoid the zero-fill via
846-
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
847-
// probably avoid this gross hack entirely (see comment on
848-
// MAPPING_IO_LIMIT_BYTES).
849-
let mut buf = vec![0; total_capacity];
850-
891+
let scratch = prepare_scratch_buffer(scratch, total_capacity);
892+
if let Some(buf) = scratch {
851893
let iov = [iovec {
852894
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
853-
iov_len: buf.len(),
895+
iov_len: total_capacity,
854896
}];
855897

856898
let res = unsafe {
@@ -904,7 +946,12 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
904946
}
905947
}
906948

907-
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
949+
fn pwritev(
950+
&self,
951+
fd: RawFd,
952+
offset: i64,
953+
scratch: Option<&mut Vec<u8>>,
954+
) -> Result<usize> {
908955
if !self
909956
.as_ref()
910957
.iter()
@@ -919,14 +966,9 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
919966
let total_capacity = total_mapping_size(self.as_ref())?;
920967

921968
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
922-
let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
923-
// If we're motivated to avoid the zero-fill via
924-
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
925-
// probably avoid this gross hack entirely (see comment on
926-
// MAPPING_IO_LIMIT_BYTES).
927-
let mut buf = vec![0; total_capacity];
928-
929-
let mut remaining = buf.as_mut_slice();
969+
let scratch = prepare_scratch_buffer(scratch, total_capacity);
970+
let written = if let Some(buf) = scratch {
971+
let mut remaining = &mut buf[..];
930972
for mapping in self.as_ref().iter() {
931973
// The original `buf` is at least as large as all mappings
932974
// combined, so `remaining` is at least as large as this and all
@@ -940,7 +982,7 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
940982

941983
let iovs = [iovec {
942984
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
943-
iov_len: buf.len(),
985+
iov_len: total_capacity,
944986
}];
945987

946988
unsafe {

0 commit comments

Comments
 (0)