diff --git a/.github/workflows/kani.yml b/.github/workflows/kani.yml index 24cbca6f451..1f28f735edd 100644 --- a/.github/workflows/kani.yml +++ b/.github/workflows/kani.yml @@ -60,6 +60,11 @@ jobs: perf: runs-on: ubuntu-24.04 + # The full perf suite baseline runs in ~45 min; with the per-test --timeout + # in scripts/kani-perf.sh as a guardrail, the worst-case suite duration is + # 15 x 600s = 150 min. A retry doubles that ceiling, so the job timeout + # has to sit above (step timeout) x (max_attempts) to let the retry land. + timeout-minutes: 380 permissions: contents: read steps: @@ -72,7 +77,14 @@ jobs: os: ubuntu-24.04 - name: Execute Kani performance tests - run: ./scripts/kani-perf.sh + # Retry on infra-driven failures (spot preemption / runner shutdown). + # The script itself bounds per-test wall time, so functional regressions + # surface as normal test failures and are not retried indefinitely. + uses: nick-fields/retry@v3 + with: + timeout_minutes: 180 + max_attempts: 2 + command: ./scripts/kani-perf.sh env: RUST_TEST_THREADS: 1 diff --git a/kani-compiler/src/kani_middle/reachability.rs b/kani-compiler/src/kani_middle/reachability.rs index e945c3aaec1..5a7ac9cb44e 100644 --- a/kani-compiler/src/kani_middle/reachability.rs +++ b/kani-compiler/src/kani_middle/reachability.rs @@ -32,13 +32,14 @@ use rustc_public::mir::{ use rustc_public::rustc_internal; use rustc_public::ty::{Allocation, ClosureKind, ConstantKind, RigidTy, Ty, TyKind}; use rustc_public::{CrateDef, ItemKind}; +#[cfg(debug_assertions)] use rustc_session::config::OutputType; +use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; -use std::{ - collections::{HashMap, HashSet}, - fs::File, - io::{BufWriter, Write}, -}; +#[cfg(debug_assertions)] +use std::fs::File; +#[cfg(debug_assertions)] +use std::io::{BufWriter, Write}; use crate::kani_middle::coercion; use crate::kani_middle::coercion::CoercionBase; @@ -580,6 +581,7 @@ impl CallGraph { /// Print the graph in DOT format to a file. /// See for more information. + #[cfg(debug_assertions)] fn dump_dot(&self, tcx: TyCtxt, initial: Option) -> std::io::Result<()> { if let Ok(target) = std::env::var("KANI_REACH_DEBUG") { debug!(?target, "dump_dot"); @@ -608,6 +610,7 @@ impl CallGraph { } /// Write all notes to the given writer. + #[cfg(debug_assertions)] fn dump_all(&self, writer: &mut W) -> std::io::Result<()> { tracing::info!(nodes=?self.nodes.len(), edges=?self.edges.len(), "dump_all"); for node in &self.nodes { @@ -621,6 +624,7 @@ impl CallGraph { } /// Write all notes that may have led to the discovery of the given target. + #[cfg(debug_assertions)] fn dump_reason(&self, writer: &mut W, target: &str) -> std::io::Result<()> { let mut queue: Vec = self.nodes.iter().filter(|item| item.to_string().contains(target)).cloned().collect(); diff --git a/rust-toolchain.toml b/rust-toolchain.toml index ad3c08879ce..ff15e5e6aad 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -2,5 +2,5 @@ # SPDX-License-Identifier: Apache-2.0 OR MIT [toolchain] -channel = "nightly-2025-12-03" +channel = "nightly-2025-12-04" components = ["llvm-tools", "rustc-dev", "rust-src", "rustfmt"] diff --git a/scripts/kani-fmt.sh b/scripts/kani-fmt.sh index d4ddfaf19b5..b15c1009fdc 100755 --- a/scripts/kani-fmt.sh +++ b/scripts/kani-fmt.sh @@ -30,7 +30,13 @@ cargo fmt ${check_flag} || error=1 # Check test source files. TESTS=("tests" "docs/src/tutorial") # Add ignore patterns for code we don't want to format. -IGNORE=("*/perf/s2n-quic/*") +# `*/perf/s2n-quic/*` excludes the upstream submodule. +# `*/perf/overlays/*` excludes the overlay sources (see +# `tests/perf/overlays/README.md`): these are partial copies of submodule +# files staged for `cp -r` by `scripts/kani-perf.sh`, and they can reference +# `mod` declarations whose siblings only exist in the submodule, so rustfmt +# cannot standalone-parse them. +IGNORE=("*/perf/s2n-quic/*" "*/perf/overlays/*") # Arguments for the find command for excluding the IGNORE paths IGNORE_ARGS=() diff --git a/scripts/kani-perf.sh b/scripts/kani-perf.sh index a7e2710773a..d247958c96b 100755 --- a/scripts/kani-perf.sh +++ b/scripts/kani-perf.sh @@ -26,8 +26,15 @@ done suite="perf" mode="cargo-kani-test" -echo "Check compiletest suite=$suite mode=$mode" -cargo run -p compiletest -- --suite $suite --mode $mode --no-fail-fast +# Bound each test's wall time so a runaway case (e.g. an OOM-prone harness) +# fails as a normal test failure with output, instead of triggering an +# unattributable runner OOM-kill / shutdown signal in CI. +# Default 600s (10 min) is well above the typical perf-case runtime (seconds +# to a couple of minutes) and keeps the worst-case suite duration compatible +# with the workflow step timeout. Override via KANI_PERF_TEST_TIMEOUT. +timeout_secs="${KANI_PERF_TEST_TIMEOUT:-600}" +echo "Check compiletest suite=$suite mode=$mode timeout=${timeout_secs}s" +cargo run -p compiletest -- --suite $suite --mode $mode --no-fail-fast --timeout "$timeout_secs" exit_code=$? echo "Cleaning up..." diff --git a/tests/perf/overlays/s2n-quic/quic/s2n-quic-core/src/inet/checksum.rs b/tests/perf/overlays/s2n-quic/quic/s2n-quic-core/src/inet/checksum.rs new file mode 100644 index 00000000000..f6d1f97b219 --- /dev/null +++ b/tests/perf/overlays/s2n-quic/quic/s2n-quic-core/src/inet/checksum.rs @@ -0,0 +1,502 @@ +// Copyright Kani Contributors +// SPDX-License-Identifier: Apache-2.0 OR MIT +// +// Derived from `quic/s2n-quic-core/src/inet/checksum.rs` in +// https://github.com/aws/s2n-quic, originally authored by Amazon.com, Inc. +// and its affiliates under SPDX-License-Identifier: Apache-2.0. +// +// This file is a Kani-only overlay that gets copied over the s2n-quic +// submodule by `scripts/kani-perf.sh` before the perf suite runs (see +// `tests/perf/overlays/README.md`). Compared to upstream, the only +// changes are the verification-time `LEN` and `kani::unwind` constants +// in the `tests` module, to keep peak RSS within the GitHub-hosted +// runner ceiling on `nightly-2025-12-04` (rust-lang/rust#146436). + +use core::{fmt, hash::Hasher, num::Wrapping}; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod x86; + +/// Computes the [IP checksum](https://www.rfc-editor.org/rfc/rfc1071) over the given slice of bytes +#[inline] +pub fn checksum(data: &[u8]) -> u16 { + let mut checksum = Checksum::default(); + checksum.write(data); + checksum.finish() +} + +/// Minimum size for a payload to be considered for platform-specific code +const LARGE_WRITE_LEN: usize = 32; + +type Accumulator = u64; +type State = Wrapping; + +/// Platform-specific function for computing a checksum +type LargeWriteFn = for<'a> unsafe fn(&mut State, bytes: &'a [u8]) -> &'a [u8]; + +#[inline(always)] +fn write_sized_generic<'a, const MAX_LEN: usize, const CHUNK_LEN: usize>( + state: &mut State, + mut bytes: &'a [u8], + on_chunk: impl Fn(&[u8; CHUNK_LEN], &mut Accumulator), +) -> &'a [u8] { + //= https://www.rfc-editor.org/rfc/rfc1071#section-4.1 + //# The following "C" code algorithm computes the checksum with an inner + //# loop that sums 16-bits at a time in a 32-bit accumulator. + //# + //# in 6 + //# { + //# /* Compute Internet Checksum for "count" bytes + //# * beginning at location "addr". + //# */ + //# register long sum = 0; + //# + //# while( count > 1 ) { + //# /* This is the inner loop */ + //# sum += * (unsigned short) addr++; + //# count -= 2; + //# } + //# + //# /* Add left-over byte, if any */ + //# if( count > 0 ) + //# sum += * (unsigned char *) addr; + //# + //# /* Fold 32-bit sum to 16 bits */ + //# while (sum>>16) + //# sum = (sum & 0xffff) + (sum >> 16); + //# + //# checksum = ~sum; + //# } + + while bytes.len() >= MAX_LEN { + // use `get_unchecked` to make it easier for kani to analyze + let chunks = unsafe { bytes.get_unchecked(..MAX_LEN) }; + bytes = unsafe { bytes.get_unchecked(MAX_LEN..) }; + + let mut sum = 0; + // for each pair of bytes, interpret them as integers and sum them up + for chunk in chunks.chunks_exact(CHUNK_LEN) { + let chunk = unsafe { + // SAFETY: chunks_exact always produces a slice of CHUNK_LEN + debug_assert_eq!(chunk.len(), CHUNK_LEN); + &*(chunk.as_ptr() as *const [u8; CHUNK_LEN]) + }; + on_chunk(chunk, &mut sum); + } + *state += sum; + } + + bytes +} + +/// Generic implementation of a function that computes a checksum over the given slice +#[inline(always)] +fn write_sized_generic_u16<'a, const LEN: usize>(state: &mut State, bytes: &'a [u8]) -> &'a [u8] { + write_sized_generic::( + state, + bytes, + #[inline(always)] + |&bytes, acc| { + *acc += u16::from_ne_bytes(bytes) as Accumulator; + }, + ) +} + +#[inline(always)] +fn write_sized_generic_u32<'a, const LEN: usize>(state: &mut State, bytes: &'a [u8]) -> &'a [u8] { + write_sized_generic::( + state, + bytes, + #[inline(always)] + |&bytes, acc| { + *acc += u32::from_ne_bytes(bytes) as Accumulator; + }, + ) +} + +/// Returns the most optimized function implementation for the current platform +#[inline] +#[cfg(all(feature = "once_cell", not(any(kani, miri))))] +fn probe_write_large() -> LargeWriteFn { + static LARGE_WRITE_FN: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if let Some(fun) = x86::probe() { + return fun; + } + } + + write_sized_generic_u32::<16> + }); + + *LARGE_WRITE_FN +} + +#[inline] +#[cfg(not(all(feature = "once_cell", not(any(kani, miri)))))] +fn probe_write_large() -> LargeWriteFn { + write_sized_generic_u32::<16> +} + +/// Computes the [IP checksum](https://www.rfc-editor.org/rfc/rfc1071) over an arbitrary set of inputs +#[derive(Clone, Copy)] +pub struct Checksum { + state: State, + partial_write: bool, + write_large: LargeWriteFn, +} + +impl Default for Checksum { + fn default() -> Self { + Self { + state: Default::default(), + partial_write: false, + write_large: probe_write_large(), + } + } +} + +impl fmt::Debug for Checksum { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut v = *self; + v.carry(); + f.debug_tuple("Checksum").field(&v.finish()).finish() + } +} + +impl Checksum { + /// Creates a checksum instance without enabling the native implementation + #[inline] + pub fn generic() -> Self { + Self { + state: Default::default(), + partial_write: false, + write_large: write_sized_generic_u32::<16>, + } + } + + /// Writes a single byte to the checksum state + #[inline] + fn write_byte(&mut self, byte: u8, shift: bool) { + if shift { + self.state += (byte as Accumulator) << 8; + } else { + self.state += byte as Accumulator; + } + } + + /// Carries all of the bits into a single 16 bit range + #[inline] + fn carry(&mut self) { + #[cfg(kani)] + self.carry_rfc(); + #[cfg(not(kani))] + self.carry_optimized(); + } + + /// Carries all of the bits into a single 16 bit range + /// + /// This implementation is very similar to the way the RFC is written. + #[inline] + #[allow(dead_code)] + fn carry_rfc(&mut self) { + let mut state = self.state.0; + + for _ in 0..core::mem::size_of::() { + state = (state & 0xffff) + (state >> 16); + } + + self.state.0 = state; + } + + /// Carries all of the bits into a single 16 bit range + /// + /// This implementation was written after some optimization on the RFC version. It results in + /// about half the instructions needed as the RFC. + #[inline] + #[allow(dead_code)] + fn carry_optimized(&mut self) { + let values: [u16; core::mem::size_of::() / 2] = unsafe { + // SAFETY: alignment of the State is >= of u16 + debug_assert!(core::mem::align_of::() >= core::mem::align_of::()); + core::mem::transmute(self.state.0) + }; + + let mut sum = 0u16; + + for value in values { + let (res, overflowed) = sum.overflowing_add(value); + sum = res; + if overflowed { + sum += 1; + } + } + + self.state.0 = sum as _; + } + + /// Writes bytes to the checksum and ensures any single byte remainders are padded + #[inline] + pub fn write_padded(&mut self, bytes: &[u8]) { + self.write(bytes); + + // write a null byte if `bytes` wasn't 16-bit aligned + if core::mem::take(&mut self.partial_write) { + self.write_byte(0, cfg!(target_endian = "little")); + } + } + + /// Computes the final checksum + #[inline] + pub fn finish(self) -> u16 { + self.finish_be().to_be() + } + + #[inline] + pub fn finish_be(mut self) -> u16 { + self.carry(); + + let value = self.state.0 as u16; + let value = !value; + + // if value is 0, we need to set it to the max value to indicate the checksum was actually + // computed + if value == 0 { + return 0xffff; + } + + value + } +} + +impl Hasher for Checksum { + #[inline] + fn write(&mut self, mut bytes: &[u8]) { + if bytes.is_empty() { + return; + } + + // Check to see if we have a partial write to flush + if core::mem::take(&mut self.partial_write) { + let (chunk, remaining) = bytes.split_at(1); + bytes = remaining; + + // shift the byte if we're on little endian + self.write_byte(chunk[0], cfg!(target_endian = "little")); + } + + // Only delegate to the optimized platform function if the payload is big enough + if bytes.len() >= LARGE_WRITE_LEN { + bytes = unsafe { (self.write_large)(&mut self.state, bytes) }; + } + + // Fall back on the generic implementation to wrap things up + // + // NOTE: We don't use the u32 version with kani as it causes the verification time to + // increase by quite a bit. We have a separate proof for the functional equivalence of + // these two configurations. + #[cfg(not(kani))] + { + bytes = write_sized_generic_u32::<4>(&mut self.state, bytes); + } + + bytes = write_sized_generic_u16::<2>(&mut self.state, bytes); + + // if we only have a single byte left, write it to the state and mark it as a partial write + if let Some(byte) = bytes.first().copied() { + self.partial_write = true; + self.write_byte(byte, cfg!(target_endian = "big")); + } + } + + #[inline] + fn finish(&self) -> u64 { + Self::finish(*self) as _ + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bolero::check; + + #[test] + fn rfc_example_test() { + //= https://www.rfc-editor.org/rfc/rfc1071#section-3 + //= type=test + //# We now present explicit examples of calculating a simple 1's + //# complement sum on a 2's complement machine. The examples show the + //# same sum calculated byte by bye, by 16-bits words in normal and + //# swapped order, and 32 bits at a time in 3 different orders. All + //# numbers are in hex. + //# + //# Byte-by-byte "Normal" Swapped + //# Order Order + //# + //# Byte 0/1: 00 01 0001 0100 + //# Byte 2/3: f2 03 f203 03f2 + //# Byte 4/5: f4 f5 f4f5 f5f4 + //# Byte 6/7: f6 f7 f6f7 f7f6 + //# --- --- ----- ----- + //# Sum1: 2dc 1f0 2ddf0 1f2dc + //# + //# dc f0 ddf0 f2dc + //# Carrys: 1 2 2 1 + //# -- -- ---- ---- + //# Sum2: dd f2 ddf2 f2dd + //# + //# Final Swap: dd f2 ddf2 ddf2 + let bytes = [0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7]; + + let mut checksum = Checksum::default(); + checksum.write(&bytes); + checksum.carry(); + + assert_eq!((checksum.state.0 as u16).to_le_bytes(), [0xdd, 0xf2]); + assert_eq!((!rfc_c_port(&bytes)).to_be_bytes(), [0xdd, 0xf2]); + } + + fn rfc_c_port(data: &[u8]) -> u16 { + //= https://www.rfc-editor.org/rfc/rfc1071#section-4.1 + //= type=test + //# The following "C" code algorithm computes the checksum with an inner + //# loop that sums 16-bits at a time in a 32-bit accumulator. + //# + //# in 6 + //# { + //# /* Compute Internet Checksum for "count" bytes + //# * beginning at location "addr". + //# */ + //# register long sum = 0; + //# + //# while( count > 1 ) { + //# /* This is the inner loop */ + //# sum += * (unsigned short) addr++; + //# count -= 2; + //# } + //# + //# /* Add left-over byte, if any */ + //# if( count > 0 ) + //# sum += * (unsigned char *) addr; + //# + //# /* Fold 32-bit sum to 16 bits */ + //# while (sum>>16) + //# sum = (sum & 0xffff) + (sum >> 16); + //# + //# checksum = ~sum; + //# } + + let mut addr = data.as_ptr(); + let mut count = data.len(); + + unsafe { + let mut sum = 0u32; + + while count > 1 { + let value = u16::from_be_bytes([*addr, *addr.add(1)]); + sum = sum.wrapping_add(value as u32); + addr = addr.add(2); + count -= 2; + } + + if count > 0 { + let value = u16::from_be_bytes([*addr, 0]); + sum = sum.wrapping_add(value as u32); + } + + while sum >> 16 != 0 { + sum = (sum & 0xffff) + (sum >> 16); + } + + !(sum as u16) + } + } + + // The kani case uses a deliberately smaller LEN than upstream. The + // bolero/Kani state space for `differential` scales with LEN; with LEN=16 + // and the post-2025-12-04 nightly, peak RSS exceeds the 16 GB GH-runner + // ceiling (see `tests/perf/overlays/README.md` for the overlay rationale). + // LEN=8 keeps the same property under verification but with a state space + // that fits in CI. The non-Kani (miri) value is preserved. + #[cfg(any(kani, miri))] + const LEN: usize = if cfg!(kani) { 8 } else { 32 }; + + /// * Compares the implementation to a port of the C code defined in the RFC + /// * Ensures partial writes are correctly handled, even if they're not at a 16 bit boundary + // The unwind value is set to LEN+1 (9) rather than the upstream 17. With LEN=8, + // unwind=9 is sufficient to fully unroll all loops touching the input slice; + // the upstream value of 17 was sized for LEN=16 and is now overkill. + #[test] + #[cfg_attr(kani, kani::proof, kani::unwind(9), kani::solver(cadical))] + fn differential() { + #[cfg(any(kani, miri))] + type Bytes = crate::testing::InlineVec; + #[cfg(not(any(kani, miri)))] + type Bytes = Vec; + + check!() + .with_type::<(usize, Bytes)>() + .for_each(|(index, bytes)| { + let index = if bytes.is_empty() { + 0 + } else { + *index % bytes.len() + }; + let (a, b) = bytes.split_at(index); + let mut cs = Checksum::default(); + cs.write(a); + cs.write(b); + + let mut rfc_value = rfc_c_port(bytes); + if rfc_value == 0 { + rfc_value = 0xffff; + } + + assert_eq!(rfc_value.to_be_bytes(), cs.finish().to_be_bytes()); + }); + } + + /// Shows that using the u32+u16 methods is the same as only using u16 + #[test] + #[cfg_attr(kani, kani::proof, kani::unwind(9), kani::solver(kissat))] + fn u32_u16_differential() { + #[cfg(any(kani, miri))] + type Bytes = crate::testing::InlineVec; + #[cfg(not(any(kani, miri)))] + type Bytes = Vec; + + check!().with_type::().for_each(|bytes| { + let a = { + let mut cs = Checksum::generic(); + let bytes = write_sized_generic_u32::<4>(&mut cs.state, bytes); + write_sized_generic_u16::<2>(&mut cs.state, bytes); + cs.finish() + }; + + let b = { + let mut cs = Checksum::generic(); + write_sized_generic_u16::<2>(&mut cs.state, bytes); + cs.finish() + }; + + assert_eq!(a, b); + }); + } + + /// Shows that RFC carry implementation is the same as the optimized version + #[test] + #[cfg_attr(kani, kani::proof, kani::unwind(9), kani::solver(kissat))] + fn carry_differential() { + check!().with_type::().cloned().for_each(|state| { + let mut opt = Checksum::generic(); + opt.state.0 = state; + opt.carry_optimized(); + + let mut rfc = Checksum::generic(); + rfc.state.0 = state; + rfc.carry_rfc(); + + assert_eq!(opt.state.0, rfc.state.0); + }); + } +}