Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ contract:
| `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
| `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |

The agent child process does not retain these supervisor privileges.
Expand Down
33 changes: 33 additions & 0 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ const SUPERVISOR_PATH: &str = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin
const HOST_OPENSHELL_INTERNAL: &str = "host.openshell.internal";
const HOST_DOCKER_INTERNAL: &str = "host.docker.internal";
const DOCKER_NETWORK_DRIVER: &str = "bridge";
const DEFAULT_SANDBOX_PIDS_LIMIT: i64 = 2048;

/// Default image holding the Linux `openshell-sandbox` binary. The gateway
/// pulls this image and extracts the binary to a host-side cache when no
Expand Down Expand Up @@ -163,6 +164,11 @@ pub struct DockerComputeConfig {

/// Unix socket path the in-container supervisor bridges relay traffic to.
pub ssh_socket_path: String,

/// Container cgroup PID limit for Docker-managed sandboxes.
///
/// Set to `0` to leave Docker's runtime/default PID limit unchanged.
pub sandbox_pids_limit: i64,
}

impl Default for DockerComputeConfig {
Expand All @@ -180,6 +186,7 @@ impl Default for DockerComputeConfig {
network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(),
host_gateway_ip: String::new(),
ssh_socket_path: "/run/openshell/ssh.sock".to_string(),
sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT,
}
}
}
Expand All @@ -206,6 +213,7 @@ struct DockerDriverRuntimeConfig {
guest_tls: Option<DockerGuestTlsPaths>,
daemon_version: String,
supports_gpu: bool,
sandbox_pids_limit: i64,
}

#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -252,6 +260,7 @@ impl DockerComputeDriver {
.cdi_spec_dirs
.as_ref()
.is_some_and(|dirs| !dirs.is_empty());
validate_sandbox_pids_limit(docker_config.sandbox_pids_limit)?;
let gateway_port = config.bind_address.port();
if gateway_port == 0 {
return Err(Error::config(
Expand Down Expand Up @@ -298,6 +307,7 @@ impl DockerComputeDriver {
guest_tls,
daemon_version: version.version.unwrap_or_else(|| "unknown".to_string()),
supports_gpu,
sandbox_pids_limit: docker_config.sandbox_pids_limit,
},
events: broadcast::channel(WATCH_BUFFER).0,
supervisor_readiness,
Expand Down Expand Up @@ -1163,6 +1173,7 @@ fn build_container_create_body(
host_config: Some(HostConfig {
nano_cpus: resource_limits.nano_cpus,
memory: resource_limits.memory_bytes,
pids_limit: docker_pids_limit(config.sandbox_pids_limit)?,
device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
binds: Some(build_binds(sandbox, config)?),
restart_policy: Some(RestartPolicy {
Expand Down Expand Up @@ -1454,6 +1465,28 @@ fn docker_resource_limits(
})
}

fn validate_sandbox_pids_limit(value: i64) -> CoreResult<()> {
if value < 0 {
return Err(Error::config(
"docker sandbox_pids_limit must be zero or greater",
));
}
Ok(())
}

fn docker_pids_limit(value: i64) -> Result<Option<i64>, Status> {
if value < 0 {
return Err(Status::failed_precondition(
"docker sandbox_pids_limit must be zero or greater",
));
}
if value == 0 {
Ok(None)
} else {
Ok(Some(value))
}
}

#[allow(clippy::cast_possible_truncation)]
fn parse_cpu_limit(value: &str) -> Result<Option<i64>, Status> {
let value = value.trim();
Expand Down
18 changes: 18 additions & 0 deletions crates/openshell-driver-docker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ fn runtime_config() -> DockerDriverRuntimeConfig {
}),
daemon_version: "28.0.0".to_string(),
supports_gpu: false,
sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT,
}
}

Expand Down Expand Up @@ -411,6 +412,23 @@ fn docker_resource_limits_applies_cpu_and_memory_limits() {
assert_eq!(limits.memory_bytes, Some(2_147_483_648));
}

#[test]
fn docker_pids_limit_uses_driver_default_and_allows_runtime_inherit() {
assert_eq!(
docker_pids_limit(DEFAULT_SANDBOX_PIDS_LIMIT).unwrap(),
Some(DEFAULT_SANDBOX_PIDS_LIMIT)
);
assert_eq!(docker_pids_limit(0).unwrap(), None);
assert!(docker_pids_limit(-1).is_err());
}

#[test]
fn container_create_body_sets_driver_owned_pids_limit() {
let body = build_container_create_body(&test_sandbox(), &runtime_config()).unwrap();
let host_config = body.host_config.expect("host config");
assert_eq!(host_config.pids_limit, Some(DEFAULT_SANDBOX_PIDS_LIMIT));
}

#[test]
fn build_environment_sets_docker_tls_paths() {
let env = build_environment(&test_sandbox(), &runtime_config());
Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-podman/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ Podman resources after out-of-band container removal or label drift.
| `OPENSHELL_NETWORK_NAME` | `--network-name` | `openshell` | Podman bridge network name. |
| `OPENSHELL_SANDBOX_SSH_SOCKET_PATH` | `--sandbox-ssh-socket-path` | `/run/openshell/ssh.sock` | Supervisor Unix socket path in `PodmanComputeConfig`. |
| `OPENSHELL_STOP_TIMEOUT` | `--stop-timeout` | `10` | Container stop timeout in seconds. |
| `OPENSHELL_SANDBOX_PIDS_LIMIT` | `--sandbox-pids-limit` | `2048` | Podman cgroup PID limit for sandbox containers. Set `0` to inherit Podman's runtime/default PID limit. |
| `OPENSHELL_SUPERVISOR_IMAGE` | `--supervisor-image` | `ghcr.io/nvidia/openshell/supervisor:latest` through the gateway, required standalone | OCI image containing the supervisor binary. |
| `OPENSHELL_PODMAN_TLS_CA` | `--podman-tls-ca` | unset | Host path to the CA certificate mounted for sandbox mTLS. |
| `OPENSHELL_PODMAN_TLS_CERT` | `--podman-tls-cert` | unset | Host path to the client certificate mounted for sandbox mTLS. |
Expand Down
34 changes: 34 additions & 0 deletions crates/openshell-driver-podman/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::str::FromStr;

/// Default Podman bridge network name.
pub const DEFAULT_NETWORK_NAME: &str = "openshell";
pub const DEFAULT_SANDBOX_PIDS_LIMIT: i64 = 2048;

/// Image pull policy for sandbox and supervisor images.
///
Expand Down Expand Up @@ -106,6 +107,10 @@ pub struct PodmanComputeConfig {
pub guest_tls_cert: Option<PathBuf>,
/// Host path to the client private key for sandbox mTLS.
pub guest_tls_key: Option<PathBuf>,
/// Container cgroup PID limit for Podman-managed sandboxes.
///
/// Set to `0` to leave Podman's runtime/default PID limit unchanged.
pub sandbox_pids_limit: i64,
}

impl PodmanComputeConfig {
Expand Down Expand Up @@ -149,6 +154,16 @@ impl PodmanComputeConfig {
)))
}

/// Validate runtime resource-limit configuration.
pub fn validate_runtime_limits(&self) -> Result<(), crate::client::PodmanApiError> {
if self.sandbox_pids_limit < 0 {
return Err(crate::client::PodmanApiError::InvalidInput(
"sandbox_pids_limit must be zero or greater".to_string(),
));
}
Ok(())
}

/// Resolve the default socket path from the environment.
///
/// - **macOS**: `$HOME/.local/share/containers/podman/machine/podman.sock`
Expand Down Expand Up @@ -191,6 +206,7 @@ impl Default for PodmanComputeConfig {
guest_tls_ca: None,
guest_tls_cert: None,
guest_tls_key: None,
sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT,
}
}
}
Expand All @@ -210,6 +226,7 @@ impl std::fmt::Debug for PodmanComputeConfig {
.field("guest_tls_ca", &self.guest_tls_ca)
.field("guest_tls_cert", &self.guest_tls_cert)
.field("guest_tls_key", &self.guest_tls_key)
.field("sandbox_pids_limit", &self.sandbox_pids_limit)
.finish()
}
}
Expand Down Expand Up @@ -251,6 +268,23 @@ mod tests {
});
}

#[test]
fn default_config_sets_driver_owned_pids_limit() {
let cfg = PodmanComputeConfig::default();
assert_eq!(cfg.sandbox_pids_limit, DEFAULT_SANDBOX_PIDS_LIMIT);
assert!(cfg.validate_runtime_limits().is_ok());
}

#[test]
fn runtime_limit_validation_rejects_negative_pids_limit() {
let cfg = PodmanComputeConfig {
sandbox_pids_limit: -1,
..PodmanComputeConfig::default()
};
let err = cfg.validate_runtime_limits().unwrap_err();
assert!(err.to_string().contains("sandbox_pids_limit"));
}

#[test]
#[cfg(target_os = "macos")]
fn default_socket_path_uses_podman_machine_on_macos() {
Expand Down
25 changes: 23 additions & 2 deletions crates/openshell-driver-podman/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ struct HealthConfig {
struct ResourceLimits {
cpu: CpuLimits,
memory: MemoryLimits,
#[serde(rename = "PidsLimit", skip_serializing_if = "Option::is_none")]
pids_limit: Option<i64>,
}

#[derive(Serialize)]
Expand Down Expand Up @@ -340,7 +342,7 @@ fn build_labels(sandbox: &DriverSandbox) -> BTreeMap<String, String> {
}

/// Parse resource limits from the sandbox template, falling back to defaults.
fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {
fn build_resource_limits(sandbox: &DriverSandbox, config: &PodmanComputeConfig) -> ResourceLimits {
let resources = sandbox
.spec
.as_ref()
Expand All @@ -363,9 +365,14 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {
period: DEFAULT_CPU_PERIOD,
},
memory: MemoryLimits { limit: mem_bytes },
pids_limit: podman_pids_limit(config.sandbox_pids_limit),
}
}

fn podman_pids_limit(value: i64) -> Option<i64> {
if value > 0 { Some(value) } else { None }
}

/// Build CDI GPU device list if GPU is requested.
fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
let spec = sandbox.spec.as_ref()?;
Expand Down Expand Up @@ -396,7 +403,7 @@ pub fn build_container_spec_with_token(

let env = build_env(sandbox, config, image);
let labels = build_labels(sandbox);
let resource_limits = build_resource_limits(sandbox);
let resource_limits = build_resource_limits(sandbox, config);
let devices = build_devices(sandbox);

// Network configuration -- always bridge mode.
Expand Down Expand Up @@ -735,6 +742,20 @@ mod tests {
spec["resource_limits"]["memory"]["limit"].as_u64(),
Some(2 * 1024 * 1024 * 1024)
);
assert_eq!(
spec["resource_limits"]["PidsLimit"].as_i64(),
Some(crate::config::DEFAULT_SANDBOX_PIDS_LIMIT)
);
}

#[test]
fn container_spec_can_inherit_runtime_pids_limit() {
let sandbox = test_sandbox("test-id", "test-name");
let mut config = test_config();
config.sandbox_pids_limit = 0;
let spec = build_container_spec(&sandbox, &config);

assert!(spec["resource_limits"].get("PidsLimit").is_none());
}

#[test]
Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-podman/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ impl PodmanComputeDriver {
// (e.g. CA set but cert/key missing) are rejected early so operators
// get a clear error instead of a silent fallback to plaintext HTTP.
config.validate_tls_config()?;
config.validate_runtime_limits()?;

let client = PodmanClient::new(config.socket_path.clone());

Expand Down
14 changes: 13 additions & 1 deletion crates/openshell-driver-podman/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ use tracing_subscriber::EnvFilter;
use openshell_core::VERSION;
use openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS;
use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer;
use openshell_driver_podman::config::{DEFAULT_NETWORK_NAME, ImagePullPolicy};
use openshell_driver_podman::config::{
DEFAULT_NETWORK_NAME, DEFAULT_SANDBOX_PIDS_LIMIT, ImagePullPolicy,
};
use openshell_driver_podman::{ComputeDriverService, PodmanComputeConfig, PodmanComputeDriver};

#[derive(Parser)]
Expand Down Expand Up @@ -71,6 +73,15 @@ struct Args {
#[arg(long, env = "OPENSHELL_STOP_TIMEOUT", default_value_t = DEFAULT_STOP_TIMEOUT_SECS)]
stop_timeout: u32,

/// Container cgroup PID limit for sandbox containers. Set 0 to inherit
/// Podman's runtime/default PID limit.
#[arg(
long,
env = "OPENSHELL_SANDBOX_PIDS_LIMIT",
default_value_t = DEFAULT_SANDBOX_PIDS_LIMIT
)]
sandbox_pids_limit: i64,

/// OCI image containing the openshell-sandbox supervisor binary.
#[arg(long, env = "OPENSHELL_SUPERVISOR_IMAGE")]
supervisor_image: String,
Expand Down Expand Up @@ -114,6 +125,7 @@ async fn main() -> Result<()> {
guest_tls_ca: args.podman_tls_ca,
guest_tls_cert: args.podman_tls_cert,
guest_tls_key: args.podman_tls_key,
sandbox_pids_limit: args.sandbox_pids_limit,
})
.await
.into_diagnostic()?;
Expand Down
10 changes: 10 additions & 0 deletions crates/openshell-sandbox/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,16 @@ pub async fn run_sandbox(
// Prepare filesystem: create and chown read_write directories
prepare_filesystem(&policy)?;

#[cfg(target_os = "linux")]
{
let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() {
process::RuntimePidLimitMode::Require
} else {
process::RuntimePidLimitMode::Warn
};
process::check_runtime_pid_limit(pid_limit_mode)?;
}

// Initialize the agent-proposals feature flag. Default false until the
// initial settings fetch (or the poll loop) tells us otherwise. The flag
// gates the skill install, the policy.local route handler, and the L7
Expand Down
Loading
Loading