From e655f59b43bcbfa03663b1aaf40207acfb05d97b Mon Sep 17 00:00:00 2001 From: joeyczheng Date: Tue, 2 Jun 2026 15:02:22 +0800 Subject: [PATCH 1/2] feat(CubeAPI): support e2b V3 template-build protocol end-to-end Add V3 routes: POST /v3/templates, GET .../files/{hash}, POST /v2/.../builds/{bid}, GET .../status (handlers/templates_v3.rs) Signed-off-by: joeyczheng --- CubeAPI/Cargo.lock | 7 + CubeAPI/Cargo.toml | 2 +- CubeAPI/src/config/mod.rs | 59 + CubeAPI/src/constants.rs | 4 + CubeAPI/src/handlers/mod.rs | 2 + CubeAPI/src/handlers/registry.rs | 240 +++ CubeAPI/src/handlers/templates.rs | 26 +- CubeAPI/src/handlers/templates_v3.rs | 70 + CubeAPI/src/main.rs | 51 + CubeAPI/src/models/mod.rs | 273 +++- CubeAPI/src/routes.rs | 74 +- CubeAPI/src/services/builds.rs | 189 +++ CubeAPI/src/services/mod.rs | 12 +- CubeAPI/src/services/sandboxes.rs | 136 +- CubeAPI/src/services/templates.rs | 1447 +++++++++++++++-- deploy/one-click/scripts/one-click/up.sh | 2 +- docs/.vitepress/config.mjs | 2 + docs/guide/tutorials/template-from-e2b-sdk.md | 395 +++++ docs/guide/tutorials/template-from-image.md | 6 + .../guide/tutorials/template-from-e2b-sdk.md | 396 +++++ .../zh/guide/tutorials/template-from-image.md | 6 + 21 files changed, 3195 insertions(+), 204 deletions(-) create mode 100644 CubeAPI/src/handlers/registry.rs create mode 100644 CubeAPI/src/handlers/templates_v3.rs create mode 100644 CubeAPI/src/services/builds.rs create mode 100644 docs/guide/tutorials/template-from-e2b-sdk.md create mode 100644 docs/zh/guide/tutorials/template-from-e2b-sdk.md diff --git a/CubeAPI/Cargo.lock b/CubeAPI/Cargo.lock index f547088be..4178201e6 100644 --- a/CubeAPI/Cargo.lock +++ b/CubeAPI/Cargo.lock @@ -2425,6 +2425,12 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + [[package]] name = "sha2" version = "0.10.9" @@ 
-3298,6 +3304,7 @@ dependencies = [ "getrandom 0.4.2", "js-sys", "serde_core", + "sha1_smol", "wasm-bindgen", ] diff --git a/CubeAPI/Cargo.toml b/CubeAPI/Cargo.toml index aaa9c4d47..d5e52f563 100644 --- a/CubeAPI/Cargo.toml +++ b/CubeAPI/Cargo.toml @@ -54,7 +54,7 @@ config = "0.13" dotenvy = "0.15" # ── UUID ────────────────────────────────────────────────────────────────── -uuid = { version = "1", features = ["v4", "serde"] } +uuid = { version = "1", features = ["v4", "v5", "serde"] } # ── High-concurrency in-memory state ────────────────────────────────────── # Lock-free concurrent HashMap: O(1) reads without global lock diff --git a/CubeAPI/src/config/mod.rs b/CubeAPI/src/config/mod.rs index 07036b95e..417573283 100644 --- a/CubeAPI/src/config/mod.rs +++ b/CubeAPI/src/config/mod.rs @@ -64,6 +64,59 @@ pub struct ServerConfig { /// Example: mysql://cube:cube_pass@127.0.0.1:3306/cube_mvp #[serde(default = "default_database_url")] pub database_url: Option, + + /// E2B-compatible OCI registry upstream URL. When set, /v2/* requests are + /// reverse-proxied to this address so that `e2b template build` (which uses + /// `docker push`) can upload images that CubeMaster will later consume. + /// + /// Recommended deployment: run `distribution/distribution` (CNCF Registry) + /// as a sidecar listening on 127.0.0.1:5000 and set + /// CUBE_API_REGISTRY_UPSTREAM=http://127.0.0.1:5000. + /// + /// When unset, /v2/* returns 503 and `dockerfile`-based template requests + /// are rejected with 501. + #[serde(default)] + pub registry_upstream: Option, + + /// Public host (no scheme) advertised to E2B clients as the docker-push + /// target, e.g. "cube.example.com". Defaults to the Host header of the + /// originating /templates request when unset. + #[serde(default)] + pub registry_public_host: Option, + + /// Repository namespace prefix for uploaded build images. The full image + /// reference returned to CubeMaster will be: + /// <pull_host>/<repo_prefix>/<templateID>:<buildID> + /// Default: "e2b". 
+ #[serde(default = "default_registry_repo_prefix")] + pub registry_repo_prefix: String, + + /// Internal registry host CubeMaster nodes should pull from (e.g. + /// "10.0.0.1:5000"). Defaults to `registry_upstream` host:port when unset. + #[serde(default)] + pub registry_pull_host: Option, + + /// Optional shared secret printed back as `registry.password` in + /// POST /templates responses. Empty → "_anon". + #[serde(default)] + pub registry_token: Option, + + /// Default `writable_layer_size` to send to CubeMaster when the client + /// (e.g. the E2B Python SDK) does not specify one. CubeMaster validates + /// this field as required, so a non-empty default is needed for the V3 + /// flow to work out of the box. + /// + /// Env var: CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE | Default: "1G". + #[serde(default = "default_writable_layer_size")] + pub default_writable_layer_size: String, +} + +fn default_registry_repo_prefix() -> String { + "e2b".to_string() +} + +fn default_writable_layer_size() -> String { + std::env::var("CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE").unwrap_or_else(|_| "1G".to_string()) } fn default_bind() -> String { @@ -142,6 +195,12 @@ impl Default for ServerConfig { log_prefix: default_log_prefix(), auth_callback_url: None, database_url: default_database_url(), + registry_upstream: None, + registry_public_host: None, + registry_repo_prefix: default_registry_repo_prefix(), + registry_pull_host: None, + registry_token: None, + default_writable_layer_size: default_writable_layer_size(), } } } diff --git a/CubeAPI/src/constants.rs b/CubeAPI/src/constants.rs index 2667875d7..199bc13b9 100644 --- a/CubeAPI/src/constants.rs +++ b/CubeAPI/src/constants.rs @@ -6,3 +6,7 @@ /// Reported `envdVersion` for sandbox APIs (create, connect, list, get, resume, etc.). pub const ENVD_VERSION: &str = "0.2.0"; + +/// E2B `envd` listens on this port inside every sandbox. 
+pub const ENVD_PORT: u32 = 49983; +pub const ENVD_PORT_STR: &str = "49983"; diff --git a/CubeAPI/src/handlers/mod.rs b/CubeAPI/src/handlers/mod.rs index 1c92c91db..d5281077e 100644 --- a/CubeAPI/src/handlers/mod.rs +++ b/CubeAPI/src/handlers/mod.rs @@ -6,7 +6,9 @@ pub mod agenthub; pub mod cluster; pub mod config; pub mod health; +pub mod registry; pub mod sandboxes; pub mod snapshots; pub mod store; pub mod templates; +pub mod templates_v3; diff --git a/CubeAPI/src/handlers/registry.rs b/CubeAPI/src/handlers/registry.rs new file mode 100644 index 000000000..2d751b795 --- /dev/null +++ b/CubeAPI/src/handlers/registry.rs @@ -0,0 +1,240 @@ +// Copyright (c) 2024 Tencent Inc. +// SPDX-License-Identifier: Apache-2.0 +// + + +use axum::{ + body::{Body, Bytes}, + extract::{Path, Request, State}, + http::{header, HeaderMap, HeaderName, HeaderValue, Method, StatusCode}, + response::Response, +}; +use std::str::FromStr; + +use crate::{ + error::{AppError, AppResult}, + state::AppState, +}; + +/// Headers that must NOT be propagated end-to-end. +const HOP_BY_HOP: &[&str] = &[ + "connection", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailer", + "transfer-encoding", + "upgrade", + "host", +]; + +/// `GET /v2/` — registry ping. Forwards the ping to the configured upstream and +/// relays its response, adding the registry version header when it is absent. +pub async fn ping(State(state): State) -> AppResult { + let upstream = state + .config + .registry_upstream + .as_deref() + .filter(|s| !s.is_empty()) + .ok_or_else(registry_disabled)?; + + forward(&state, Method::GET, upstream, "/v2/", "", &HeaderMap::new(), Bytes::new()).await +} + +/// `ANY /v2/*path` — generic reverse-proxy. +pub async fn proxy( + State(state): State, + Path(path): Path, + request: Request, +) -> AppResult { + let upstream = state + .config + .registry_upstream + .as_deref() + .filter(|s| !s.is_empty()) + .ok_or_else(registry_disabled)? 
+ .to_string(); + + let method = request.method().clone(); + let query = request.uri().query().unwrap_or("").to_string(); + let headers = request.headers().clone(); + let body = match axum::body::to_bytes(request.into_body(), 512 * 1024 * 1024).await { + Ok(b) => b, + Err(e) => { + return Err(AppError::BadRequest(format!( + "failed to read /v2/* request body: {}", + e + ))) + } + }; + + let normalized = normalize_subpath(&path); + let response = forward(&state, method.clone(), &upstream, &normalized, &query, &headers, body) + .await?; + + // After a successful manifest PUT we mark the build as image-pushed so + // that the orchestrator stage proceeds. + if method == Method::PUT && response.status().is_success() { + if let Some(parsed) = parse_manifest_path(&normalized) { + // tag carries either the buildID (preferred) or a digest. Pull the + // build context by tag first, then fall back to no-op. + if !parsed.tag.starts_with("sha256:") { + tracing::info!( + repo = %parsed.repo, + tag = %parsed.tag, + "manifest pushed; marking build as image-pushed" + ); + state.services.templates.mark_image_pushed(&parsed.tag); + } + } + } + + Ok(response) +} + +async fn forward( + state: &AppState, + method: Method, + upstream: &str, + path: &str, + query: &str, + in_headers: &HeaderMap, + body: Bytes, +) -> AppResult { + let upstream = upstream.trim_end_matches('/'); + let path = if path.starts_with('/') { + path.to_string() + } else { + format!("/{}", path) + }; + let url = if query.is_empty() { + format!("{}{}", upstream, path) + } else { + format!("{}{}?{}", upstream, path, query) + }; + + let mut req = state.http_client.request(method, &url); + + for (name, value) in in_headers { + let key = name.as_str().to_ascii_lowercase(); + if HOP_BY_HOP.contains(&key.as_str()) { + continue; + } + req = req.header(name.clone(), value.clone()); + } + + if !body.is_empty() { + req = req.body(body.to_vec()); + } + + let upstream_resp = req.send().await.map_err(|e| { + tracing::error!(error 
+ = %e, url = %url, "registry upstream request failed"); + AppError::Internal(anyhow::anyhow!("registry upstream unreachable: {}", e)) + })?; + + let status = upstream_resp.status(); + let mut headers = HeaderMap::new(); + for (name, value) in upstream_resp.headers() { + let key = name.as_str().to_ascii_lowercase(); + if HOP_BY_HOP.contains(&key.as_str()) || key == "content-length" { + continue; + } + if let (Ok(name), Ok(value)) = ( + HeaderName::from_str(name.as_str()), + HeaderValue::from_bytes(value.as_bytes()), + ) { + headers.insert(name, value); + } + } + + let body_bytes = upstream_resp + .bytes() + .await + .map_err(|e| AppError::Internal(anyhow::anyhow!("registry response read failed: {}", e)))?; + + let mut response = Response::builder() + .status(StatusCode::from_u16(status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY)) + .body(Body::from(body_bytes)) + .map_err(|e| AppError::Internal(anyhow::anyhow!("response build failed: {}", e)))?; + + *response.headers_mut() = headers; + response + .headers_mut() + .entry(header::HeaderName::from_static("docker-distribution-api-version")) + .or_insert(HeaderValue::from_static("registry/2.0")); + + Ok(response) +} + +fn registry_disabled() -> AppError { + AppError::NotImplemented( + "registry upstream is not configured: set CUBE_API_REGISTRY_UPSTREAM \ + to enable the bundled OCI registry" + .to_string(), + ) +} + +fn normalize_subpath(path: &str) -> String { + if path.starts_with("/v2") { + path.to_string() + } else if path.starts_with("v2/") { + format!("/{}", path) + } else { + format!("/v2/{}", path.trim_start_matches('/')) + } +} + +#[derive(Debug)] +struct ManifestPath { + repo: String, + tag: String, +} + +/// Parse `/v2/<repo>/manifests/<tag>` (where `<repo>` may itself contain +/// slashes). Returns `None` for blob / upload / catalog endpoints. 
+fn parse_manifest_path(path: &str) -> Option { + let stripped = path.strip_prefix("/v2/")?; + let idx = stripped.rfind("/manifests/")?; + let repo = &stripped[..idx]; + let tag = &stripped[idx + "/manifests/".len()..]; + if repo.is_empty() || tag.is_empty() { + return None; + } + Some(ManifestPath { + repo: repo.to_string(), + tag: tag.to_string(), + }) +} + +impl ManifestPath { + #[allow(dead_code)] + fn rebuild(&self) -> String { + format!("/v2/{}/manifests/{}", self.repo, self.tag) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_manifest_path_accepts_namespaced_repo() { + let p = parse_manifest_path("/v2/e2b/tpl-abc/manifests/bld-001").unwrap(); + assert_eq!(p.repo, "e2b/tpl-abc"); + assert_eq!(p.tag, "bld-001"); + } + + #[test] + fn parse_manifest_path_rejects_blob_paths() { + assert!(parse_manifest_path("/v2/e2b/tpl-abc/blobs/sha256:abc").is_none()); + assert!(parse_manifest_path("/v2/").is_none()); + } + + #[test] + fn normalize_subpath_handles_axum_capture_variants() { + assert_eq!(normalize_subpath("v2/foo/bar"), "/v2/foo/bar"); + assert_eq!(normalize_subpath("/foo/bar"), "/v2/foo/bar"); + assert_eq!(normalize_subpath("/v2/foo/bar"), "/v2/foo/bar"); + } +} diff --git a/CubeAPI/src/handlers/templates.rs b/CubeAPI/src/handlers/templates.rs index 861510b47..a5dbf72c3 100644 --- a/CubeAPI/src/handlers/templates.rs +++ b/CubeAPI/src/handlers/templates.rs @@ -146,44 +146,44 @@ pub async fn delete_template( pub async fn start_template_build( State(state): State, - Path((template_id, _build_id)): Path<(String, String)>, + Path((template_id, build_id)): Path<(String, String)>, ) -> AppResult { let job = state .services .templates - .start_template_build(template_id) + .start_template_build(template_id, Some(build_id)) .await?; Ok((StatusCode::ACCEPTED, Json(job))) } // ─── GET /templates/:templateID/builds/:buildID/status ──────────────────────── -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Default)] pub struct 
BuildStatusQuery { - #[serde(default)] - #[allow(dead_code)] + /// E2B SDK polls with `?logsOffset=N` to receive only the new lines + /// added since the last response. Snake-case alias is accepted too. + #[serde(rename = "logsOffset", alias = "logs_offset", default)] pub logs_offset: i32, } pub async fn get_template_build_status( State(state): State, Path((template_id, build_id)): Path<(String, String)>, - Query(_params): Query, + Query(params): Query, ) -> AppResult { let out = state .services .templates - .get_template_build_status(&template_id, &build_id) + .get_template_build_status(&template_id, &build_id, params.logs_offset) .await?; Ok((StatusCode::OK, Json(out))) } // ─── GET /templates/:templateID/builds/:buildID/logs ───────────────────────── -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Default)] pub struct BuildLogsQuery { - #[serde(default)] - #[allow(dead_code)] + #[serde(rename = "logsOffset", alias = "offset", alias = "logs_offset", default)] pub offset: i32, #[serde(default = "default_log_limit")] #[allow(dead_code)] @@ -195,13 +195,13 @@ fn default_log_limit() -> i32 { pub async fn get_template_build_logs( State(state): State, - Path((_template_id, build_id)): Path<(String, String)>, - Query(_params): Query, + Path((template_id, build_id)): Path<(String, String)>, + Query(params): Query, ) -> AppResult { let logs = state .services .templates - .get_template_build_logs(&build_id) + .get_template_build_logs(&template_id, &build_id, params.offset) .await?; Ok((StatusCode::OK, Json(logs))) } diff --git a/CubeAPI/src/handlers/templates_v3.rs b/CubeAPI/src/handlers/templates_v3.rs new file mode 100644 index 000000000..4319765b8 --- /dev/null +++ b/CubeAPI/src/handlers/templates_v3.rs @@ -0,0 +1,70 @@ +// Copyright (c) 2026 Tencent Inc. 
+// SPDX-License-Identifier: Apache-2.0 +// + +use axum::{ + extract::{Path, Query, State}, + http::StatusCode, + response::IntoResponse, + Json, +}; + +use crate::{ + error::AppResult, + models::{ + V2TemplateBuildStart, V3BuildStatusQuery, V3TemplateBuildRequest, + }, + state::AppState, +}; + +/// `POST /v3/templates` — register template + first build attempt. +pub async fn v3_create_template( + State(state): State, + Json(body): Json, +) -> AppResult { + let resp = state.services.templates.v3_create_template(body)?; + Ok((StatusCode::ACCEPTED, Json(resp))) +} + +/// `GET /templates/{templateID}/files/{hash}` — file-cache probe used by the +/// SDK before uploading build context tarballs. We always answer +/// `present=true` because the current CubeMaster pipeline only consumes +/// `from_image` references (no Dockerfile-from-context build yet). +pub async fn v3_get_files_hash( + State(state): State, + Path((template_id, hash)): Path<(String, String)>, +) -> AppResult { + let resp = state + .services + .templates + .v3_get_file_upload(&template_id, &hash)?; + Ok((StatusCode::CREATED, Json(resp))) +} + +/// `POST /v2/templates/{templateID}/builds/{buildID}` — kick off the build. 
+pub async fn v2_trigger_build( + State(state): State, + Path((template_id, build_id)): Path<(String, String)>, + Json(body): Json, +) -> AppResult { + state + .services + .templates + .v3_trigger_build(template_id, build_id, body) + .await?; + Ok(StatusCode::ACCEPTED) +} + +/// `GET /templates/{templateID}/builds/{buildID}/status?logsOffset=N&limit=M` +pub async fn v3_get_build_status( + State(state): State, + Path((template_id, build_id)): Path<(String, String)>, + Query(params): Query, +) -> AppResult { + let info = state + .services + .templates + .v3_get_build_status(&template_id, &build_id, params.logs_offset, params.limit) + .await?; + Ok((StatusCode::OK, Json(info))) +} diff --git a/CubeAPI/src/main.rs b/CubeAPI/src/main.rs index a2f6fca9b..0392eba69 100644 --- a/CubeAPI/src/main.rs +++ b/CubeAPI/src/main.rs @@ -116,6 +116,39 @@ struct Cli { #[arg(long, value_name = "DOMAIN")] sandbox_domain: Option, + /// Upstream OCI registry URL used for `e2b template build` image push + /// (default: unset). When unset, /v2/* returns 503 and dockerfile-based + /// requests fail with 501. + /// + /// Overrides the CUBE_API_REGISTRY_UPSTREAM environment variable. + #[arg(long, value_name = "URL")] + registry_upstream: Option, + + /// Public host advertised to E2B clients for docker push (no scheme). + /// Default: the request Host header at template-create time. + #[arg(long, value_name = "HOST")] + registry_public_host: Option, + + /// Repo prefix applied to pushed build images (default: "e2b"). + #[arg(long, value_name = "PREFIX")] + registry_repo_prefix: Option, + + /// Internal registry host CubeMaster nodes pull from (e.g. + /// "10.0.0.1:5000"). Defaults to upstream registry host:port. + #[arg(long, value_name = "HOST")] + registry_pull_host: Option, + + /// Shared password returned to E2B clients as registry.password. 
+ #[arg(long, value_name = "TOKEN")] + registry_token: Option, + + /// Default `writable_layer_size` to send to CubeMaster when the client + /// (e.g. E2B V3 SDK) does not provide one. Default: "1G". + /// + /// Overrides CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE. + #[arg(long, value_name = "SIZE")] + default_writable_layer_size: Option, + /// Export the current OpenAPI spec to a YAML file and exit. #[arg(long, value_name = "PATH")] export_openapi: Option, @@ -168,6 +201,24 @@ fn main() -> anyhow::Result<()> { if let Some(v) = cli.sandbox_domain { cfg.sandbox_domain = v; } + if let Some(v) = cli.registry_upstream { + cfg.registry_upstream = Some(v); + } + if let Some(v) = cli.registry_public_host { + cfg.registry_public_host = Some(v); + } + if let Some(v) = cli.registry_repo_prefix { + cfg.registry_repo_prefix = v; + } + if let Some(v) = cli.registry_pull_host { + cfg.registry_pull_host = Some(v); + } + if let Some(v) = cli.registry_token { + cfg.registry_token = Some(v); + } + if let Some(v) = cli.default_writable_layer_size { + cfg.default_writable_layer_size = v; + } // ── Tracing (stdout) ─────────────────────────────────────────────────── // RUST_LOG env var takes precedence; --debug / --log-level / config is fallback. diff --git a/CubeAPI/src/models/mod.rs b/CubeAPI/src/models/mod.rs index eebb38f79..bf3bef242 100644 --- a/CubeAPI/src/models/mod.rs +++ b/CubeAPI/src/models/mod.rs @@ -466,8 +466,28 @@ pub struct TemplateDetail { } /// Body for POST /templates (create from image). -#[derive(Debug, Deserialize, Validate, ToSchema)] +/// +/// Two mutually exclusive modes are supported on the same endpoint, matching +/// both the **CubeSandbox-native** and **E2B-standard** template build flows: +/// +/// 1. CubeSandbox-native (`image` is provided): CubeMaster will pull +/// `image` from an external OCI registry and build the rootfs directly. +/// All extra fields (`exposed_ports`, `cpu`, `memory`, ...) override the +/// image defaults. +/// +/// 2. 
E2B-standard (`dockerfile` is provided): the server allocates a +/// `templateID` + `buildID`, returns a short-lived push credential, and the +/// client (`e2b template build`) pushes the locally-built image to the +/// bundled OCI registry. The actual rootfs build is then triggered by +/// `POST /templates/{tid}/builds/{bid}`. +/// +/// Field naming follows the E2B SDK conventions where they collide with +/// CubeSandbox legacy fields (camelCase for IDs, snake_case for +/// `start_cmd`/`ready_cmd`). +#[derive(Debug, Deserialize, Validate, Clone, ToSchema)] +#[allow(dead_code)] pub struct CreateTemplateRequest { + // ── Common fields (both modes) ───────────────────────────────────────── /// Deprecated and ignored. Template IDs are always generated server-side /// with the `tpl-` prefix; clients must use the returned `templateID`. #[serde(rename = "templateID", default)] @@ -475,9 +495,27 @@ pub struct CreateTemplateRequest { pub template_id: String, #[serde(rename = "instanceType", default)] pub instance_type: Option, - /// Container image reference, e.g. `registry.example.com/code:latest`. - #[validate(length(min = 1))] - pub image: String, + + /// Optional human-readable alias (E2B field: `alias`). + #[serde(default)] + pub alias: Option, + + /// E2B `teamID`. Currently only logged; reserved for multi-tenant rollout. + #[serde(rename = "teamID", default)] + pub team_id: Option, + + /// Container image reference (CubeSandbox-native mode), e.g. + /// `registry.example.com/code:latest`. Mutually exclusive with `dockerfile`. + #[serde(default)] + pub image: Option, + + /// Inline Dockerfile content (E2B-standard mode). Currently NOT built + /// server-side — the client is expected to build & push the image locally + /// using the credentials returned by this endpoint. Stored verbatim for + /// future in-cluster builds. + #[serde(default)] + pub dockerfile: Option, + /// Writable layer size for the rootfs, e.g. "1G". 
#[serde(rename = "writableLayerSize", default)] pub writable_layer_size: Option, @@ -490,15 +528,32 @@ pub struct CreateTemplateRequest { /// HTTP probe path, e.g. "/health". Defaults to "/health" when `probePort` is set. #[serde(rename = "probePath", default)] pub probe_path: Option, - /// CPU in millicores, e.g. 2000 means 2000m. + + /// CPU in millicores (legacy CubeSandbox field). #[serde(default)] pub cpu: Option, - /// Memory in MiB, e.g. 2000. + /// Memory in MiB (legacy CubeSandbox field). #[serde(default)] pub memory: Option, - /// Environment variables as "KEY=VALUE" strings. + + /// E2B-style integer CPU count (cores). Mapped to `cpu * 1000` millicores + /// when `cpu` is not supplied. + #[serde(rename = "cpuCount", default)] + pub cpu_count: Option, + + /// E2B-style memory in MiB. Mapped to `memory` when the legacy field is + /// not supplied. + #[serde(rename = "memoryMB", default)] + pub memory_mb: Option, + + /// Environment variables as "KEY=VALUE" strings (legacy CubeSandbox). #[serde(default)] pub env: Option>, + + /// E2B-style env-vars map. Merged into `env` when present. + #[serde(rename = "envVars", default)] + pub env_vars: Option>, + /// Allow internet (public) access. #[serde(rename = "allowInternetAccess", default)] pub allow_internet_access: Option, @@ -529,6 +584,16 @@ pub struct CreateTemplateRequest { /// Denied outbound CIDRs for CubeVS egress policy. #[serde(rename = "denyOut", default)] pub deny_out: Option>, + + /// E2B-style `startCmd`: shell command to execute inside the container + /// once the rootfs is mounted. Mapped to CubeMaster `args`. + #[serde(rename = "startCmd", alias = "start_cmd", default)] + pub start_cmd: Option, + + /// E2B-style `readyCmd`: shell command used as readiness probe. + /// Translated into a CubeMaster `Probe.Exec` when `probe_port` is empty. + #[serde(rename = "readyCmd", alias = "ready_cmd", default)] + pub ready_cmd: Option, } /// Body for POST /templates/:id (rebuild). 
@@ -539,21 +604,60 @@ pub struct RebuildTemplateRequest { } /// Job envelope returned by create / rebuild. -#[derive(Debug, Serialize, ToSchema)] +/// +/// E2B's CLI expects (besides the bare job state): +/// - `buildID` — opaque token that subsequent `/builds/{buildID}/...` +/// calls use to refer to *this* attempt. +/// - `uploadUrl` — URL the CLI should `docker push` to. +/// - `registry` — credentials matched against `Authorization` on /v2/*. +/// +/// All of these are emitted as *Optional* so existing CubeSandbox clients, +/// which only look at `templateID`/`status`, continue to deserialize. +#[derive(Debug, Serialize, ToSchema, Default)] pub struct TemplateBuildJob { #[serde(rename = "jobID")] pub job_id: String, #[serde(rename = "templateID")] pub template_id: String, + /// E2B-required identifier of this build attempt. Equals `jobID` when + /// CubeMaster returns one; otherwise a server-side uuid. + #[serde(rename = "buildID")] + pub build_id: String, pub status: String, pub phase: String, pub progress: i32, #[serde(rename = "errorMessage", skip_serializing_if = "String::is_empty")] pub error_message: String, + + /// E2B-style `uploadUrl`: where the CLI should push the locally-built + /// dockerfile image. Same as `registry.url` for convenience. + #[serde(rename = "uploadUrl", skip_serializing_if = "Option::is_none")] + pub upload_url: Option, + + /// Registry credentials advertised to E2B clients. + #[serde(skip_serializing_if = "Option::is_none")] + pub registry: Option, +} + +/// Short-lived push credential returned alongside a new template build. +#[derive(Debug, Serialize, Clone, ToSchema)] +pub struct RegistryCredential { + /// Full base URL of the registry endpoint, e.g. `https://cube.example.com`. + pub url: String, + /// Repository the client should push to, e.g. `e2b/tpl-abc:bld-001`. + pub repository: String, + /// Username for `docker login` / Basic auth. + pub username: String, + /// Password for `docker login` / Basic auth. 
+ pub password: String, +} /// Response for GET /templates/:id/builds/:bid/status -#[derive(Debug, Serialize, ToSchema)] +/// +/// E2B's CLI polls this endpoint with `?logsOffset=N` and expects: +/// - `status` : "building" | "ready" | "error" | "uploading" | ... +/// - `logs: string[]`: the new lines added since the previous offset. +#[derive(Debug, Serialize, ToSchema, Default)] pub struct TemplateBuildStatus { #[serde(rename = "buildID")] pub build_id: String, @@ -562,6 +666,12 @@ pub struct TemplateBuildStatus { pub status: String, pub progress: i32, pub message: String, + /// Incremental log lines starting from the offset given in the query. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub logs: Vec, + /// Offset to send back next round to receive only newer lines. + #[serde(rename = "logsOffset", skip_serializing_if = "Option::is_none")] + pub logs_offset: Option, } // ─── Cluster & Nodes ─────────────────────────────────────────────────────── @@ -717,3 +827,148 @@ pub struct VersionMatrixView { pub components: Vec, pub nodes: Vec, } + +// ─── E2B V3 protocol (real e2b SDK contract) ────────────────────────────── +// +// The e2b Python/JS SDK calls these four endpoints (camelCase JSON): +// +// 1. POST /v3/templates → register, returns +// {templateID, buildID, ...} +// 2. GET /templates/{tid}/files/{hash} → resolve cache, returns +// {present, url?} +// 3. POST /v2/templates/{tid}/builds/{bid} → trigger build, body has +// fromImage / startCmd / +// readyCmd / steps / ... +// 4. GET /templates/{tid}/builds/{bid}/status?logsOffset=N&limit=M +// → poll, returns +// {buildID, templateID, +// status, logs[], logEntries[]} +#[derive(Debug, Deserialize, Default, ToSchema)] +#[allow(dead_code)] +pub struct V3TemplateBuildRequest { + /// New-style "name" or "name:tag". The SDK *prefers* this over `alias`. + #[serde(default)] + pub name: Option, + /// Deprecated. Some older SDKs still send this. 
+ #[serde(default)] + pub alias: Option, + /// Tag list to attach to the resulting build. + #[serde(default)] + pub tags: Option>, + /// CPU cores (whole number). + #[serde(rename = "cpuCount", default)] + pub cpu_count: Option, + /// Memory in MiB. + #[serde(rename = "memoryMB", default)] + pub memory_mb: Option, + /// Team identifier — currently only logged. + #[serde(rename = "teamID", default)] + pub team_id: Option, +} + +/// Response for `POST /v3/templates` — must match `TemplateRequestResponseV3` +/// exactly: the SDK calls `from_dict` and **fails fast on missing keys**. +#[derive(Debug, Serialize, ToSchema)] +pub struct V3TemplateBuildResponse { + #[serde(rename = "templateID")] + pub template_id: String, + #[serde(rename = "buildID")] + pub build_id: String, + pub names: Vec, + pub aliases: Vec, + pub tags: Vec, + pub public: bool, +} + +/// Response for `GET /templates/{tid}/files/{hash}` — the SDK only checks +/// `present`/`url` and (when `present=false`) PUTs the tarball to `url`. +#[derive(Debug, Serialize, ToSchema)] +pub struct V3TemplateFileUpload { + pub present: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub url: Option, +} + +/// Body for `POST /v2/templates/{tid}/builds/{bid}` — the moment the build is +/// actually dispatched to CubeMaster. +#[derive(Debug, Deserialize, Default, ToSchema)] +#[allow(dead_code)] +pub struct V2TemplateBuildStart { + /// Skip-cache flag. + #[serde(default)] + pub force: Option, + /// External base image (CubeMaster `SourceImageRef`). + #[serde(rename = "fromImage", default)] + pub from_image: Option, + /// Optional registry credential block (AWS/GCP/General). Stored verbatim + /// for now; CubeMaster doesn't yet consume it. + #[serde(rename = "fromImageRegistry", default)] + pub from_image_registry: Option, + /// Reuse another already-built CubeSandbox template as the base. 
+ #[serde(rename = "fromTemplate", default)] + pub from_template: Option, + /// E2B `readyCmd` — translated into CubeMaster `Probe.Exec`. + #[serde(rename = "readyCmd", default)] + pub ready_cmd: Option, + /// E2B `startCmd` — translated into container `args`. + #[serde(rename = "startCmd", default)] + pub start_cmd: Option, + /// Multi-step build instructions (RUN/COPY/ENV/...). Currently only used + /// for hashing & log breadcrumbs; full Dockerfile-equivalent semantics + /// require the in-cluster builder (Phase 4). + #[serde(default)] + pub steps: Option>, +} + +/// Response for `GET /templates/{tid}/builds/{bid}/status` — must round-trip +/// to E2B's `TemplateBuildInfo` to satisfy the SDK's strict `from_dict`. +#[derive(Debug, Serialize, ToSchema, Default)] +pub struct V3TemplateBuildInfo { + #[serde(rename = "buildID")] + pub build_id: String, + #[serde(rename = "templateID")] + pub template_id: String, + /// One of: "waiting" | "building" | "ready" | "error". + pub status: String, + /// Plain log lines (already filtered by `logsOffset`). + pub logs: Vec, + /// Structured log entries — same content with timestamps + level. + #[serde(rename = "logEntries")] + pub log_entries: Vec, + /// Failure reason payload (only when `status == "error"`). + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, +} + +#[derive(Debug, Serialize, ToSchema)] +pub struct V3BuildLogEntry { + pub timestamp: DateTime, + pub message: String, + /// "debug" | "info" | "warn" | "error" + pub level: String, +} + +#[derive(Debug, Serialize, ToSchema)] +pub struct V3BuildStatusReason { + #[serde(rename = "stepIndex", skip_serializing_if = "Option::is_none")] + pub step_index: Option, + pub message: String, +} + +/// Query string for the `GET /templates/{tid}/builds/{bid}/status` endpoint. 
+#[derive(Debug, Deserialize, Default, IntoParams)] +#[into_params(parameter_in = Query)] +#[allow(dead_code)] +pub struct V3BuildStatusQuery { + #[serde(rename = "logsOffset", alias = "logs_offset", default)] + pub logs_offset: i32, + #[serde(default = "default_v3_log_limit")] + pub limit: i32, + #[serde(default)] + pub level: Option, +} + +fn default_v3_log_limit() -> i32 { + 100 +} + diff --git a/CubeAPI/src/routes.rs b/CubeAPI/src/routes.rs index 676343d7d..7402fc5f9 100644 --- a/CubeAPI/src/routes.rs +++ b/CubeAPI/src/routes.rs @@ -18,7 +18,10 @@ use tower_http::{ }; use crate::{ - handlers::{agenthub, cluster, config, health, sandboxes, snapshots, store, templates}, + handlers::{ + agenthub, cluster, config, health, registry, sandboxes, snapshots, store, templates, + templates_v3, + }, middleware::{auth::unified_auth, rate_limit::rate_limit}, state::AppState, }; @@ -58,10 +61,15 @@ pub fn build_router(state: AppState) -> Router { ), SNAPSHOT_LONG_ROUTE_TIMEOUT, ); + let registry_router = apply_http_layers( + build_registry_router(&state), + SNAPSHOT_LONG_ROUTE_TIMEOUT, + ); Router::new() .merge(standard_router) .merge(snapshot_long_router) + .merge(registry_router) .with_state(state) } @@ -170,6 +178,20 @@ fn build_template_routes(state: &AppState, auth_configured: bool) -> Router Router Router Router { + use axum::routing::{any, get}; + Router::new() + .route("/v2/", get(registry::ping)) + .route("/v2", get(registry::ping)) + .route("/v2/*path", any(registry::proxy)) +} + fn with_auth( routes: Router, state: &AppState, @@ -526,4 +556,42 @@ mod tests { resp.text(), ); } -} + + /// Regression: e2b Python SDK `Template.build()` calls `POST /v3/templates` + /// first; prior to the V3 routes we returned 404 with an empty body, which + /// surfaced to users as `BuildException: 404: b''`. After the fix, the + /// route exists and returns the V3 envelope. 
+ #[tokio::test] + async fn v3_template_build_routes_are_reachable() { + let server = test_server().await; + + // POST /v3/templates → 202 with templateID/buildID/names/aliases/tags/public + let resp = server + .post("/v3/templates") + .json(&serde_json::json!({ + "name": "my-tpl:dev", + "cpuCount": 1, + "memoryMB": 1024, + })) + .await; + resp.assert_status(StatusCode::ACCEPTED); + let body: serde_json::Value = resp.json(); + assert!(body["templateID"].as_str().is_some()); + assert!(body["buildID"].as_str().is_some()); + assert!(body["names"].as_array().is_some()); + assert!(body["aliases"].as_array().is_some()); + assert_eq!(body["public"].as_bool(), Some(false)); + // The trailing `:dev` should have been folded into tags. + let tags = body["tags"].as_array().expect("tags array"); + assert!(tags.iter().any(|t| t == "dev")); + + // GET /templates/{tid}/files/{hash} → 201 with present=true (cache hit) + let tid = body["templateID"].as_str().unwrap(); + let r = server + .get(&format!("/templates/{}/files/abc123", tid)) + .await; + r.assert_status(StatusCode::CREATED); + let fb: serde_json::Value = r.json(); + assert_eq!(fb["present"].as_bool(), Some(true)); + } +} \ No newline at end of file diff --git a/CubeAPI/src/services/builds.rs b/CubeAPI/src/services/builds.rs new file mode 100644 index 000000000..34ffd2d05 --- /dev/null +++ b/CubeAPI/src/services/builds.rs @@ -0,0 +1,189 @@ +// Copyright (c) 2026 Tencent Inc. +// SPDX-License-Identifier: Apache-2.0 +// + +//! Build registry — keeps E2B-compatible per-build context in memory. +//! +//! When a client invokes the E2B-style `POST /templates`, we allocate a fresh +//! `(templateID, buildID)` pair and remember: +//! +//! - the create request snapshot (so `POST /templates/{tid}/builds/{bid}` +//! can resolve into the actual CubeMaster pipeline), +//! - the docker-push registry credentials we just handed back to the client, +//! - an append-only log buffer so the polling-based `?logsOffset=N` protocol +//! 
keeps working, +//! - the CubeMaster `jobID` once the build is dispatched, used by every +//! subsequent status / logs lookup. +//! +//! The store is in-memory + bounded; restart of CubeAPI invalidates inflight +//! builds. This is acceptable for a build flow that always reaches a terminal +//! state within minutes — durable persistence can be added later as a separate +//! storage trait without changing the call sites. + +use chrono::{DateTime, Utc}; +use dashmap::DashMap; +use std::sync::Arc; +use uuid::Uuid; + +use crate::models::{CreateTemplateRequest, RegistryCredential}; + +/// Lifecycle stage as understood by the E2B CLI. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BuildStage { + /// Initial state: template has been registered, push credentials issued, + /// waiting for the client to upload the image. + WaitingPush, + /// Image has been pushed; CubeMaster pipeline is running. + Building, + /// Image-build pipeline finished successfully. + Ready, + /// Image-build pipeline failed. + Error, +} + +impl BuildStage { + pub fn as_str(self) -> &'static str { + match self { + BuildStage::WaitingPush => "waiting", + BuildStage::Building => "building", + BuildStage::Ready => "ready", + BuildStage::Error => "error", + } + } +} + +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct BuildContext { + pub template_id: String, + pub build_id: String, + /// Original create request — replayed when the client calls + /// `POST /templates/{tid}/builds/{bid}`. + pub create_request: Arc, + /// Registry credentials issued at create time. Pull host (used by + /// CubeMaster) is encoded into `image_ref` so the rest of the system + /// stays oblivious of registry internals. + pub credential: RegistryCredential, + /// Image reference CubeMaster will pull from once the client has pushed. + pub image_ref: String, + /// CubeMaster `jobID` — empty until the build is actually dispatched. + pub job_id: String, + /// Append-only log lines (timestamps + plain message). 
+ pub logs: Vec, + pub stage: BuildStage, + pub progress: i32, + pub message: String, + pub created_at: DateTime, + + // ── V3 protocol-only metadata (populated by POST /v3/templates) ──────── + /// Template name (E2B `name`), e.g. "my-template" or "my-template:v1". + pub name: String, + /// Tag list assigned at create time; the trailing ":tag" of `name` is + /// pre-pended into this list when present. + pub tags: Vec, + /// CPU cores requested via E2B `cpuCount`. + pub cpu_count: u32, + /// Memory in MiB requested via E2B `memoryMB`. + pub memory_mb: u32, + /// Aliases list returned to the client (currently == [name without tag]). + pub aliases: Vec, +} + +#[derive(Debug, Clone)] +pub struct BuildLogLine { + pub timestamp: DateTime, + pub line: String, +} + +/// Thread-safe, in-process build registry. +#[derive(Clone, Default)] +pub struct BuildRegistry { + inner: Arc>, +} + +impl BuildRegistry { + pub fn new() -> Self { + Self::default() + } + + /// Register a brand-new build attempt. Returns the freshly allocated + /// build_id alongside the stored context (cloned for read-only use by the + /// caller). + pub fn create( + &self, + template_id: String, + request: CreateTemplateRequest, + credential: RegistryCredential, + image_ref: String, + ) -> BuildContext { + let build_id = format!("bld-{}", Uuid::new_v4().simple()); + let ctx = BuildContext { + template_id: template_id.clone(), + build_id: build_id.clone(), + create_request: Arc::new(request), + credential, + image_ref, + job_id: String::new(), + logs: Vec::new(), + stage: BuildStage::WaitingPush, + progress: 0, + message: "build registered, waiting for image push".to_string(), + created_at: Utc::now(), + name: String::new(), + tags: Vec::new(), + cpu_count: 0, + memory_mb: 0, + aliases: Vec::new(), + }; + + // Index under both bid and (tid, bid) so lookups by either key work. 
+ self.inner.insert(build_id.clone(), ctx.clone()); + self.inner.insert(compose_key(&template_id, &build_id), ctx.clone()); + ctx + } + + pub fn get(&self, build_id: &str) -> Option { + self.inner.get(build_id).map(|r| r.value().clone()) + } + + pub fn get_by_pair(&self, template_id: &str, build_id: &str) -> Option { + self.inner + .get(&compose_key(template_id, build_id)) + .or_else(|| self.inner.get(build_id)) + .map(|r| r.value().clone()) + } + + /// Apply a mutation to a build context. Updates both index entries. + pub fn update(&self, build_id: &str, mutate: F) -> Option + where + F: FnOnce(&mut BuildContext), + { + let mut ctx = self.inner.get(build_id).map(|r| r.value().clone())?; + mutate(&mut ctx); + + let pair_key = compose_key(&ctx.template_id, &ctx.build_id); + self.inner.insert(build_id.to_string(), ctx.clone()); + self.inner.insert(pair_key, ctx.clone()); + Some(ctx) + } + + /// Append one log line. Truncates the head to bound memory at ~10k lines. + pub fn append_log(&self, build_id: &str, line: impl Into) { + let line = line.into(); + self.update(build_id, |ctx| { + ctx.logs.push(BuildLogLine { + timestamp: Utc::now(), + line, + }); + const MAX_LOGS: usize = 10_000; + if ctx.logs.len() > MAX_LOGS { + let drop = ctx.logs.len() - MAX_LOGS; + ctx.logs.drain(0..drop); + } + }); + } +} + +fn compose_key(template_id: &str, build_id: &str) -> String { + format!("{}::{}", template_id, build_id) +} diff --git a/CubeAPI/src/services/mod.rs b/CubeAPI/src/services/mod.rs index 29a4edf73..43cb524a7 100644 --- a/CubeAPI/src/services/mod.rs +++ b/CubeAPI/src/services/mod.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +pub mod builds; pub mod cluster; pub mod sandboxes; pub mod snapshots; @@ -15,10 +16,13 @@ pub struct AppServices { pub sandboxes: sandboxes::SandboxService, pub snapshots: snapshots::SnapshotService, pub templates: templates::TemplateService, + #[allow(dead_code)] + pub builds: builds::BuildRegistry, } impl AppServices { pub fn 
new(config: &ServerConfig, cubemaster: CubeMasterClient) -> Self { + let builds = builds::BuildRegistry::new(); Self { cluster: cluster::ClusterService::new(cubemaster.clone()), sandboxes: sandboxes::SandboxService::new( @@ -30,7 +34,13 @@ impl AppServices { cubemaster.clone(), config.instance_type.clone(), ), - templates: templates::TemplateService::new(cubemaster, config.instance_type.clone()), + templates: templates::TemplateService::new( + cubemaster, + config.instance_type.clone(), + builds.clone(), + config.clone(), + ), + builds, } } } diff --git a/CubeAPI/src/services/sandboxes.rs b/CubeAPI/src/services/sandboxes.rs index 0b56b4efb..28d69c643 100644 --- a/CubeAPI/src/services/sandboxes.rs +++ b/CubeAPI/src/services/sandboxes.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use uuid::Uuid; use crate::{ - constants::ENVD_VERSION, + constants::{ENVD_PORT_STR, ENVD_VERSION}, cubemaster::{ datetime_from_unix_nanos, extract_template_id, CreateSandboxRequest, CubeMasterClient, CubeMasterError, CubeVSContext, DeleteSandboxRequest, ListSandboxRequest, SandboxInfo, @@ -27,6 +27,17 @@ const RET_CODE_NOT_FOUND: i32 = 130404; const RET_CODE_CONFLICT: i32 = 130409; const HOSTDIR_MOUNT_KEY: &str = "host-mount"; +/// CubeMaster annotation key for the list of TCP ports we want cubelet to +/// expose on the host (colon-separated, e.g. `"49983:8080"`). The port list +/// is consumed in `CubeMaster/pkg/service/sandbox/util.go::getExposedPorts` +/// and ends up in the per-sandbox redis metadata read by cube-proxy. +const ANNO_EXPOSED_PORTS: &str = "com.exposed_ports"; + +/// Optional `metadata` key that lets callers override the exposed port list +/// for a specific sandbox (colon-separated). When absent we just publish the +/// default envd port so the e2b SDK can connect. 
+const META_EXPOSED_PORTS: &str = "exposed-ports"; + #[derive(Clone)] pub struct SandboxService { cubemaster: CubeMasterClient, @@ -126,12 +137,28 @@ impl SandboxService { ), ]); + // The e2b SDK always talks to envd on port 49983 via the + // `-.` host scheme. cube-proxy looks up the + // host-port mapping for that container port in redis, which is only + // populated when the sandbox creation request advertises it through + // the `com.exposed_ports` annotation. Keep this annotation in sync, + // but allow callers to override the list via `metadata.exposed-ports` + // (colon-separated) when they need to expose more ports. let labels = body.metadata.map(|mut meta| { if let Some(value) = meta.remove(HOSTDIR_MOUNT_KEY) { annotations.insert(HOSTDIR_MOUNT_KEY.to_string(), value); } + if let Some(raw) = meta.remove(META_EXPOSED_PORTS) { + annotations.insert( + ANNO_EXPOSED_PORTS.to_string(), + merge_exposed_ports(&raw), + ); + } meta }); + annotations + .entry(ANNO_EXPOSED_PORTS.to_string()) + .or_insert_with(|| ENVD_PORT_STR.to_string()); let req = CreateSandboxRequest { request_id: new_request_id(), @@ -146,15 +173,57 @@ impl SandboxService { cubevs_context: build_cubevs_context(body.allow_internet_access, body.network.as_ref()), }; + tracing::info!( + template_id = %template_id, + request_id = %req.request_id, + instance_type = %req.instance_type, + exposed_ports = %req.annotations + .get(ANNO_EXPOSED_PORTS) + .cloned() + .unwrap_or_else(|| "".to_string()), + annotations = ?req.annotations, + "creating sandbox from template" + ); + let resp = self .cubemaster .create_sandbox(&req) .await - .map_err(internal_error)?; + .map_err(|e| { + tracing::error!( + template_id = %template_id, + request_id = %req.request_id, + error = %e, + "cubemaster create_sandbox transport failed" + ); + internal_error(format!( + "cubemaster transport failed (templateID={}, requestID={}): {}", + template_id, req.request_id, e + )) + })?; - resp.ret.into_result().map_err(internal_error)?; + 
let sandbox_id = resp.sandbox_id.clone(); + let resp_request_id = resp.request_id.clone(); + if let Err(e) = resp.ret.into_result() { + tracing::error!( + template_id = %template_id, + request_id = %req.request_id, + cubemaster_error = %e, + "cubemaster rejected sandbox creation \ + — likely a microVM-level failure (rootfs mount / agent restore). \ + Inspect cube-agent and cubelet logs for stderr from \ + 'do_exec_mount' / 'start_exec_process'." + ); + return Err(internal_error(format!( + "sandbox creation failed for templateID={} (requestID={}): {} \ + — this happens at the microVM layer (cube-agent restore/mount); \ + check cube-agent / cubelet logs on the host for the underlying \ + mount error", + template_id, req.request_id, e + ))); + } - Ok(self.sandbox_response(template_id, resp.sandbox_id, resp.request_id)) + Ok(self.sandbox_response(template_id, sandbox_id, resp_request_id)) } pub async fn kill_sandbox(&self, sandbox_id: &str) -> AppResult<()> { @@ -470,6 +539,33 @@ fn internal_error(error: impl std::fmt::Display) -> AppError { AppError::Internal(anyhow::anyhow!(error.to_string())) } +/// Merge a caller-supplied colon-separated port list with the mandatory envd +/// port (49983), preserving order, removing duplicates and silently dropping +/// non-numeric entries. The result is the value to set on the +/// `com.exposed_ports` annotation that CubeMaster understands. +fn merge_exposed_ports(raw: &str) -> String { + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + let mut ordered: Vec = Vec::new(); + for part in raw.split(':') { + let trimmed = part.trim(); + if trimmed.is_empty() { + continue; + } + if trimmed.parse::().is_err() { + // Skip silently — CubeMaster's getExposedPorts would otherwise + // reject the whole sandbox creation with an InvalidParam error. 
+ continue; + } + if seen.insert(trimmed.to_string()) { + ordered.push(trimmed.to_string()); + } + } + if seen.insert(ENVD_PORT_STR.to_string()) { + ordered.push(ENVD_PORT_STR.to_string()); + } + ordered.join(":") +} + fn sandbox_not_found_or_internal(e: CubeMasterError, sandbox_id: &str) -> AppError { if e.is_not_found() { AppError::NotFound(format!("sandbox {} not found", sandbox_id)) @@ -635,10 +731,40 @@ pub(crate) fn build_cubevs_context( mod tests { use std::collections::HashMap; - use super::{build_cubevs_context, filter_by_metadata, from_cubemaster_info}; + use super::{build_cubevs_context, filter_by_metadata, from_cubemaster_info, merge_exposed_ports}; use crate::cubemaster::{ListSandboxResponse, SandboxInfo}; use crate::models::{SandboxNetworkConfig, SandboxState}; + #[test] + fn merge_exposed_ports_appends_envd_port_when_missing() { + // Caller supplied two ports but no envd port → 49983 must be appended. + assert_eq!(merge_exposed_ports("80:8080"), "80:8080:49983"); + } + + #[test] + fn merge_exposed_ports_keeps_envd_port_position_when_already_listed() { + // 49983 already listed first → no duplicate, original order preserved. + assert_eq!(merge_exposed_ports("49983:8080"), "49983:8080"); + } + + #[test] + fn merge_exposed_ports_skips_non_numeric_and_dedupes() { + // Empty segments, dupes and garbage tokens get filtered without + // poisoning the request (CubeMaster would otherwise reject the whole + // sandbox create). + assert_eq!( + merge_exposed_ports("80::80:abc:8080:49983"), + "80:8080:49983" + ); + } + + #[test] + fn merge_exposed_ports_handles_empty_input() { + // Empty caller input still publishes the envd port so cube-proxy can + // route the e2b SDK's `49983-.` host. 
+ assert_eq!(merge_exposed_ports(""), "49983"); + } + #[test] fn metadata_filter_matches_all_pairs() { let metadata = HashMap::from([ diff --git a/CubeAPI/src/services/templates.rs b/CubeAPI/src/services/templates.rs index 605769074..e65473145 100644 --- a/CubeAPI/src/services/templates.rs +++ b/CubeAPI/src/services/templates.rs @@ -2,9 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::collections::HashMap; + use uuid::Uuid; use crate::{ + config::ServerConfig, cubemaster::{ CreateTemplateContainerOverrides, CreateTemplateCubeVSContext, CreateTemplateEnv, CreateTemplateFromImageReq, CreateTemplateResources, CubeMasterClient, CubeMasterError, @@ -13,22 +16,34 @@ use crate::{ }, error::{AppError, AppResult}, models::{ - CreateTemplateRequest, RebuildTemplateRequest, TemplateBuildJob, TemplateBuildStatus, - TemplateDetail, TemplateSummary, + CreateTemplateRequest, RebuildTemplateRequest, RegistryCredential, TemplateBuildJob, + TemplateBuildStatus, TemplateDetail, TemplateSummary, V2TemplateBuildStart, + V3BuildLogEntry, V3BuildStatusReason, V3TemplateBuildInfo, V3TemplateBuildRequest, + V3TemplateBuildResponse, }, + services::builds::{BuildRegistry, BuildStage}, }; #[derive(Clone)] pub struct TemplateService { cubemaster: CubeMasterClient, instance_type: String, + builds: BuildRegistry, + config: ServerConfig, } impl TemplateService { - pub fn new(cubemaster: CubeMasterClient, instance_type: String) -> Self { + pub fn new( + cubemaster: CubeMasterClient, + instance_type: String, + builds: BuildRegistry, + config: ServerConfig, + ) -> Self { Self { cubemaster, instance_type, + builds, + config, } } @@ -74,13 +89,7 @@ impl TemplateService { .as_ref() .and_then(|v| v.get("network_type")) .and_then(|v| v.as_str()) - .and_then(|s| { - if s.is_empty() { - None - } else { - Some(s.to_string()) - } - }); + .and_then(|s| if s.is_empty() { None } else { Some(s.to_string()) }); let allow_internet_access = resp .create_request .as_ref() @@ -101,45 +110,147 @@ impl 
TemplateService { }) } + /// Create a new template. + /// + /// Two paths converge here: + /// + /// - **CubeSandbox-native** (`image` provided): immediately dispatches to + /// CubeMaster `POST /cube/template/from-image` and returns the resulting + /// job. No registry credential is issued. + /// + /// - **E2B-standard** (`dockerfile` provided, or no `image`): allocates a + /// fresh `buildID`, returns the docker-push credential pointing at the + /// bundled OCI registry, and *does not* trigger CubeMaster yet — the + /// client must complete the docker push and then call + /// `POST /templates/{tid}/builds/{bid}` to dispatch the actual rootfs + /// build. pub async fn create_template( &self, body: CreateTemplateRequest, ) -> AppResult { - if body.image.trim().is_empty() { - return Err(AppError::BadRequest("image is required".to_string())); + if body.dockerfile.is_some() || body.image.is_none() { + return self.create_template_e2b_mode(body).await; } + self.create_template_native_mode(body).await + } - let dns_servers = validate_dns_servers(body.dns.as_deref())?; - let container_overrides = build_template_container_overrides(&body, dns_servers.as_deref()); - let cubevs_context = build_template_cubevs_context(&body); - - let req = CreateTemplateFromImageReq { - request_id: new_request_id(), - instance_type: body - .instance_type - .unwrap_or_else(|| self.instance_type.clone()), - // template_id is intentionally left empty — CubeMaster always - // auto-generates it with the "tpl-" prefix via - // normalizeTemplateImageRequest. 
- template_id: String::new(), - source_image_ref: body.image.trim().to_string(), - writable_layer_size: body.writable_layer_size, - exposed_ports: body.exposed_ports, - network_type: non_empty_option(body.network_type), - registry_username: non_empty_option(body.registry_username), - registry_password: non_empty_option(body.registry_password), - distribution_scope: non_empty_vec(body.nodes), - container_overrides, - cubevs_context, - }; - + /// Path 1: CubeSandbox-native — `image` field carries an existing OCI + /// reference, dispatch directly. + async fn create_template_native_mode( + &self, + body: CreateTemplateRequest, + ) -> AppResult { + let image = body.image.clone().unwrap_or_default(); + if image.trim().is_empty() { + return Err(AppError::BadRequest("image is required".to_string())); + } + // Validate DNS servers up-front so callers see a clear error before + // we hand off to CubeMaster. + validate_dns_servers(body.dns.as_deref())?; + let req = self.build_cubemaster_request(&body, image.trim().to_string()); let resp = self .cubemaster .create_template_from_image(&req) .await .map_err(map_err)?; + Ok(to_job(resp, None)) + } + + /// Path 2: E2B-standard — allocate `(templateID, buildID)`, return docker + /// push credentials. Actual build is dispatched by `start_template_build`. + async fn create_template_e2b_mode( + &self, + body: CreateTemplateRequest, + ) -> AppResult { + let upstream = self.config.registry_upstream.as_deref().unwrap_or(""); + if upstream.trim().is_empty() { + return Err(AppError::NotImplemented( + "registry upstream is not configured: set CUBE_API_REGISTRY_UPSTREAM \ + to enable e2b-style template build (dockerfile push)" + .to_string(), + )); + } + + // Allocate / honour template id. + let template_id = if body.template_id.trim().is_empty() { + format!("tpl-{}", Uuid::new_v4().simple()) + } else { + body.template_id.trim().to_string() + }; - Ok(to_job(resp)) + // Decide repo + tag and the public URL the CLI should push to. 
+ let repo_prefix = self.config.registry_repo_prefix.trim(); + let repo_prefix = if repo_prefix.is_empty() { + "e2b" + } else { + repo_prefix + }; + let public_host = self + .config + .registry_public_host + .clone() + .or_else(|| host_from_url(upstream)) + .unwrap_or_else(|| "localhost".to_string()); + + let credential_url = if upstream.starts_with("https://") || upstream.starts_with("http://") { + // strip path, keep scheme://host:port + base_url(upstream).to_string() + } else { + format!("https://{}", public_host) + }; + + let credential = RegistryCredential { + url: credential_url, + repository: format!("{}/{}", repo_prefix, template_id), + username: "_token".to_string(), + password: self + .config + .registry_token + .clone() + .unwrap_or_else(|| "_anon".to_string()), + }; + + // Image ref CubeMaster will pull from once push is complete. + let pull_host = self + .config + .registry_pull_host + .clone() + .or_else(|| host_from_url(upstream)) + .unwrap_or_else(|| public_host.clone()); + let image_ref_template = format!("{}/{}/{}", pull_host, repo_prefix, template_id); + + // Reserve the build context up-front; the buildID becomes the docker tag. + let ctx = self.builds.create( + template_id.clone(), + body, + credential.clone(), + image_ref_template.clone(), + ); + let image_ref_full = format!("{}:{}", image_ref_template, ctx.build_id); + + // Patch the stored ref to include the buildID-as-tag now that we know it. 
+ self.builds.update(&ctx.build_id, |c| { + c.image_ref = image_ref_full.clone(); + }); + self.builds.append_log( + &ctx.build_id, + format!( + "[register] templateID={} buildID={} repo={}", + template_id, ctx.build_id, credential.repository + ), + ); + + Ok(TemplateBuildJob { + job_id: ctx.build_id.clone(), + template_id: template_id.clone(), + build_id: ctx.build_id.clone(), + status: "accepted".to_string(), + phase: "waiting".to_string(), + progress: 0, + error_message: String::new(), + upload_url: Some(credential.url.clone()), + registry: Some(credential), + }) } pub async fn rebuild_template( @@ -154,8 +265,7 @@ impl TemplateService { }; let resp = self.cubemaster.redo_template(&req).await.map_err(map_err)?; - - Ok(to_job(resp)) + Ok(to_job(resp, None)) } pub async fn delete_template( @@ -179,23 +289,124 @@ impl TemplateService { Ok(()) } - pub async fn start_template_build(&self, template_id: String) -> AppResult { + /// Dispatch the CubeMaster pipeline for a previously-registered E2B + /// build. Falls back to a plain `redoTemplate` for builds that were not + /// registered through `create_template_e2b_mode` (e.g. CLI invokes + /// `start_template_build` directly on a CubeSandbox-native template). 
+ pub async fn start_template_build( + &self, + template_id: String, + build_id: Option, + ) -> AppResult { + if let Some(bid) = build_id.as_deref() { + if let Some(ctx) = self.builds.get_by_pair(&template_id, bid) { + self.builds.append_log( + bid, + format!("[dispatch] image_ref={}", ctx.image_ref), + ); + + let req = self + .build_cubemaster_request(&ctx.create_request, ctx.image_ref.clone()); + let resp = self + .cubemaster + .create_template_from_image(&req) + .await + .map_err(map_err)?; + + let job = resp.job.clone().unwrap_or_else(default_template_job); + let job_id = job.job_id.clone(); + self.builds.update(bid, |c| { + c.job_id = job_id.clone(); + c.stage = BuildStage::Building; + c.message = "build dispatched to cubemaster".to_string(); + }); + + return Ok(to_job(resp, Some(bid.to_string()))); + } + } + + // Fallback for legacy `redo` semantics. let req = RedoTemplateReq { request_id: new_request_id(), template_id, extra: Default::default(), }; - let resp = self.cubemaster.redo_template(&req).await.map_err(map_err)?; - - Ok(to_job(resp)) + Ok(to_job(resp, build_id)) } pub async fn get_template_build_status( &self, template_id: &str, build_id: &str, + logs_offset: i32, ) -> AppResult { + // E2B mode: serve from the in-memory build registry, falling back to + // CubeMaster for the canonical job state when it's been dispatched. + if let Some(ctx) = self.builds.get_by_pair(template_id, build_id) { + let mut status = ctx.stage.as_str().to_string(); + let mut progress = ctx.progress; + let mut message = ctx.message.clone(); + + if !ctx.job_id.is_empty() { + if let Ok(remote) = self + .cubemaster + .get_template_build_status(&ctx.job_id) + .await + { + status = remap_cubemaster_status(&remote.status); + progress = remote.progress; + message = remote.message.clone(); + + // Persist progress / terminal state into the local registry. 
+ let new_stage = match status.as_str() { + "ready" => BuildStage::Ready, + "error" => BuildStage::Error, + _ => BuildStage::Building, + }; + self.builds.update(build_id, |c| { + c.stage = new_stage; + c.progress = progress; + c.message = message.clone(); + }); + + if !remote.message.is_empty() { + self.builds.append_log( + build_id, + format!("[{}] {}", remote.status, remote.message), + ); + } + } + } + + // Slice logs starting at the requested offset. + let total = ctx.logs.len() as i32; + let offset = logs_offset.max(0).min(total); + let lines: Vec = self + .builds + .get(build_id) + .map(|c| { + c.logs + .iter() + .skip(offset as usize) + .map(|l| format!("{} {}", l.timestamp.to_rfc3339(), l.line)) + .collect() + }) + .unwrap_or_default(); + let next_offset = offset + lines.len() as i32; + + return Ok(TemplateBuildStatus { + build_id: build_id.to_string(), + template_id: template_id.to_string(), + status, + progress, + message, + logs: lines, + logs_offset: Some(next_offset), + }); + } + + // Legacy native mode: forward to CubeMaster directly (no log buffer). 
let resp = self .cubemaster .get_template_build_status(build_id) @@ -205,27 +416,713 @@ impl TemplateService { Ok(TemplateBuildStatus { build_id: string_or(resp.build_id, build_id), template_id: string_or(resp.template_id, template_id), - status: resp.status, + status: remap_cubemaster_status(&resp.status), progress: resp.progress, message: resp.message, + logs: Vec::new(), + logs_offset: None, + }) + } + + pub async fn get_template_build_logs( + &self, + template_id: &str, + build_id: &str, + offset: i32, + ) -> AppResult { + let status = self + .get_template_build_status(template_id, build_id, offset) + .await?; + + Ok(serde_json::json!({ + "buildID": status.build_id, + "templateID": status.template_id, + "status": status.status, + "progress": status.progress, + "logs": status.logs, + "logsOffset": status.logs_offset, + })) + } + + /// Mark a build as image-pushed (called by the registry handler once the + /// manifest PUT for `repo:tag` succeeds). Idempotent. + pub fn mark_image_pushed(&self, build_id: &str) { + self.builds.update(build_id, |ctx| { + ctx.append_log_inline("[push] image upload complete"); + if matches!(ctx.stage, BuildStage::WaitingPush) { + ctx.stage = BuildStage::Building; + ctx.message = "image uploaded, waiting for build dispatch".to_string(); + } + }); + } + + /// Build a CubeMaster create-from-image request from the user's intent + /// (used by both create paths so behaviour stays in lockstep). + fn build_cubemaster_request( + &self, + body: &CreateTemplateRequest, + image_ref: String, + ) -> CreateTemplateFromImageReq { + let probe = build_probe(body); + let resources = build_resources(body); + let envs = merge_envs(body); + let command = non_empty_vec(body.command.clone()); + let args = non_empty_vec(body.args.clone()); + // We've already validated DNS servers up the call stack; here we just + // canonicalise and drop empties. 
+ let dns_servers: Option> = body.dns.as_ref().and_then(|servers| { + let cleaned: Vec = servers + .iter() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + if cleaned.is_empty() { + None + } else { + Some(cleaned) + } + }); + let dns_config = dns_servers.map(|servers| DnsConfig { + servers, + searches: Vec::new(), + }); + + let container_overrides = if probe.is_some() + || resources.is_some() + || envs.is_some() + || command.is_some() + || args.is_some() + || dns_config.is_some() + { + Some(CreateTemplateContainerOverrides { + command, + args, + probe, + resources, + envs, + dns_config, + }) + } else { + None + }; + + let allow_out = body.allow_out.clone().unwrap_or_default(); + let deny_out = body.deny_out.clone().unwrap_or_default(); + let cubevs_context = if body.allow_internet_access.is_some() + || !allow_out.is_empty() + || !deny_out.is_empty() + { + Some(CreateTemplateCubeVSContext { + allow_internet_access: body.allow_internet_access, + allow_out, + deny_out, + }) + } else { + None + }; + + CreateTemplateFromImageReq { + request_id: new_request_id(), + instance_type: body + .instance_type + .clone() + .unwrap_or_else(|| self.instance_type.clone()), + template_id: body.template_id.clone(), + source_image_ref: image_ref, + // CubeMaster validates `writable_layer_size` as required; fall back + // to the configured default (env CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE, + // "1G" by default) when the caller hasn't specified one. The E2B V3 + // SDK never sends this field, so without a default the build would + // fail with "writable_layer_size is required". 
+ writable_layer_size: body + .writable_layer_size + .clone() + .filter(|s| !s.trim().is_empty()) + .or_else(|| Some(self.config.default_writable_layer_size.clone())) + .filter(|s| !s.trim().is_empty()), + exposed_ports: body.exposed_ports.clone(), + network_type: non_empty_option(body.network_type.clone()), + registry_username: non_empty_option(body.registry_username.clone()), + registry_password: non_empty_option(body.registry_password.clone()), + distribution_scope: non_empty_vec(body.nodes.clone()), + container_overrides, + cubevs_context, + } + } + + // ── V3 protocol (real e2b SDK contract) ──────────────────────────────── + + /// `POST /v3/templates` — register a template + build attempt. + /// + /// Returns the V3 envelope shape the SDK strictly expects. We allocate + /// `(templateID, buildID)` deterministically from `name` so subsequent + /// builds against the same name reuse the same templateID (matching E2B's + /// "alias is also a primary key" semantics). + pub fn v3_create_template( + &self, + body: V3TemplateBuildRequest, + ) -> AppResult { + // Resolve final name + tag list (the SDK packs "name:tag" or relies on + // the explicit `tags` array). + let raw_name = body + .name + .clone() + .or(body.alias.clone()) + .filter(|s| !s.trim().is_empty()) + .ok_or_else(|| AppError::BadRequest("template name is required".to_string()))?; + let (name_part, name_tag) = match raw_name.split_once(':') { + Some((n, t)) if !t.is_empty() => (n.to_string(), Some(t.to_string())), + _ => (raw_name.clone(), None), + }; + let mut tags = body.tags.clone().unwrap_or_default(); + if let Some(t) = name_tag.clone() { + if !tags.contains(&t) { + tags.insert(0, t); + } + } + + let template_id = stable_template_id(&name_part); + + // Build the legacy request shell so the V2 trigger step has uniform + // metadata regardless of whether create-time fields are sparse. 
+ let create_req = CreateTemplateRequest { + template_id: template_id.clone(), + instance_type: None, + alias: Some(name_part.clone()), + team_id: body.team_id.clone(), + image: None, + dockerfile: None, + writable_layer_size: None, + exposed_ports: None, + probe_port: None, + probe_path: None, + cpu: None, + memory: None, + cpu_count: body.cpu_count, + memory_mb: body.memory_mb, + env: None, + env_vars: None, + allow_internet_access: None, + network_type: None, + nodes: None, + registry_username: None, + registry_password: None, + command: None, + args: None, + dns: None, + allow_out: None, + deny_out: None, + start_cmd: None, + ready_cmd: None, + }; + + // Reserve a build context. Registry credential is attached for the + // benefit of the OCI-push flow (`/v2/...` reverse proxy); SDK V3 won't + // actually use it — it ships a tarball through `/templates/.../files/`. + let credential = self.issue_registry_credential(&template_id); + let pull_host = self + .config + .registry_pull_host + .clone() + .or_else(|| self.config.registry_upstream.as_deref().and_then(host_from_url)) + .unwrap_or_else(|| { + self.config + .registry_public_host + .clone() + .unwrap_or_else(|| "localhost".to_string()) + }); + let repo_prefix = if self.config.registry_repo_prefix.trim().is_empty() { + "e2b" + } else { + self.config.registry_repo_prefix.trim() + }; + let image_ref_template = format!("{}/{}/{}", pull_host, repo_prefix, template_id); + + let ctx = self.builds.create( + template_id.clone(), + create_req, + credential, + image_ref_template.clone(), + ); + + let build_id = ctx.build_id.clone(); + let cpu_count = body.cpu_count.unwrap_or(2); + let memory_mb = body.memory_mb.unwrap_or(1024); + self.builds.update(&build_id, |c| { + c.image_ref = format!("{}:{}", image_ref_template, build_id); + c.name = name_part.clone(); + c.tags = tags.clone(); + c.cpu_count = cpu_count; + c.memory_mb = memory_mb; + c.aliases = vec![name_part.clone()]; + c.message = "template registered, awaiting 
build trigger".to_string(); + }); + self.builds.append_log( + &build_id, + format!( + "[register-v3] templateID={} buildID={} name={} cpu={} memMB={}", + template_id, build_id, name_part, cpu_count, memory_mb + ), + ); + + Ok(V3TemplateBuildResponse { + template_id, + build_id, + names: vec![name_part.clone()], + aliases: vec![name_part], + tags, + public: false, }) } - pub async fn get_template_build_logs(&self, build_id: &str) -> AppResult { + /// `GET /templates/{tid}/files/{hash}` — file-cache probe. + /// + /// Until the in-cluster builder lands we don't actually consume uploaded + /// tarballs. We answer `present=true` so the SDK skips uploading; this is + /// safe because `from_image`-based builds (the only flow CubeMaster + /// currently supports) don't need the build context. + pub fn v3_get_file_upload(&self, _template_id: &str, _files_hash: &str) -> AppResult { + Ok(crate::models::V3TemplateFileUpload { + present: true, + url: None, + }) + } + + /// `POST /v2/templates/{tid}/builds/{bid}` — the real "start build" call. + /// + /// At this point CubeMaster needs an OCI image reference. We resolve one + /// in this priority order: + /// + /// 1. `body.from_image` — the standard E2B flow, e.g. + /// `python:3.11-slim`. + /// 2. `body.from_template` — reuse another CubeSandbox template as the + /// base, forwarded as a synthesised `cube://<templateID>` + /// reference. + /// 3. The `image_ref` reserved for this build by `v3_create_template` + /// (the bundled-registry location an OCI push would populate). + /// + /// `start_cmd` is recorded only; a loopback URL in `ready_cmd` becomes an HttpGet probe. + pub async fn v3_trigger_build( + &self, + template_id: String, + build_id: String, + body: V2TemplateBuildStart, + ) -> AppResult<()> { + let ctx = self + .builds + .get_by_pair(&template_id, &build_id) + .ok_or_else(|| { + AppError::NotFound(format!( + "build {} of template {} is unknown — call POST /v3/templates first", + build_id, template_id + )) + })?; + + // Resolve the source image. 
+ let source_image = if let Some(img) = body + .from_image + .as_ref() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + { + img + } else if let Some(parent) = body + .from_template + .as_ref() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + { + // Re-use an already-built CubeSandbox template as the base. We + // synthesise a CubeMaster reference of the form `cube://<templateID>`, + // letting downstream callers resolve it. Adjust to your local + // convention if needed. + format!("cube://{}", parent) + } else if !ctx.image_ref.is_empty() { + ctx.image_ref.clone() + } else { + return Err(AppError::BadRequest( + "either fromImage, fromTemplate, or a previously-pushed image is required" + .to_string(), + )); + }; + + // Patch the cached create_request with the V2-time fields and dispatch. + let mut create_req: CreateTemplateRequest = (*ctx.create_request).clone(); + if create_req.start_cmd.is_none() { + create_req.start_cmd = body.start_cmd.clone(); + } + if create_req.ready_cmd.is_none() { + create_req.ready_cmd = body.ready_cmd.clone(); + } + + self.builds.append_log( + &build_id, + format!( + "[dispatch-v3] from_image={} start_cmd={:?} ready_cmd={:?} steps={}", + source_image, + body.start_cmd.as_deref().unwrap_or(""), + body.ready_cmd.as_deref().unwrap_or(""), + body.steps.as_ref().map(|s| s.len()).unwrap_or(0), + ), + ); + + // Cubelet/CubeMaster only support TcpSocket | Ping | HttpGet probes, + // so the E2B `readyCmd` (a shell snippet) cannot be forwarded + // verbatim. To still honour the SDK's `wait_for_url(...)` semantics + // we attempt a best-effort parse of the well-known + // `http://<host>:<port>/<path>` form embedded in the readyCmd. When + // that succeeds (and the caller did not already pin probe_port/path + // via the v3 body), we synthesise an HttpGet probe so cubelet's + // `doProbe` blocks until the user process is actually listening + // before sandbox creation returns. 
+ let ready_cmd = body + .ready_cmd + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()); + if let Some(cmd) = ready_cmd { + match parse_ready_url(cmd) { + Some((port, path)) if create_req.probe_port.is_none() => { + create_req.probe_port = Some(port); + if create_req.probe_path.is_none() { + create_req.probe_path = Some(path.clone()); + } + self.builds.append_log( + &build_id, + format!( + "[dispatch-v3] readyCmd parsed → HttpGet probe on \ + port={} path={} (probe blocks sandbox creation \ + until ready)", + port, path + ), + ); + } + Some(_) => { + // probe_port already set by caller — keep their override + // but make the precedence explicit in the build log. + self.builds.append_log( + &build_id, + "[dispatch-v3] readyCmd parsed but probePort was \ + supplied explicitly — keeping caller's probePort \ + and ignoring the URL inside readyCmd", + ); + } + None + if create_req.probe_port.is_none() + && create_req + .exposed_ports + .as_ref() + .map(|p| p.is_empty()) + .unwrap_or(true) => + { + self.builds.append_log( + &build_id, + "[dispatch-v3] note: readyCmd is recorded but could \ + not be parsed into an HttpGet probe (only \ + `http://host:port/path` URLs are recognised); \ + supply `probePort` (or build with `exposedPorts`) \ + to enable readiness checks", + ); + } + None => { + // Caller already supplied probe_port or exposed_ports; + // build_probe() will pick those up on its own. 
+ } + } + } + + let req = self.build_cubemaster_request(&create_req, source_image.clone()); let resp = self .cubemaster - .get_template_build_status(build_id) + .create_template_from_image(&req) .await .map_err(map_err)?; - let line = build_log_line(&resp.status, resp.progress, &resp.message); + let job = resp.job.unwrap_or_else(default_template_job); + let job_id = job.job_id.clone(); + self.builds.update(&build_id, |c| { + c.job_id = job_id.clone(); + c.stage = BuildStage::Building; + c.message = "build dispatched to cubemaster".to_string(); + }); - Ok(serde_json::json!({ - "buildID": build_id, - "status": resp.status, - "progress": resp.progress, - "lines": [line], - })) + Ok(()) + } + + /// `GET /templates/{tid}/builds/{bid}/status` — V3 status envelope. + pub async fn v3_get_build_status( + &self, + template_id: &str, + build_id: &str, + logs_offset: i32, + limit: i32, + ) -> AppResult { + // Reuse the existing get_template_build_status (which already knows + // how to refresh against CubeMaster), then convert into the V3 shape. 
+ let internal = self + .get_template_build_status(template_id, build_id, logs_offset) + .await?; + + let limit = if limit <= 0 { 100 } else { limit as usize }; + let logs: Vec = internal + .logs + .iter() + .take(limit) + .cloned() + .collect(); + let log_entries: Vec = logs + .iter() + .map(|line| V3BuildLogEntry { + timestamp: chrono::Utc::now(), + message: line.clone(), + level: "info".to_string(), + }) + .collect(); + + let status = match internal.status.as_str() { + "ready" => "ready", + "error" => "error", + "waiting" | "pending" => "waiting", + _ => "building", + } + .to_string(); + + let reason = if status == "error" { + Some(V3BuildStatusReason { + step_index: None, + message: if internal.message.is_empty() { + "build failed".to_string() + } else { + internal.message.clone() + }, + }) + } else { + None + }; + + Ok(V3TemplateBuildInfo { + build_id: internal.build_id, + template_id: internal.template_id, + status, + logs, + log_entries, + reason, + }) + } + + fn issue_registry_credential(&self, template_id: &str) -> RegistryCredential { + let upstream = self.config.registry_upstream.as_deref().unwrap_or(""); + let url = if upstream.starts_with("http://") || upstream.starts_with("https://") { + base_url(upstream) + } else if let Some(host) = self.config.registry_public_host.clone() { + format!("https://{}", host) + } else { + "http://localhost".to_string() + }; + let repo_prefix = if self.config.registry_repo_prefix.trim().is_empty() { + "e2b" + } else { + self.config.registry_repo_prefix.trim() + }; + RegistryCredential { + url, + repository: format!("{}/{}", repo_prefix, template_id), + username: "_token".to_string(), + password: self + .config + .registry_token + .clone() + .unwrap_or_else(|| "_anon".to_string()), + } + } +} + +// ─── helpers ─────────────────────────────────────────────────────────────── + +/// Build the CubeMaster `Probe` from the user's intent. 
+/// +/// **Important — limitations imposed by the downstream stack**: +/// +/// - Cubelet (`Cubelet/services/cubebox/check.go::checkProbe`) only accepts +/// `TcpSocket | Ping | HttpGet` handlers. Anything else is rejected with +/// `invalid probe.probe_handler param`. +/// - CubeMaster's `handleProbeHandler` (in `pkg/service/sandbox/util.go`) +/// similarly has no Exec branch — passing one yields an empty handler +/// object, which Cubelet then rejects. +/// +/// As a result the E2B-style `readyCmd` (a shell snippet) **cannot** be +/// forwarded verbatim as a CubeMaster probe. We only synthesise a probe from +/// `probe_port` or the first `exposed_ports` entry. `v3_trigger_build` may +/// pre-fill `probe_port`/`probe_path` from a loopback URL found in `readyCmd`; +/// the command itself is never sent to CubeMaster. +fn build_probe(body: &CreateTemplateRequest) -> Option { + let port = body + .probe_port + .or_else(|| body.exposed_ports.as_ref().and_then(|p| p.first().copied()))?; + + Some(Probe { + probe_handler: ProbeHandler { + http_get: Some(HttpGetAction { + path: body + .probe_path + .clone() + .unwrap_or_else(|| "/health".to_string()), + port, + host: None, + scheme: None, + }), + exec: None, + }, + timeout_ms: Some(30_000), + period_ms: Some(500), + success_threshold: Some(1), + failure_threshold: Some(60), + }) +} + +/// Best-effort parser for the SDK's `wait_for_url(...)` ready command. +/// +/// The E2B SDK ultimately sends the ready check as a free-form shell snippet +/// in `readyCmd`, e.g. +/// +/// * `wait_for_url("http://localhost:49999/health")` +/// * `curl -fsS http://127.0.0.1:8080/ready` +/// * `until curl -fsS http://0.0.0.0:3000; do sleep 1; done` +/// +/// Any of these collapses to "HTTP GET on `<port>/<path>` of the sandbox" once +/// you discard the surrounding shell. We extract `(port, path)` from the +/// first `http(s)://<host>:<port>[/path]` substring, and only when its host +/// is a localhost alias, so we never accidentally point the probe at an +/// off-VM service. 
+/// +/// Returns `None` when no recognisable URL is present — callers fall back to +/// `probe_port` / `exposedPorts` or skip the probe entirely. +fn parse_ready_url(ready_cmd: &str) -> Option<(u16, String)> { + // Iterate over each `http(s)://` occurrence; the first parseable one + // wins. We bound the scanning at 64 to keep this cheap. + let mut search = ready_cmd; + for _ in 0..64 { + let scheme_idx = search.find("http")?; + let after_http = &search[scheme_idx..]; + let rest = after_http + .strip_prefix("https://") + .or_else(|| after_http.strip_prefix("http://")); + let rest = match rest { + Some(r) => r, + None => { + // Found "http" but not as a scheme — advance one char and + // try again. + let next = scheme_idx + 1; + if next >= search.len() { + return None; + } + search = &search[next..]; + continue; + } + }; + + // `rest` now points at `[:][/path...][?query]...` followed + // by whatever shell tokens come next (space, `"`, `'`, `)`, `;`, ...). + let end = rest + .find(|c: char| { + matches!( + c, + ' ' | '\t' | '\n' | '"' | '\'' | ')' | ';' | '|' | '&' | '`' | '<' | '>' + ) + }) + .unwrap_or(rest.len()); + let url_body = &rest[..end]; + + // Split host[:port] / path[?query] + let (authority, path_with_query) = match url_body.find('/') { + Some(i) => (&url_body[..i], &url_body[i..]), + None => (url_body, ""), + }; + + // Drop ?query — probes don't carry it. + let path = match path_with_query.find('?') { + Some(i) => &path_with_query[..i], + None => path_with_query, + }; + + // Authority must contain an explicit port and resolve to a localhost + // alias — otherwise we refuse to invent a probe target. 
+ let (host, port_str) = authority.rsplit_once(':')?; + if !is_localhost_alias(host) { + return None; + } + let port: u16 = port_str.parse().ok()?; + if port == 0 { + return None; + } + + let path = if path.is_empty() { + "/".to_string() + } else { + path.to_string() + }; + return Some((port, path)); + } + None +} + +/// `wait_for_url` only makes sense when pointed at the sandbox itself, so we +/// limit the host whitelist to the well-known loopback aliases. Anything else +/// is almost certainly a misconfiguration we'd rather surface than silently +/// translate into a probe. +fn is_localhost_alias(host: &str) -> bool { + matches!( + host, + "localhost" | "127.0.0.1" | "0.0.0.0" | "::1" | "[::1]" + ) +} + +fn build_resources(body: &CreateTemplateRequest) -> Option { + // E2B `cpuCount` (cores) → `cpu * 1000` millicores; legacy `cpu` already + // in millicores wins when both are set. + let cpu_millicores = body.cpu.or_else(|| body.cpu_count.map(|n| n * 1000)); + let mem_mb = body.memory.or(body.memory_mb); + + if cpu_millicores.is_none() && mem_mb.is_none() { + return None; + } + + Some(CreateTemplateResources { + cpu: cpu_millicores.map(|v| format!("{}m", v)), + mem: mem_mb.map(|v| format!("{}Mi", v)), + }) +} + +fn merge_envs(body: &CreateTemplateRequest) -> Option> { + let mut out: HashMap = HashMap::new(); + + if let Some(envs) = &body.env { + for s in envs { + let mut parts = s.splitn(2, '='); + if let Some(k) = parts.next() { + let k = k.trim().to_string(); + if k.is_empty() { + continue; + } + let v = parts.next().unwrap_or("").to_string(); + out.insert(k, v); + } + } + } + if let Some(map) = &body.env_vars { + for (k, v) in map { + out.insert(k.clone(), v.clone()); + } + } + + if out.is_empty() { + None + } else { + Some( + out.into_iter() + .map(|(key, value)| CreateTemplateEnv { key, value }) + .collect(), + ) } } @@ -261,23 +1158,21 @@ fn string_or(value: String, fallback: &str) -> String { } } -fn build_log_line(status: &str, progress: i32, message: 
&str) -> String { - if message.is_empty() { - format!("[{}] progress={}%", status, progress) - } else { - format!("[{}] {}", status, message) - } -} - -fn to_job(resp: TemplateJobResponse) -> TemplateBuildJob { +fn to_job(resp: TemplateJobResponse, build_id_override: Option) -> TemplateBuildJob { let job = resp.job.unwrap_or_else(default_template_job); + let build_id = build_id_override + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| job.job_id.clone()); TemplateBuildJob { job_id: job.job_id, template_id: job.template_id, + build_id, status: job.status, phase: job.phase, progress: job.progress, error_message: job.error_message, + upload_url: None, + registry: None, } } @@ -336,122 +1231,124 @@ fn validate_dns_servers(servers: Option<&[String]>) -> AppResult Option { - body.probe_port - .or_else(|| body.exposed_ports.as_ref().and_then(|p| p.first().copied())) - .map(|port| Probe { - probe_handler: ProbeHandler { - http_get: Some(HttpGetAction { - path: body - .probe_path - .clone() - .unwrap_or_else(|| "/health".to_string()), - port, - host: None, - scheme: None, - }), - exec: None, - }, - timeout_ms: Some(30000), - period_ms: Some(500), - success_threshold: Some(1), - failure_threshold: Some(60), - }) +/// Translate CubeMaster-internal phase strings into E2B-style status tokens. +fn remap_cubemaster_status(raw: &str) -> String { + match raw.trim().to_lowercase().as_str() { + "" => "pending".to_string(), + "ready" | "succeeded" | "success" | "completed" | "complete" => "ready".to_string(), + "failed" | "error" | "errored" => "error".to_string(), + // CubeMaster intermediate phases — bucket all of them into "building" + // to match what the E2B CLI expects. 
+ "pending" | "queued" | "running" | "pulling" | "extracting" | "rootfs" + | "snapshotting" | "distributing" | "uploading" | "ready_pending" => "building".to_string(), + other => other.to_string(), + } } -fn build_template_resources(body: &CreateTemplateRequest) -> Option { - if body.cpu.is_none() && body.memory.is_none() { - return None; +fn host_from_url(url: &str) -> Option { + // Best-effort URL parse without pulling in a new crate. + let after_scheme = url + .splitn(2, "://") + .nth(1) + .or_else(|| Some(url)) + .unwrap_or(url); + let host = after_scheme.split('/').next().unwrap_or(""); + if host.is_empty() { + None + } else { + Some(host.to_string()) } - Some(CreateTemplateResources { - cpu: body.cpu.map(|v| format!("{v}m")), - mem: body.memory.map(|v| format!("{v}Mi")), - }) } -fn build_template_envs(body: &CreateTemplateRequest) -> Option> { - body.env - .as_ref() - .map(|envs| { - envs.iter() - .filter_map(|s| { - let mut parts = s.splitn(2, '='); - let key = parts.next()?.trim().to_string(); - let value = parts.next().unwrap_or("").to_string(); - if key.is_empty() { - None - } else { - Some(CreateTemplateEnv { key, value }) - } - }) - .collect::>() - }) - .filter(|envs| !envs.is_empty()) +/// Hash `name` into a stable templateID so repeated `Template.build()` calls +/// against the same name reuse the same ID. We use the first 16 hex chars of +/// a v5 UUID derived from the DNS namespace + name. 
+fn stable_template_id(name: &str) -> String { + let ns = uuid::Uuid::NAMESPACE_DNS; + let id = uuid::Uuid::new_v5(&ns, name.as_bytes()); + let simple = id.simple().to_string(); + format!("tpl-{}", &simple[..16]) } -fn build_template_container_overrides( - body: &CreateTemplateRequest, - dns_servers: Option<&[String]>, -) -> Option { - let command = non_empty_vec(body.command.clone()); - let args = non_empty_vec(body.args.clone()); - let probe = build_template_probe(body); - let resources = build_template_resources(body); - let envs = build_template_envs(body); - let dns_config = dns_servers.map(|servers| DnsConfig { - servers: servers.to_vec(), - searches: Vec::new(), - }); - - if command.is_none() - && args.is_none() - && probe.is_none() - && resources.is_none() - && envs.is_none() - && dns_config.is_none() - { - return None; +fn base_url(url: &str) -> String { + if let Some(rest) = url.strip_prefix("http://") { + let host = rest.split('/').next().unwrap_or(""); + format!("http://{}", host) + } else if let Some(rest) = url.strip_prefix("https://") { + let host = rest.split('/').next().unwrap_or(""); + format!("https://{}", host) + } else { + url.to_string() } - - Some(CreateTemplateContainerOverrides { - command, - args, - probe, - resources, - envs, - dns_config, - }) } -fn build_template_cubevs_context(body: &CreateTemplateRequest) -> Option { - let allow_out = body.allow_out.clone().unwrap_or_default(); - let deny_out = body.deny_out.clone().unwrap_or_default(); - if body.allow_internet_access.is_none() && allow_out.is_empty() && deny_out.is_empty() { - return None; +// Adapter helper used inside dashmap update closures. 
+impl crate::services::builds::BuildContext { + pub(crate) fn append_log_inline(&mut self, line: impl Into) { + self.logs.push(crate::services::builds::BuildLogLine { + timestamp: chrono::Utc::now(), + line: line.into(), + }); } - Some(CreateTemplateCubeVSContext { - allow_internet_access: body.allow_internet_access, - allow_out, - deny_out, - }) } #[cfg(test)] mod tests { use super::*; + use crate::models::CreateTemplateRequest; + fn empty_request() -> CreateTemplateRequest { + CreateTemplateRequest { + template_id: String::new(), + instance_type: None, + alias: None, + team_id: None, + image: None, + dockerfile: None, + writable_layer_size: None, + exposed_ports: None, + probe_port: None, + probe_path: None, + cpu: None, + memory: None, + cpu_count: None, + memory_mb: None, + env: None, + env_vars: None, + allow_internet_access: None, + network_type: None, + nodes: None, + registry_username: None, + registry_password: None, + command: None, + args: None, + dns: None, + allow_out: None, + deny_out: None, + start_cmd: None, + ready_cmd: None, + } + } + + #[allow(dead_code)] fn sample_request() -> CreateTemplateRequest { CreateTemplateRequest { template_id: String::new(), instance_type: Some("cubebox".to_string()), - image: "python:3.11-slim".to_string(), + alias: None, + team_id: None, + image: Some("python:3.11-slim".to_string()), + dockerfile: None, writable_layer_size: Some("1G".to_string()), exposed_ports: Some(vec![8080]), probe_port: Some(8080), probe_path: Some("/health".to_string()), cpu: Some(2000), memory: Some(2048), + cpu_count: None, + memory_mb: None, env: Some(vec!["A=1".to_string()]), + env_vars: None, allow_internet_access: Some(true), network_type: Some("tap".to_string()), nodes: Some(vec!["node-1".to_string()]), @@ -462,41 +1359,249 @@ mod tests { dns: Some(vec!["8.8.8.8".to_string(), "1.1.1.1".to_string()]), allow_out: Some(vec!["172.67.0.0/16".to_string()]), deny_out: Some(vec!["10.0.0.0/8".to_string()]), + start_cmd: None, + ready_cmd: None, } 
} #[test] - fn build_template_container_overrides_maps_cli_fields() { - let body = sample_request(); - let overrides = build_template_container_overrides(&body, Some(&["8.8.8.8".to_string()])) - .expect("overrides"); + fn validate_dns_servers_rejects_invalid_ip() { + let err = validate_dns_servers(Some(&["not-an-ip".to_string()])).unwrap_err(); + assert!(matches!(err, AppError::BadRequest(_))); + } + + #[test] + fn build_resources_maps_e2b_cpu_count_to_millicores() { + let mut req = empty_request(); + req.cpu_count = Some(2); + req.memory_mb = Some(4096); + let r = build_resources(&req).expect("resources should be present"); + assert_eq!(r.cpu.as_deref(), Some("2000m")); + assert_eq!(r.mem.as_deref(), Some("4096Mi")); + } + + #[test] + fn build_resources_prefers_legacy_fields_when_both_supplied() { + let mut req = empty_request(); + req.cpu = Some(500); // millicores + req.cpu_count = Some(8); + req.memory = Some(512); + req.memory_mb = Some(8192); + let r = build_resources(&req).expect("resources should be present"); + assert_eq!(r.cpu.as_deref(), Some("500m")); + assert_eq!(r.mem.as_deref(), Some("512Mi")); + } + + #[test] + fn merge_envs_overrides_kv_strings_with_envvars_map() { + let mut req = empty_request(); + req.env = Some(vec!["FOO=bar".to_string(), "EMPTY=".to_string()]); + req.env_vars = Some({ + let mut m = HashMap::new(); + m.insert("FOO".to_string(), "baz".to_string()); // wins + m.insert("EXTRA".to_string(), "yes".to_string()); + m + }); + let mut envs = merge_envs(&req).expect("envs should be present"); + envs.sort_by(|a, b| a.key.cmp(&b.key)); + assert_eq!(envs.len(), 3); + let foo = envs.iter().find(|e| e.key == "FOO").unwrap(); + assert_eq!(foo.value, "baz"); + } + #[test] + fn build_probe_picks_http_get_when_port_provided() { + let mut req = empty_request(); + req.probe_port = Some(8080); + req.probe_path = Some("/healthz".to_string()); + let probe = build_probe(&req).expect("probe should be present"); + 
assert!(probe.probe_handler.http_get.is_some()); + let http = probe.probe_handler.http_get.unwrap(); + assert_eq!(http.port, 8080); + assert_eq!(http.path, "/healthz"); + } + + /// Regression: previously we synthesised an Exec probe from + /// `readyCmd`, but neither CubeMaster nor Cubelet support Exec probes + /// (`invalid probe.probe_handler param`). The fix is to **not** emit a + /// probe at all when the caller hasn't provided a port — Cubelet treats + /// nil probes as "no readiness check", which is the right thing to do. + #[test] + fn build_probe_returns_none_when_only_ready_cmd_is_provided() { + let mut req = empty_request(); + req.ready_cmd = Some("curl -fsS localhost:1234/ok".to_string()); + // No probe_port, no exposed_ports → no probe. + assert!(build_probe(&req).is_none()); + } + + /// Regression: when the caller provides exposedPorts but no explicit + /// probe_port, we still want an HttpGet probe on the first exposed port + /// (matches our previous behaviour and keeps templates that listed ports + /// working out of the box). 
+ #[test] + fn build_probe_picks_http_get_from_first_exposed_port() { + let mut req = empty_request(); + req.exposed_ports = Some(vec![3000, 8080]); + let probe = build_probe(&req).expect("probe should be present"); + let http = probe.probe_handler.http_get.expect("http probe"); + assert_eq!(http.port, 3000); + assert!(probe.probe_handler.exec.is_none()); + } + + #[test] + fn parse_ready_url_extracts_port_and_path_from_localhost_url() { assert_eq!( - overrides.command, - Some(vec!["/bin/sh".to_string(), "-c".to_string()]) + parse_ready_url("wait_for_url(\"http://localhost:49999/health\")"), + Some((49999, "/health".to_string())) ); - assert_eq!(overrides.args, Some(vec!["sleep infinity".to_string()])); + } + + #[test] + fn parse_ready_url_handles_curl_with_127_0_0_1_and_query_string() { assert_eq!( - overrides.dns_config.as_ref().map(|d| d.servers.clone()), - Some(vec!["8.8.8.8".to_string()]) + parse_ready_url("curl -fsS http://127.0.0.1:8080/ready?retries=3 || exit 1"), + Some((8080, "/ready".to_string())) ); - assert!(overrides.probe.is_some()); - assert!(overrides.resources.is_some()); - assert_eq!(overrides.envs.as_ref().map(|envs| envs.len()), Some(1)); } #[test] - fn build_template_cubevs_context_includes_egress_rules() { - let body = sample_request(); - let ctx = build_template_cubevs_context(&body).expect("cubevs"); - assert_eq!(ctx.allow_internet_access, Some(true)); - assert_eq!(ctx.allow_out, vec!["172.67.0.0/16".to_string()]); - assert_eq!(ctx.deny_out, vec!["10.0.0.0/8".to_string()]); + fn parse_ready_url_defaults_path_to_root_when_omitted() { + assert_eq!( + parse_ready_url("until nc -z 0.0.0.0:3000; do sleep 0.2; done; \ + curl http://0.0.0.0:3000"), + Some((3000, "/".to_string())) + ); } #[test] - fn validate_dns_servers_rejects_invalid_ip() { - let err = validate_dns_servers(Some(&["not-an-ip".to_string()])).unwrap_err(); - assert!(matches!(err, AppError::BadRequest(_))); + fn parse_ready_url_rejects_non_loopback_hosts() { + // We must not silently 
rewrite a probe to point at an external + // service — that would generate noisy traffic and probably never + // succeed against the sandbox itself. + assert_eq!( + parse_ready_url("curl http://api.example.com:443/healthz"), + None + ); + } + + #[test] + fn parse_ready_url_returns_none_when_no_url_is_present() { + assert_eq!(parse_ready_url("/usr/local/bin/wait-for-it.sh --quiet"), None); + assert_eq!(parse_ready_url(""), None); + assert_eq!(parse_ready_url("curl localhost:1234"), None); // missing http:// + } + + #[test] + fn parse_ready_url_requires_explicit_port() { + // Probes must target a specific port — defaulting to 80/443 here + // would mask real misconfigurations. + assert_eq!(parse_ready_url("curl http://localhost/health"), None); + } + + #[test] + fn parse_ready_url_rejects_zero_port() { + assert_eq!(parse_ready_url("curl http://127.0.0.1:0/"), None); + } + + #[test] + fn host_from_url_extracts_host_with_port() { + assert_eq!(host_from_url("http://10.0.0.1:5000"), Some("10.0.0.1:5000".to_string())); + assert_eq!( + host_from_url("https://registry.example.com/path"), + Some("registry.example.com".to_string()) + ); + } + + #[test] + fn base_url_strips_path_keeps_scheme() { + assert_eq!(base_url("http://10.0.0.1:5000/v2/"), "http://10.0.0.1:5000"); + assert_eq!( + base_url("https://reg.example.com/foo/bar"), + "https://reg.example.com" + ); + } + + #[test] + fn remap_cubemaster_status_normalizes_phases_to_e2b_tokens() { + assert_eq!(remap_cubemaster_status(""), "pending"); + assert_eq!(remap_cubemaster_status("Ready"), "ready"); + assert_eq!(remap_cubemaster_status("succeeded"), "ready"); + assert_eq!(remap_cubemaster_status("Failed"), "error"); + assert_eq!(remap_cubemaster_status("PULLING"), "building"); + assert_eq!(remap_cubemaster_status("distributing"), "building"); + assert_eq!(remap_cubemaster_status("custom_phase"), "custom_phase"); + } + + fn make_service(registry_upstream: Option) -> TemplateService { + let mut cfg = ServerConfig::default(); + 
cfg.registry_upstream = registry_upstream; + cfg.registry_public_host = Some("cube.example.com".to_string()); + cfg.registry_repo_prefix = "e2b".to_string(); + let http = reqwest::Client::new(); + let cm = CubeMasterClient::new("http://127.0.0.1:9", http); + TemplateService::new(cm, "cubebox".to_string(), BuildRegistry::new(), cfg) + } + + #[tokio::test] + async fn create_template_e2b_mode_rejects_when_registry_disabled() { + let svc = make_service(None); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let err = svc.create_template(req).await.expect_err("should fail"); + assert!(matches!(err, AppError::NotImplemented(_))); + } + + #[tokio::test] + async fn create_template_e2b_mode_returns_push_credential_and_registers_build() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu\nCMD echo hi".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + // Build identity is well-formed and emitted in both legacy & E2B fields. + assert!(!job.template_id.is_empty()); + assert!(job.template_id.starts_with("tpl-")); + assert!(job.build_id.starts_with("bld-")); + assert_eq!(job.status, "accepted"); + assert_eq!(job.phase, "waiting"); + + // Push credential points at the configured public host. + let cred = job.registry.expect("registry credential"); + assert_eq!(cred.url, "http://127.0.0.1:5000"); + assert!(cred.repository.starts_with("e2b/tpl-")); + assert_eq!(cred.username, "_token"); + + // Internal BuildRegistry now knows about this build and stores the + // image_ref CubeMaster will later pull from. 
+ let ctx = svc + .builds + .get(&job.build_id) + .expect("build context should be registered"); + assert!(ctx.image_ref.starts_with("127.0.0.1:5000/e2b/")); + assert!(ctx.image_ref.ends_with(&format!(":{}", job.build_id))); + } + + /// Regression: CubeMaster validates `writable_layer_size` as required and + /// the E2B V3 SDK never sends it. Verify the service injects the + /// configured default so the request reaches CubeMaster non-empty. + #[test] + fn build_cubemaster_request_fills_default_writable_layer_size() { + let svc = make_service(None); + let req = empty_request(); + let cm_req = svc.build_cubemaster_request(&req, "image:tag".to_string()); + assert_eq!(cm_req.writable_layer_size.as_deref(), Some("1G")); + } + + #[test] + fn build_cubemaster_request_preserves_caller_writable_layer_size() { + let svc = make_service(None); + let mut req = empty_request(); + req.writable_layer_size = Some("4G".to_string()); + let cm_req = svc.build_cubemaster_request(&req, "image:tag".to_string()); + assert_eq!(cm_req.writable_layer_size.as_deref(), Some("4G")); } } + diff --git a/deploy/one-click/scripts/one-click/up.sh b/deploy/one-click/scripts/one-click/up.sh index ef6b8d38c..0d9a6e583 100755 --- a/deploy/one-click/scripts/one-click/up.sh +++ b/deploy/one-click/scripts/one-click/up.sh @@ -85,7 +85,7 @@ start_with_pidfile \ start_with_pidfile \ "cube-api" \ - "export LOG_DIR=\"${CUBE_API_LOG_DIR}\" CUBE_API_BIND=\"${CUBE_API_BIND:-0.0.0.0:3000}\" CUBE_API_SANDBOX_DOMAIN=\"${CUBE_API_SANDBOX_DOMAIN:-cube.app}\"; ${CUBE_API_OPTIONAL_EXPORTS}\"${CUBE_API_BIN}\"" + "export LOG_DIR=\"${CUBE_API_LOG_DIR}\" CUBE_API_BIND=\"${CUBE_API_BIND:-0.0.0.0:3000}\" CUBE_API_SANDBOX_DOMAIN=\"${CUBE_API_SANDBOX_DOMAIN:-cube.app}\"; ${CUBE_API_OPTIONAL_EXPORTS}\"${CUBE_API_BIN}\" --registry-upstream http://127.0.0.1:5000 --registry-public-host cube.app --registry-pull-host 127.0.0.1:5000 --registry-repo-prefix e2b" start_with_pidfile \ "cubelet" \ diff --git a/docs/.vitepress/config.mjs 
b/docs/.vitepress/config.mjs index 0c4554694..e5283bc17 100644 --- a/docs/.vitepress/config.mjs +++ b/docs/.vitepress/config.mjs @@ -127,6 +127,7 @@ export default withMermaid(defineConfig({ { text: 'Tutorials', items: [ + { text: 'Create Templates with the e2b SDK', link: '/guide/tutorials/template-from-e2b-sdk' }, { text: 'Create Templates from OCI Image', link: '/guide/tutorials/template-from-image' }, { text: 'Examples', link: '/guide/tutorials/examples' }, { text: 'Custom Image', link: '/guide/tutorials/bring-your-own-image' } @@ -229,6 +230,7 @@ export default withMermaid(defineConfig({ { text: '场景教程', items: [ + { text: '使用 e2b SDK 创建模板', link: '/zh/guide/tutorials/template-from-e2b-sdk' }, { text: '从 OCI 镜像制作模板', link: '/zh/guide/tutorials/template-from-image' }, { text: '示例项目', link: '/zh/guide/tutorials/examples' }, { text: '自定义镜像', link: '/zh/guide/tutorials/bring-your-own-image' } diff --git a/docs/guide/tutorials/template-from-e2b-sdk.md b/docs/guide/tutorials/template-from-e2b-sdk.md new file mode 100644 index 000000000..f70983f8b --- /dev/null +++ b/docs/guide/tutorials/template-from-e2b-sdk.md @@ -0,0 +1,395 @@ +--- +title: Create Templates with the e2b SDK +lang: en-US +description: End-to-end practical guide for building CubeSandbox templates with the e2b Python / JS SDK — V3 protocol contract, OCI Registry reverse proxy, wait_for_url probe bridging, deployment configuration, and troubleshooting. +--- + +# Create Templates with the e2b SDK + +CubeSandbox is wire-compatible with the [e2b](https://e2b.dev/) **V3 template and sandbox protocol**. Starting from a ready-made e2b-style image, this page walks through how to use the official e2b Python / JS SDK to **register → build → run** a template on a CubeSandbox cluster, plus the technical reference and best practices that go with it. + +> Available in **CubeSandbox v0.2.3+**. 
+> +> - For the `cubemastercli` workflow, see [Create Templates from OCI Image](./template-from-image.md); +> - For adding envd to an existing image first, see [Bring Your Own Image](./bring-your-own-image.md). + +--- + +## 1. Overall architecture + +How the e2b SDK client, CubeAPI, CubeMaster, and the bundled OCI Registry cooperate: + +```mermaid +flowchart LR + subgraph Client[e2b client] + SDK[Python / JS SDK
Template.build] + CLI[e2b CLI
docker push] + end + + subgraph Edge[CubeAPI edge] + V3[V3 template routes
/v3/templates
/templates/.../files/
/v2/.../builds/
.../status] + REG[OCI proxy
/v2/*] + Reg[(OCI Registry
distribution/distribution
:5000)] + end + + subgraph CP[Control plane] + Master[CubeMaster] + Cubelet[cubelet] + end + + SDK -->|HTTPS| V3 + CLI -->|docker push| REG + REG --> Reg + V3 --> Master + Master -->|RunSandbox + AppSnapshot| Cubelet + Cubelet -->|doProbe HttpGet| Cubelet +``` + +Key points: + +1. **CubeAPI** acts as the e2b V3 protocol edge, translating V3 calls into CubeMaster's internal `CreateTemplateFromImage` / build-job semantics. +2. **OCI Registry** is an independent sidecar (default `distribution/distribution` on `127.0.0.1:5000`); CubeAPI exposes `/v2/*` as a verbatim reverse proxy for `docker push`. +3. Once **CubeMaster + cubelet** see a `<pull-host>/<repo-prefix>/<templateID>:<buildID>` reference, the rest of the pipeline (OCI image → ext4 rootfs → temporary sandbox → probe → snapshot → register) is the same as any other build path. + +--- + +## 2. Quick start + +> Prerequisite: you already have an image **with envd (49983)** built per [Bring Your Own Image](./bring-your-own-image.md) and pushed to a registry the cluster can reach (the `from_image` reference below). 
+ +### 2.1 Install the SDK and configure the environment + +```bash +pip install e2b python-dotenv +``` + +Drop CubeAPI's endpoint and your API key into a project-root `.env`: + +```dotenv +E2B_API_KEY=your-cube-api-key # any value if CubeAPI auth is disabled +E2B_DOMAIN=cube.example.com # CubeAPI ingress (no scheme) +``` + +### 2.2 Define the template + +```python +# build_template.py + +from dotenv import load_dotenv +from e2b import Template, default_build_logger, wait_for_url + +load_dotenv() + +if __name__ == '__main__': + template = ( + Template() + .from_image("cube-sandbox-cn.tencentcloudcr.com/cube-sandbox/sandbox-code:latest") # ← or substitute your own image + .set_start_cmd( + "sudo /root/.jupyter/start-up.sh", + wait_for_url("http://localhost:49999/health") # <- bridged into the readiness probe + ) + ) + Template.build( + template, + 'template-tag-code', + cpu_count=1, + memory_mb=1024, + on_build_logs=default_build_logger(), + ) +``` + +### 2.3 Build + use + +```bash +python build_template.py +# Once "[7/7] READY" prints, you can create sandboxes +``` + +```python +# use_sandbox.py +from e2b import Sandbox + +sbx = Sandbox(template="template-tag-code", timeout=120) +print(sbx.run_code("print('hello from cube sandbox')").text) +sbx.kill() +``` + +In the happy path the **first `run_code` works immediately — no `time.sleep` needed**. As long as `wait_for_url` blocked the build until the user process was actually ready, the snapshot already captures that ready state. + +--- + +## 3. 
Technical reference + +### 3.1 V3 protocol endpoint contract + +CubeAPI exposes the four V3 endpoints the e2b SDK speaks: + +| # | Method + path | Handler | Purpose | +|---|---|---|---| +| ① | `POST /v3/templates` | `templates_v3::v3_create_template` | Register a template + allocate the first build attempt; returns `{templateID, buildID, names, aliases, tags, public}` | +| ② | `GET /templates/{tid}/files/{hash}` | `templates_v3::v3_get_files_hash` | Cache probe before SDK uploads a build context tarball; CubeAPI always answers `present=true` so the SDK skips upload (the V3 flow currently consumes only `from_image`) | +| ③ | `POST /v2/templates/{tid}/builds/{bid}` | `templates_v3::v2_trigger_build` | Actually triggers the build: resolves `from_image` / `from_template` / a previously-pushed image and dispatches a `CreateTemplateFromImageReq` to CubeMaster | +| ④ | `GET /templates/{tid}/builds/{bid}/status` | `templates_v3::v3_get_build_status` | Polls build status; returns the strict `{buildID, templateID, status, logs[], logEntries[], reason?}` envelope the SDK expects | + +End-to-end SDK call timeline: + +```mermaid +sequenceDiagram + participant SDK as e2b SDK + participant CLI as e2b CLI / docker + participant API as CubeAPI + participant Reg as OCI Registry + participant Master as CubeMaster + participant Cubelet as cubelet + + SDK->>API: POST /v3/templates {name, cpuCount, memoryMB} + API-->>SDK: 202 {templateID, buildID, ...} + + Note over SDK,Reg: Push only happens for Dockerfile builds;
pure from_image flow skips ②③ and goes straight to ④ + SDK->>API: GET /templates/{tid}/files/{hash} + API-->>SDK: 201 {present:true} + CLI->>API: PUT /v2/{repo}/manifests/{tag} + API->>Reg: forward + Reg-->>API: 201 Created + API->>API: mark_image_pushed(bid) + API-->>CLI: 201 Created + + SDK->>API: POST /v2/templates/{tid}/builds/{bid}
{fromImage, startCmd, readyCmd, ...} + API->>API: parse_ready_url → probe_port/path + API->>Master: CreateTemplateFromImage + Probe.HttpGet + API-->>SDK: 202 Accepted + + loop poll every N seconds + SDK->>API: GET /.../builds/{bid}/status?logsOffset=K + API->>Master: get_template_build_status + API-->>SDK: 200 {status, logs[], reason?} + end + + Master->>Cubelet: AppSnapshot(req with Probe) + Cubelet->>Cubelet: doProbe blocks until user process is ready + Cubelet-->>Master: snapshot captures ready state + Master-->>API: build READY + API-->>SDK: status="ready" +``` + +### 3.2 OCI Registry reverse proxy + +CubeAPI exposes `/v2/*` as a verbatim reverse proxy that forwards e2b CLI / docker push traffic to an upstream OCI Registry. Notable design points: + +| Behaviour | Notes | +|---|---| +| **Bypasses `unified_auth`** | docker push uses the registry's own Basic / Bearer credentials, which are in a separate trust domain from CubeAPI's `Authorization: Bearer <api-key>`; therefore `/v2/*` does not run through `unified_auth`. | +| **240 s timeout** | A single layer-blob PUT can take minutes, so `/v2/*` lives on its own 240 s `TimeoutLayer`, separate from the default 30 s router (see `routes.rs::SNAPSHOT_LONG_ROUTE_TIMEOUT`). | +| **Hop-by-hop header stripping** | Per RFC 7230 §6.1, `connection` / `keep-alive` / `transfer-encoding` etc. are stripped in both directions to keep HTTP/1.1 implementations on either end happy. | +| **`mark_image_pushed` hook** | When `PUT /v2/{repo}/manifests/{tag}` succeeds, CubeAPI uses `{tag}` as the `buildID` and moves the matching BuildContext to the `Building` stage so the subsequent trigger-build call can dispatch immediately. | +| **Graceful degradation** | If `registry_upstream` is unset, every `/v2/*` request returns 503 `registry_disabled`; pure `from_image` flows still work in this deployment shape. 
| + +The default deployment **enables** this stack out of the box (`deploy/one-click/scripts/one-click/up.sh`) by starting CubeAPI with the flags shown below. + +If you do not have a registry yet, you can start one quickly with `docker run -d -p 5000:5000 --restart always --name registry registry:3`. + +```bash +cube-api \ + --registry-upstream http://127.0.0.1:5000 \ + --registry-public-host cube.app \ + --registry-pull-host 127.0.0.1:5000 \ + --registry-repo-prefix e2b +``` + +See [Section 4 — Deployment Configuration](#_4-deployment-configuration) for details. + +### 3.3 `wait_for_url` and the readiness probe + +`wait_for_url(...)` is the key to the "create-and-immediately-use" property of templates. Semantically: **during template build**, wait for the URL to return 2xx **before** snapshotting — every sandbox restored from such a template comes back with the user process already serving traffic, so `sbx.run_code(...)` works immediately. + +#### How the bridging works + +The e2b SDK serialises `wait_for_url(...)` into a shell-form `readyCmd` (ultimately `curl ...`). CubeAPI does **not** run the shell — instead, in `services/templates.rs::v3_trigger_build` it does a lightweight parse: + +1. Find an `http(s)://<host>:<port>[/<path>]` URL inside `readyCmd`; +2. Require `host` to be a loopback alias (`localhost` / `127.0.0.1` / `0.0.0.0` / `::1` / `[::1]`) — never invent a probe target pointing at the public internet; +3. Require an explicit, non-zero port; +4. On success, populate `probe_port` / `probe_path`, which `build_probe()` turns into a `Probe.HttpGet` and forwards to CubeMaster; +5. Cubelet **blocks** on this probe (`doProbe`) after container creation, only committing the snapshot once it returns 2xx. + +The whole bridging is transparent — no extra SDK-side configuration needed. 
+ +#### Parsing rules at a glance + +| `readyCmd` input | Parsed result | Notes | +|---|---|---| +| `wait_for_url("http://localhost:49999/health")` | `(49999, "/health")` | Canonical form | +| curl -fsS http://127.0.0.1:8080/ready?retries=3 \|\| exit 1 | `(8080, "/ready")` | Query string is stripped | +| `until nc -z 0.0.0.0:3000; do sleep 0.2; done; curl http://0.0.0.0:3000` | `(3000, "/")` | Path defaults to `/` when omitted | +| `curl http://api.example.com:443/healthz` | ❌ `None` | Non-loopback hosts rejected | +| `curl http://localhost/health` | ❌ `None` | Port must be explicit | +| `curl http://127.0.0.1:0/` | ❌ `None` | Port must be > 0 | +| `/usr/local/bin/wait-for-it.sh --quiet` | ❌ `None` | No recognisable URL | + +#### Three-tier source priority + +`probe_port` is resolved in this order: + +1. **Caller override** — `probePort` / `probePath` in the V3 request body; +2. **`readyCmd` parsing** — auto-extracted from `wait_for_url(...)` / `curl ...`; +3. **`exposedPorts[0]` + `/health`** — last-resort fallback (preserves legacy behaviour). + +If any tier fires, `Probe.HttpGet` is generated. If all three are empty, **no probe is emitted** — sandbox creation returns the moment `Create` completes (today's behaviour); still works, but users may need a `time.sleep`. + +#### Probe parameters (cubelet defaults) + +| Field | Default | Meaning | +|---|---|---| +| `timeout_ms` | 30 000 | Total budget for the probe loop (30 s) | +| `period_ms` | 500 | Probe every 500 ms | +| `success_threshold` | 1 | First 2xx wins | +| `failure_threshold` | 60 | Up to 60 failures (~30 s) before giving up | + +> If your user process needs more than 30 s to come up (rare), use `cubemastercli`'s explicit override path, or follow up with a CubeAPI extension that surfaces `probeTimeoutMs`. 
+ +### 3.4 Build state machine + +CubeAPI keeps an in-memory `BuildRegistry` tracking every `(templateID, buildID)` lifecycle (`services/builds.rs`): + +``` +WaitingPush ──manifest PUT succeeds──► Building ──CubeMaster job terminal──► Ready / Error +``` + +| Stage | Meaning | +|---|---| +| `WaitingPush` | Template registered, registry credentials issued, waiting for client docker push | +| `Building` | manifest PUT succeeded / trigger-build received; CubeMaster pipeline running | +| `Ready` | Template build successful, sandboxes can use it | +| `Error` | Build failed; `reason.message` contains the CubeMaster error | + +Each `BuildContext` also keeps: the original `CreateTemplateRequest` (replayed at trigger time), registry credentials, CubeMaster `jobID`, an append-only log buffer (capped at 10 000 lines, head-trimmed on overflow), and the V3-specific fields (`name` / `tags` / `cpuCount` / `memoryMB` / `aliases`). + +CubeAPI restart loses the in-memory state — a deliberate trade-off: builds normally reach a terminal state in minutes, and a build truncated mid-flight is naturally retried by the SDK. When stronger consistency is needed, swap the `BuildRegistry` backend to durable storage (the trait abstraction is in place). + +### 3.5 ID and timeout rules + +#### `templateID` + +Derived from `name` via UUIDv5 (DNS namespace), with the `tpl-` prefix: + +```rust +fn stable_template_id(name: &str) -> String { + let id = Uuid::new_v5(&Uuid::NAMESPACE_DNS, name.as_bytes()); + format!("tpl-{}", &id.simple().to_string()[..16]) +} +``` + +- Same `name` always maps to the **same** `templateID`, matching e2b's "alias is also a primary key" semantics; +- Re-building the same template name reuses the `templateID`, avoiding stale templates in the control plane. + +#### `buildID` + +Allocated fresh on every `POST /v3/templates`: `bld-`. Stateless, unguessable. + +#### Timeout tiers + +| Routes | Timeout | Reason | +|---|---|---| +| Default (e.g. 
`/v3/templates`, `.../builds/{bid}/status`) | 30 s | Regular synchronous calls | +| Long routes (`POST /sandboxes/:id/snapshots`, `POST /sandboxes/:id/rollback`, `DELETE /templates/:id`) | 240 s | Synchronous calls into cubelet's LVM/snapshot cleanup | +| OCI Registry proxy (`/v2/*`) | 240 s | Large layer-blob PUTs can take minutes | + +This is implemented in `routes.rs` by wrapping each sub-router in its own `TimeoutLayer` and `Router::merge`-ing them together. The `merge_preserves_per_router_timeout_layers` unit test specifically guards this invariant. + +--- + +## 4. Deployment configuration + +### 4.1 One-click defaults + +`deploy/one-click/scripts/one-click/up.sh` already starts CubeAPI with: + +```bash +--registry-upstream http://127.0.0.1:5000 # local distribution sidecar +--registry-public-host cube.app # docker push target advertised to clients +--registry-pull-host 127.0.0.1:5000 # CubeMaster node-side pull address +--registry-repo-prefix e2b # image namespace +``` + +So out-of-the-box `e2b template build` + docker push **just work** in a standard deployment. For other deployment shapes, pass the corresponding flags below. 
+ +### 4.2 Full parameter reference + +| CLI flag | Env var | Default | Meaning | +|---|---|---|---| +| `--registry-upstream URL` | `CUBE_API_REGISTRY_UPSTREAM` | *unset* | Upstream OCI Registry URL; when unset `/v2/*` returns 503 and dockerfile flows are rejected | +| `--registry-public-host HOST` | `CUBE_API_REGISTRY_PUBLIC_HOST` | request Host header | Hostname advertised to clients for docker push | +| `--registry-pull-host HOST` | `CUBE_API_REGISTRY_PULL_HOST` | upstream's host:port | Internal address CubeMaster nodes use to pull images | +| `--registry-repo-prefix PREFIX` | `CUBE_API_REGISTRY_REPO_PREFIX` | `e2b` | Repo namespace for pushed images | +| `--registry-token TOKEN` | `CUBE_API_REGISTRY_TOKEN` | `_anon` | The `registry.password` field returned by `POST /templates` | +| `--default-writable-layer-size SIZE` | `CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE` | `1G` | Default `writable_layer_size` when the client doesn't provide one (CubeMaster validates this field as required) | +| `--sandbox-domain DOMAIN` | `CUBE_API_SANDBOX_DOMAIN` | `cube.app` | The `domain` field on sandbox API responses | +| `--auth-callback-url URL` | `AUTH_CALLBACK_URL` | *unset* | Callback URL for unified auth (see [Authentication](../authentication.md)) | + +### 4.3 Hooking up a private / restricted OCI Registry + +The most common case is pushing to your team's private registry. Three steps: + +1. **Deploy a registry that speaks OCI Distribution v1** (CNCF `distribution/distribution`, Harbor, AWS ECR, GCR all qualify); +2. **CubeAPI side**: set `--registry-upstream` to point at it; `--registry-public-host` is whatever hostname users docker push to (typically your ingress); +3. **CubeMaster side**: make sure `--registry-pull-host` resolves on the cluster network — if the registry is on another machine, **don't** use `127.0.0.1`. 
+ +If the registry has htpasswd / token-server auth, the docker client's `Authorization` header is forwarded verbatim by CubeAPI — no special handling needed at the API layer. + +--- + +## 5. Best practices + +### 5.1 Image preparation + +**Hard constraint**: any image used as a CubeSandbox template must have envd listening on `:49983` at startup. The two fastest paths: + +| Path | Best for | How | +|---|---|---| +| **`FROM ghcr.io/tencentcloud/cubesandbox-base:2026.16`** | Greenfield business images | Base image ships with envd + `cube-entrypoint.sh`, which backgrounds envd for you | +| **`COPY --from=cubesandbox-base ...`** | Existing business images (e.g. `e2bdev/code-interpreter`) | Inject envd binary + entrypoint into your image, switch ENTRYPOINT to `cube-entrypoint.sh` | + +Detailed Dockerfile templates, the `cube-entrypoint.sh` contract, and local smoke tests are in [Bring Your Own Image](./bring-your-own-image.md). + +> ⚠️ **Don't use `e2bdev/code-interpreter:latest` directly**: it ships e2b's upstream init but not the envd CubeSandbox needs, so the build-time probe will hit `connection refused` and time out. + +### 5.2 SDK usage + +- **Always use the two-arg `set_start_cmd(cmd, wait_for_url(...))`** form so the build blocks on actual readiness; +- The `wait_for_url` URL must be of the form `http(s)://<host>:<port>[/<path>]` — host must be `localhost` / `127.0.0.1` / `0.0.0.0`; +- The `from_image(...)` reference must be **pullable from CubeMaster nodes**; +- `cpu_count` / `memory_mb` set the template default; override per `Sandbox(...)` call as needed; +- A build log line like `[dispatch-v3] readyCmd parsed → HttpGet probe on port=... path=...` confirms the bridging fired. 
+ +### 5.3 Sandbox usage + +- **No `time.sleep` needed**: as long as the build's `wait_for_url` actually waited, the first `run_code` is immediately usable; +- Reusing a single sandbox across `run_code` calls is an order of magnitude cheaper than creating new sandboxes; +- Always `sbx.kill()` explicitly instead of relying on timeout reclamation. + +--- + +## 6. Troubleshooting + +| Symptom | Root cause | Fix | +|---|---|---| +| `BuildException: 404: b''` | CubeAPI lacks the V3 routes — likely v0.2.2 or earlier | Upgrade to v0.2.3+ | +| Build stuck in `PULLING_IMAGE` | CubeMaster nodes can't pull the image | Use a cluster-reachable registry; for private registries check `--registry-pull-host` | +| Build log says `readyCmd is recorded but not enforced` | URL parsing failed | Check that `wait_for_url` carries `http://localhost:<port>[/<path>]`, host is a loopback alias, port is explicit | +| Build log says `readyCmd parsed`, but build still times out | Probe runs but the user process really isn't ready | Verify locally: `docker run` and `curl 127.0.0.1:<port>/<path>`. 
Confirm `cube-entrypoint.sh` `exec`'s the user command rather than fork-and-exit | +| `Sandbox(template=...)` then `run_code` returns 502 | User process still warming up (probe ineffective) | Upgrade to v0.2.3+; confirm build log contains `readyCmd parsed → HttpGet probe`; check inter-node port reachability — see [Networking (CubeVS)](../../architecture/network.md) | +| `run_code` returns `404 not found` | envd is not running inside the sandbox | envd was not injected, or ENTRYPOINT was overridden — see [Bring Your Own Image](./bring-your-own-image.md#_3-alternative-injecting-envd-into-an-existing-image) | +| docker push returns `503 registry_disabled` | CubeAPI `--registry-upstream` is not set | Enable the OCI proxy per [Deployment Configuration](#_4-deployment-configuration) | +| docker push returns `request timeout` | layer blob upload exceeded the 240 s long timeout | Check upstream registry storage IO; or shrink layers (`--squash` / multi-stage builds) | + +For more template-related issues see [Templates Troubleshooting](../troubleshooting/templates.md). + +--- + +## 7. 
Further reading + +- [Bring Your Own Image](./bring-your-own-image.md) — Dockerfile templates, `cube-entrypoint.sh` contract, local smoke tests +- [Create Templates from OCI Image](./template-from-image.md) — explicit `--probe` / `--probe-path` configuration via `cubemastercli` +- [Networking (CubeVS)](../../architecture/network.md) — how cross-node port forwarding works +- [Templates Troubleshooting](../troubleshooting/templates.md) — common build-time issues +- [Authentication](../authentication.md) — `unified_auth` middleware and API key configuration diff --git a/docs/guide/tutorials/template-from-image.md b/docs/guide/tutorials/template-from-image.md index 2f4591745..6b476c14b 100644 --- a/docs/guide/tutorials/template-from-image.md +++ b/docs/guide/tutorials/template-from-image.md @@ -244,3 +244,9 @@ template deleted: tpl-748094d2f2374b0a8a37e6ec | `status: FAILED` after BUILDING | Build error (disk full, Dockerfile issue, etc.) | Re-run `tpl status --job-id --json` and inspect `last_error` | | `distribution: 0/N ready` after READY | Artifact distribution still in progress (normal briefly) | Wait and re-run `tpl info`; if stuck check Cubelet logs on target nodes | | Sandbox fails readiness probe | Service not listening on the expected port/path at startup | Verify your container starts the HTTP server before signalling ready; adjust `--probe-path` if needed | + +--- + +## Further reading + +The `--probe` / `--probe-path` flags above target the `cubemastercli` workflow. If you build templates through the [e2b](https://e2b.dev/) Python / JS SDK (`Template().set_start_cmd(..., wait_for_url(...))`), **you don't have to specify probe parameters by hand** — CubeAPI parses `(port, path)` straight out of `wait_for_url(...)` and synthesises an equivalent HttpGet probe. See [Create Templates with the e2b SDK](./template-from-e2b-sdk.md). 
diff --git a/docs/zh/guide/tutorials/template-from-e2b-sdk.md b/docs/zh/guide/tutorials/template-from-e2b-sdk.md new file mode 100644 index 000000000..96a087c10 --- /dev/null +++ b/docs/zh/guide/tutorials/template-from-e2b-sdk.md @@ -0,0 +1,396 @@ +--- +title: 通过 e2b SDK 创建模板 +lang: zh-CN +description: 在 CubeSandbox 上使用 e2b Python / JS SDK 制作模板的端到端实践指南,含 V3 协议契约、OCI Registry 反代、wait_for_url 就绪探针桥接、运维配置与故障排查。 +--- + +# 通过 e2b SDK 创建模板 + +CubeSandbox 在协议层完整兼容了 [e2b](https://e2b.dev/) **V3 模板与沙箱协议**。本文从一份"现成的 e2b 风格镜像"出发,讲清楚如何使用 e2b 官方 Python / JS SDK 在 CubeSandbox 集群上 **创建模板 → 构建 → 创建沙箱执行代码** 的完整路径,并给出技术参考和最佳实践。 + +> 适用版本:CubeSandbox **v0.2.3+**。 +> +> - 如果你想用 `cubemastercli` 命令行制作模板,请参考[从 OCI 镜像制作模板](./template-from-image.md); +> - 如果你只是想给现有镜像加上 envd,请先读[自带镜像接入 (envd)](./bring-your-own-image.md)。 + +--- + +## 一、整体架构 + +e2b SDK 客户端、CubeAPI、CubeMaster、bundled OCI Registry 之间的协作关系: + +```mermaid +flowchart LR + subgraph Client[e2b 客户端] + SDK[Python / JS SDK
Template.build] + CLI[e2b CLI
docker push] + end + + subgraph Edge[CubeAPI 边缘服务] + V3[V3 模板路由
/v3/templates
/templates/.../files/
/v2/.../builds/
.../status] + REG[OCI 反代
/v2/*] + Reg[(OCI Registry
distribution/distribution
:5000)] + end + + subgraph CP[控制面] + Master[CubeMaster] + Cubelet[cubelet] + end + + SDK -->|HTTPS| V3 + CLI -->|docker push| REG + REG --> Reg + V3 --> Master + Master -->|RunSandbox + AppSnapshot| Cubelet + Cubelet -->|doProbe HttpGet| Cubelet +``` + +要点: + +1. **CubeAPI** 充当 e2b V3 协议的"协议边缘",把 V3 调用翻译成 CubeMaster 内部的 `CreateTemplateFromImage` / 构建作业语义。 +2. **OCI Registry** 是一个独立的 sidecar(默认 `distribution/distribution`,监听 `127.0.0.1:5000`),CubeAPI 用 `/v2/*` 路由原样反向代理 docker push 流量。 +3. **CubeMaster + cubelet** 收到 `//:` 形式的镜像引用后,再走 OCI 镜像 → ext4 rootfs → 创建临时 sandbox → 探活 → 快照 → 注册的常规流水线。 + +--- + +## 二、快速开始 + +> 前置:你已经按 [自带镜像接入](./bring-your-own-image.md) 准备好了一个**自带 envd(49983)**的镜像,并推送到了一个集群可达的 OCI Registry(即下面这个 `from_image` 中的镜像)。 + +### 2.1 安装 SDK 并配置环境 + +```bash +pip install e2b python-dotenv +``` + +把 CubeAPI 入口和 API Key 写进项目根的 `.env` 文件: + +```dotenv +E2B_API_KEY=e2b_0000000000000000000000000000000000000000 # 如果 CubeAPI 没启用鉴权,这里填任意值 +E2B_API_URL=http://localhost:3000 +SSL_CERT_FILE="/root/.local/share/mkcert/rootCA.pem" +``` + +### 2.2 写模板定义 + +```python +# build_template.py + +from dotenv import load_dotenv +from e2b import Template, default_build_logger, wait_for_url + +load_dotenv() + +if __name__ == '__main__': + template = ( + Template() + .from_image("cube-sandbox-cn.tencentcloudcr.com/cube-sandbox/sandbox-code:latest") # ← 也可以改成自己的镜像 + .set_start_cmd( + "sudo /root/.jupyter/start-up.sh", + wait_for_url("http://localhost:49999/health") # <- 将被作用于probe探针 + ) + ) + Template.build( + template, + 'template-tag-code', + cpu_count=1, + memory_mb=1024, + on_build_logs=default_build_logger(), + ) +``` + +### 2.3 构建 + 使用 + +```bash +python build_template.py +# 看到 "[7/7] READY" 后即可创建沙箱 +``` + +```python +# use_sandbox.py +from e2b import Sandbox + +sbx = Sandbox(template="template-tag-code", timeout=120) +print(sbx.run_code("print('hello from cube sandbox')").text) +sbx.kill() +``` + +正常情况下:**第一次 `run_code` 立即可用,不需要 `time.sleep`**——只要构建期 `wait_for_url` 真的等到业务 
ready,沙箱恢复完成那一刻业务进程就已在监听。 + +--- + +## 三、技术参考 + +### 3.1 V3 协议端点契约 + +CubeAPI 暴露下列 4 个 V3 协议端点(与 e2b 上游 SDK 一一对应): + +| 顺序 | 方法 + 路径 | Handler | 作用 | +|---|---|---|---| +| ① | `POST /v3/templates` | `templates_v3::v3_create_template` | 注册模板 + 分配第一次 build attempt,返回 `{templateID, buildID, names, aliases, tags, public}` | +| ② | `GET /templates/{tid}/files/{hash}` | `templates_v3::v3_get_files_hash` | SDK 上传 build context 前的缓存探测;CubeAPI 当前固定返回 `present=true` 让 SDK 跳过上传(V3 流目前只走 `from_image`) | +| ③ | `POST /v2/templates/{tid}/builds/{bid}` | `templates_v3::v2_trigger_build` | 真正触发构建:解析 `from_image` / `from_template` / 已推送镜像,组装 `CreateTemplateFromImageReq` 派发到 CubeMaster | +| ④ | `GET /templates/{tid}/builds/{bid}/status` | `templates_v3::v3_get_build_status` | 轮询构建状态,返回 e2b 严格匹配的 `{buildID, templateID, status, logs[], logEntries[], reason?}` 信封 | + +整条 SDK 调用链时序: + +```mermaid +sequenceDiagram + participant SDK as e2b SDK + participant CLI as e2b CLI / docker + participant API as CubeAPI + participant Reg as OCI Registry + participant Master as CubeMaster + participant Cubelet as cubelet + + SDK->>API: POST /v3/templates {name, cpuCount, memoryMB} + API-->>SDK: 202 {templateID, buildID, ...} + + Note over SDK,Reg: 仅当走 Dockerfile build 时才有 push 流;
纯 from_image 流没有下面的 docker push(PUT manifest)步骤 + SDK->>API: GET /templates/{tid}/files/{hash} + API-->>SDK: 201 {present:true} + CLI->>API: PUT /v2/{repo}/manifests/{tag} + API->>Reg: 反代 + Reg-->>API: 201 Created + API->>API: mark_image_pushed(bid) + API-->>CLI: 201 Created + + SDK->>API: POST /v2/templates/{tid}/builds/{bid}
{fromImage, startCmd, readyCmd, ...} + API->>API: parse_ready_url → probe_port/path + API->>Master: CreateTemplateFromImage + Probe.HttpGet + API-->>SDK: 202 Accepted + + loop 每 N 秒轮询 + SDK->>API: GET /.../builds/{bid}/status?logsOffset=K + API->>Master: get_template_build_status + API-->>SDK: 200 {status, logs[], reason?} + end + + Master->>Cubelet: AppSnapshot(req with Probe) + Cubelet->>Cubelet: doProbe 阻塞探针 → 业务 ready + Cubelet-->>Master: snapshot 已包含 ready 状态 + Master-->>API: build READY + API-->>SDK: status="ready" +``` + +### 3.2 OCI Registry 反代 + +CubeAPI 通过一组 `/v2/*` 路由把 e2b CLI / docker push 的流量原样反代到上游 OCI Registry。关键设计: + +| 行为 | 说明 | +|---|---| +| **绕过 unified_auth** | docker push 用的是 registry 自己签发的 Basic / Bearer,与 CubeAPI 的 `Authorization: Bearer <api-key>` 不在同一个域,因此 `/v2/*` 路径不走 `unified_auth` 中间件。 | +| **240 s 超时** | 单个 layer blob PUT 可能耗时数分钟,因此 `/v2/*` 路径独享一组 240 s 的 `TimeoutLayer`,与默认的 30 s 路由分开(详见 `routes.rs::SNAPSHOT_LONG_ROUTE_TIMEOUT`)。 | +| **Hop-by-hop 头剥离** | 转发前后都按 RFC 7230 §6.1 剥掉 `connection` / `keep-alive` / `transfer-encoding` 等连接级头,保证两端 HTTP/1.1 实现兼容。 | +| **`mark_image_pushed` 钩子** | 当 `PUT /v2/<repo>/manifests/<tag>` 成功时,CubeAPI 用 `<tag>` 作为 `buildID` 标记对应的 BuildContext 进入 `Building` 阶段,让随后的 trigger build 调用可以无缝衔接。 | +| **未配置时降级** | 若 `registry_upstream` 未配置,`/v2/*` 一律返回 503 `registry_disabled`;这种部署形态下纯 `from_image` 流仍可工作。 | + +部署时**默认开启**这条链路(`deploy/one-click/scripts/one-click/up.sh` 中已配置),对应的启动参数见下方代码块。 + +如果还没有镜像仓库,可以先用 `docker run -d -p 5000:5000 --restart always --name registry registry:3` 快速启动一个。 + +```bash +cube-api \ + --registry-upstream http://127.0.0.1:5000 \ + --registry-public-host cube.app \ + --registry-pull-host 127.0.0.1:5000 \ + --registry-repo-prefix e2b +``` + +详见下文[四、运维配置](#四运维配置)。 + +### 3.3 `wait_for_url` 与就绪探针桥接 + +`wait_for_url(...)` 是模板"创建即可用"语义的关键。它的语义是:**模板构建期间** 等到指定 URL 返回 2xx **再** 对沙箱做快照——这样所有从该模板恢复的沙箱都已经"业务在监听",SDK `sbx.run_code(...)` 立即可用。 + +#### 桥接逻辑 + +e2b SDK 把 `wait_for_url(...)` 序列化为一段 shell 形式的 `readyCmd`(最终是 `curl ...`)。CubeAPI
不直接执行这段 shell,而是在 `services/templates.rs::v3_trigger_build` 中做一次轻量解析: + +1. 在 `readyCmd` 中找 `http(s)://<host>:<port>[/<path>]` 形式的 URL; +2. 校验 `host` 必须是 loopback 别名(`localhost` / `127.0.0.1` / `0.0.0.0` / `::1` / `[::1]`)—— 防止意外把探针指向外部服务; +3. 校验端口必须显式给出且 ≠ 0; +4. 解析成功 → 自动填入 `probe_port` / `probe_path`,由 `build_probe()` 生成原生 `Probe.HttpGet` 透传给 CubeMaster; +5. cubelet 在容器创建后 **阻塞性** 轮询该探针(`doProbe`),直到 2xx 才 commit 快照。 + +整条链路对用户完全透明,SDK 端**不需要**额外配置。 + +#### 解析规则一览 + +| `readyCmd` 输入 | 解析结果 | 备注 | +|---|---|---| +| `wait_for_url("http://localhost:49999/health")` | `(49999, "/health")` | 标准用法 | +| `curl -fsS http://127.0.0.1:8080/ready?retries=3 \|\| exit 1` | `(8080, "/ready")` | query string 自动剥掉 | +| `until nc -z 0.0.0.0:3000; do sleep 0.2; done; curl http://0.0.0.0:3000` | `(3000, "/")` | 路径缺省时填 `/` | +| `curl http://api.example.com:443/healthz` | ❌ `None` | 非 loopback 主机会被拒绝 | +| `curl http://localhost/health` | ❌ `None` | 必须显式给出端口 | +| `curl http://127.0.0.1:0/` | ❌ `None` | 端口必须 > 0 | +| `/usr/local/bin/wait-for-it.sh --quiet` | ❌ `None` | 没有可识别的 URL | + +#### 三级优先级 + +`probe_port` 的来源按以下优先级解析: + +1. **caller 显式设置** — V3 请求体中的 `probePort` / `probePath`; +2. **`readyCmd` 解析** — 自动从 `wait_for_url(...)` / `curl ...` 中抽取; +3.
**`exposedPorts[0]` + `/health`** — 兜底(与 e2b 历史行为兼容)。 + +任意一级生效即生成 `Probe.HttpGet`;三级全部失效则**不生成探针**,退化为"`Create` 一返回就视为 ready"行为,仍可工作但 SDK 端可能需要 `time.sleep`。 + +#### 探针参数(cubelet 默认行为) + +| 字段 | 默认值 | 含义 | +|---|---|---| +| `timeout_ms` | 30 000 | 整个探针流程的总预算(30 秒) | +| `period_ms` | 500 | 每 500 ms 探一次 | +| `success_threshold` | 1 | 第一次 2xx 即视为 ready | +| `failure_threshold` | 60 | 最多 60 次失败(约 30 s)后宣告失败 | + +> 业务启动需要超过 30 秒(罕见)时,可以走 `cubemastercli` 显式覆盖路径,或在后续向 CubeAPI 增加 `probeTimeoutMs` 字段。 + +### 3.4 Build 状态机 + +CubeAPI 在内存里维护一个 `BuildRegistry` 来跟踪每个 `(templateID, buildID)` 的生命周期(`services/builds.rs`): + +``` +WaitingPush ─push manifest成功─► Building ─CubeMaster job终态─► Ready / Error +``` + +| 阶段 | 含义 | +|---|---| +| `WaitingPush` | template 已注册,registry 凭据已签发,等待客户端 docker push | +| `Building` | manifest PUT 成功 / 触发 build 调用收到,CubeMaster 流水线运行中 | +| `Ready` | 模板构建成功,可被沙箱使用 | +| `Error` | 构建失败,`reason.message` 中包含 CubeMaster 的错误 | + +`BuildContext` 同时保留:原始 `CreateTemplateRequest`(重放用)、registry 凭据、CubeMaster `jobID`、append-only 日志缓冲(最多 10 000 行,溢出滚动)、SDK 期望的 V3 字段(`name` / `tags` / `cpuCount` / `memoryMB` / `aliases`)。 + +CubeAPI 重启会丢失内存状态——这是一个有意识的取舍:build 流通常在数分钟内到达终态,启动失败的 build SDK 会自然重试。需要更强一致性时,可以把 `BuildRegistry` 后端切到持久化存储(trait 已留好抽象点)。 + +### 3.5 ID 与超时规则 + +#### `templateID` + +由 `name` 通过 UUIDv5(DNS 命名空间)派生,前缀 `tpl-`: + +```rust +fn stable_template_id(name: &str) -> String { + let id = Uuid::new_v5(&Uuid::NAMESPACE_DNS, name.as_bytes()); + format!("tpl-{}", &id.simple().to_string()[..16]) +} +``` + +- 同一个 `name` 永远映射到**同一个** `templateID`,与 e2b 的 "alias 也是主键" 语义一致; +- 重复构建同名模板会复用 `templateID`,避免在控制面留下孤立模板。 + +#### `buildID` + +每次 `POST /v3/templates` 现场分配:`bld-`,无状态、不可猜测。 + +#### 超时分级 + +| 路由 | 超时 | 原因 | +|---|---|---| +| 默认(如 `/v3/templates` / `.../builds/{bid}/status`) | 30 s | 普通同步调用 | +| 长路由(`POST /sandboxes/:id/snapshots`、`POST /sandboxes/:id/rollback`、`DELETE /templates/:id`) | 240 s | 同步调用 cubelet 的 LVM/快照清理 | +| OCI Registry 反代(`/v2/*`) | 240 s 
| 大 layer blob PUT 可能数分钟 | + +在 `routes.rs` 中通过把不同子 router 用各自的 `TimeoutLayer` 包起来再 `Router::merge` 实现——`merge_preserves_per_router_timeout_layers` 单测专门覆盖了这个 invariant。 + +--- + +## 四、运维配置 + +### 4.1 一键部署默认值 + +`deploy/one-click/scripts/one-click/up.sh` 启动 CubeAPI 时已经默认带上: + +```bash +--registry-upstream http://127.0.0.1:5000 # 同机 distribution sidecar +--registry-public-host cube.app # 对外 docker push 域名 +--registry-pull-host 127.0.0.1:5000 # CubeMaster 节点拉镜像地址 +--registry-repo-prefix e2b # 镜像 namespace +``` + +意味着标准部署下 e2b CLI 的 `docker push` **开箱可用**。如果你用别的方式部署,请按下表把对应参数显式传给 `cube-api`。 + +### 4.2 完整参数表 + +| CLI 参数 | 环境变量 | 默认 | 含义 | +|---|---|---|---| +| `--registry-upstream URL` | `CUBE_API_REGISTRY_UPSTREAM` | *unset* | 上游 OCI Registry 的 URL;未设置时 `/v2/*` 返回 503,dockerfile 流被拒 | +| `--registry-public-host HOST` | `CUBE_API_REGISTRY_PUBLIC_HOST` | 取请求 Host 头 | 对客户端公布的 docker push 主机名 | +| `--registry-pull-host HOST` | `CUBE_API_REGISTRY_PULL_HOST` | upstream 的 host:port | CubeMaster 节点拉镜像用的内部地址 | +| `--registry-repo-prefix PREFIX` | `CUBE_API_REGISTRY_REPO_PREFIX` | `e2b` | 推送镜像的 repo namespace | +| `--registry-token TOKEN` | `CUBE_API_REGISTRY_TOKEN` | `_anon` | `POST /templates` 响应里 `registry.password` 字段 | +| `--default-writable-layer-size SIZE` | `CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE` | `1G` | 客户端没传 `writable_layer_size` 时的默认值(CubeMaster 强校验该字段) | +| `--sandbox-domain DOMAIN` | `CUBE_API_SANDBOX_DOMAIN` | `cube.app` | 沙箱响应里 `domain` 字段 | +| `--auth-callback-url URL` | `AUTH_CALLBACK_URL` | *unset* | 启用统一鉴权时回调 URL(详见[鉴权](../authentication.md)) | + +### 4.3 私有 / 受限 OCI Registry 接入 + +最常见的场景是把 docker push 推到团队的私有 registry。三步配置: + +1. **部署一个支持 OCI Distribution v1 的 registry**(CNCF `distribution/distribution`、Harbor、AWS ECR、GCR 都行); +2. CubeAPI 端:`--registry-upstream` 指向该 registry,`--registry-public-host` 是用户 docker push 的目标主机名(通常你的 ingress 域名); +3. 
CubeMaster 端:确保 `--registry-pull-host` 指向 CubeMaster 节点能拉到镜像的内网地址(如果 registry 在另一台机器上,**不要**用 `127.0.0.1`)。 + +如果 registry 自带 htpasswd / token server 鉴权,docker 客户端的 `Authorization` 头会被 CubeAPI 原样透传到上游——不需要在 CubeAPI 这一层做特殊处理。 + +--- + +## 五、最佳实践 + +### 5.1 镜像准备 + +**强约束**:任何用作 CubeSandbox 模板的镜像,启动后必须在 `:49983` 上有 envd 监听。两条最快的路径: + +| 路径 | 适合 | 操作 | +|---|---|---| +| **`FROM ghcr.io/tencentcloud/cubesandbox-base:2026.16`** | 全新业务镜像 | base 镜像已预装 envd + `cube-entrypoint.sh`,自动后台拉起 envd | +| **`COPY --from=cubesandbox-base ...`** | 已有业务镜像(如 `e2bdev/code-interpreter`) | 把 envd 二进制和入口脚本注入现有镜像,再把 ENTRYPOINT 换成 `cube-entrypoint.sh` | + +详细的 Dockerfile 样板、`cube-entrypoint.sh` 契约、本地 smoke test 见 [自带镜像接入](./bring-your-own-image.md)。 + +> ⚠️ **不要直接拿 `e2bdev/code-interpreter:latest` 制作模板**:它只有 e2b 上游的 init,没有 CubeSandbox 需要的 envd,模板创建时探针会以 `connection refused` 一路失败到超时。 + +### 5.2 SDK 用法 + +- **始终用 `set_start_cmd(cmd, wait_for_url(...))` 二参形式**,让构建期阻塞到业务 ready; +- `wait_for_url` 的 URL 必须 `http(s)://:[/]` 形式,host 必须是 `localhost` / `127.0.0.1` / `0.0.0.0` 之一; +- `from_image(...)` 中的镜像引用必须是 **CubeMaster 节点能 pull 到** 的 registry; +- `cpu_count` / `memory_mb` 是模板默认资源,可在 `Sandbox(...)` 调用时按需覆盖; +- 看到 build log 中出现 `[dispatch-v3] readyCmd parsed → HttpGet probe on port=... 
path=...` 即代表桥接成功。 + +### 5.3 沙箱使用 + +- **不需要 `time.sleep`**:只要构建期 `wait_for_url` 真的等到 ready,沙箱第一次 `run_code` 直接可用; +- 多次调用 `sbx.run_code(...)` 复用同一个沙箱比反复创建新沙箱开销小一个量级; +- 用完显式 `sbx.kill()` 而不是依赖超时回收。 + +--- + +## 六、故障排查 + +| 现象 | 根因 | 处理 | +|---|---|---| +| `BuildException: 404: b''` | CubeAPI 没有 V3 路由,多半是 v0.2.2 及更早版本 | 升级到 v0.2.3+ | +| build 卡在 `PULLING_IMAGE` | CubeMaster 节点拉不到镜像 | 用集群可达的 registry;私有 registry 检查 `--registry-pull-host` | +| build 日志出现 `readyCmd is recorded but not enforced` | URL 没被解析出来 | 检查 `wait_for_url` 是否写成 `http://localhost:[/]`,host 必须是 loopback,端口必须显式 | +| build 日志出现 `readyCmd parsed`,但 build 仍超时失败 | 探针在跑、但业务真的没 ready | 在镜像里 `docker run` 后 `curl 127.0.0.1:/` 本地验证;确认 `cube-entrypoint.sh` 是 `exec` 业务而不是 fork-and-exit | +| `Sandbox(template=...)` 后立即 `run_code` 报 502 | 业务还在启动中(探针没真正生效)| 先升级到 v0.2.3+;再确认 build 日志里有 `readyCmd parsed → HttpGet probe`;最后检查跨节点端口连通性,参见[CubeVS 网络模型](../../architecture/network.md) | +| `run_code` 报 `404 not found` | sandbox 内 envd 没起来 | 镜像里没注入 envd 或 ENTRYPOINT 被覆盖;按 [自带镜像接入](./bring-your-own-image.md#_3-备选-往现有镜像里注入-envd) 处理 | +| docker push 报 `503 registry_disabled` | CubeAPI 未配置 `--registry-upstream` | 按 [运维配置](#四运维配置) 启用 OCI Registry 反代 | +| docker push 报 `request timeout` | layer blob 上传慢、超过 240 s 长超时 | 检查上游 registry 的存储后端 IO;或临时把 layer 切小(`--squash` / 多阶段构建) | + +更多模板共性问题见 [模板相关排障](../troubleshooting/templates.md)。 + +--- + +## 七、进一步阅读 + +- [自带镜像接入 (envd)](./bring-your-own-image.md) — Dockerfile 模板、`cube-entrypoint.sh` 契约、本地 smoke test +- [从 OCI 镜像制作模板](./template-from-image.md) — `cubemastercli` 路径下的 `--probe` / `--probe-path` 显式探针配置 +- [CubeVS 网络模型](../../architecture/network.md) — 跨节点端口转发原理 +- [模板相关排障](../troubleshooting/templates.md) — 模板构建常见故障 +- [鉴权](../authentication.md) — `unified_auth` 中间件与 API key 配置 diff --git a/docs/zh/guide/tutorials/template-from-image.md b/docs/zh/guide/tutorials/template-from-image.md index f7b44a197..7bb9bfba4 100644 --- a/docs/zh/guide/tutorials/template-from-image.md +++ 
b/docs/zh/guide/tutorials/template-from-image.md @@ -224,3 +224,9 @@ template deleted: tpl-748094d2f2374b0a8a37e6ec | `status: FAILED`(BUILDING 阶段) | 构建错误(磁盘满、Dockerfile 问题等) | 执行 `tpl status --job-id --json` 查看 `last_error` 字段 | | `distribution: 0/N ready`(状态已 READY) | artifact 分发仍在进行(短暂正常) | 等待后重新执行 `tpl info`;若长时间未恢复检查目标节点的 Cubelet 日志 | | 沙箱启动后就绪探针一直失败 | 容器内服务未在预期端口/路径监听,或服务尚未完全就绪时 HTTP server 已提前启动 | 确认 HTTP server 在应用完全就绪后再启动;检查 `--probe-path` 是否正确 | + +--- + +## 延伸阅读 + +上述所有 `--probe` / `--probe-path` 参数面向的是 `cubemastercli` 路径。如果你使用 [e2b](https://e2b.dev/) Python / JS SDK 制作模板(`Template().set_start_cmd(..., wait_for_url(...))`),**不需要手写探针参数** —— CubeAPI 会从 `wait_for_url(...)` 中自动解析出 `(port, path)` 并生成同样的 HttpGet 探针。详见主线教程:[使用 e2b SDK 创建模板](./template-from-e2b-sdk.md)。 From 8f8f6fa6a7a9f308448c16f480d64b90277b0ace Mon Sep 17 00:00:00 2001 From: joeyczheng Date: Wed, 10 Jun 2026 17:48:57 +0800 Subject: [PATCH 2/2] fix(CubeAPI): harden e2b V3 template-build pipeline Security: * Per-build short-lived push credentials (`bld_` + 256-bit password) replace the global `_token` shared secret. * Registry reverse-proxy validates Basic auth against an in-memory credential index and enforces repo scoping; 401 with WWW-Authenticate + 403 on cross-build access. Per-credential rate limit added. * `mark_image_pushed` cross-checks the manifest repo, not just the tag. * Startup WARN when public bind meets unauthenticated loopback upstream. Resource bounds: * `BuildRegistry` gains TTL + size-cap + background GC; in-flight builds are never evicted. New `build_registry_*` config knobs. * Registry proxy streams request/response bodies end-to-end instead of buffering, so concurrent multi-GiB pushes no longer pin the heap. * V3 build-pipeline routes moved to `with_auth_and_rate_limit`. Correctness: * `image_pushed` flag is the single source of truth for "client really pushed"; OCI fallback no longer dispatches against an unpushed ref. 
* `fromTemplate` and `dockerfile`/`steps`-only builds are rejected with 501 until a real resolver/builder ships, instead of failing obscurely deeper in the pipeline. * `BuildLogLine.timestamp` is preserved across status polls. * Probe synthesis comments fixed: `readyCmd` -> `Probe.HttpGet`, not `Probe.Exec`. Signed-off-by: joeyczheng --- CubeAPI/Cargo.lock | 16 + CubeAPI/Cargo.toml | 2 +- CubeAPI/src/config/mod.rs | 67 ++ CubeAPI/src/handlers/registry.rs | 447 ++++++++++++- CubeAPI/src/handlers/templates_v3.rs | 20 + CubeAPI/src/middleware/rate_limit.rs | 109 +++- CubeAPI/src/models/mod.rs | 29 +- CubeAPI/src/routes.rs | 185 +++++- CubeAPI/src/services/builds.rs | 479 +++++++++++++- CubeAPI/src/services/mod.rs | 4 +- CubeAPI/src/services/templates.rs | 911 +++++++++++++++++++++++++-- CubeAPI/src/state.rs | 60 +- 12 files changed, 2229 insertions(+), 100 deletions(-) diff --git a/CubeAPI/Cargo.lock b/CubeAPI/Cargo.lock index 4178201e6..dfc1586de 100644 --- a/CubeAPI/Cargo.lock +++ b/CubeAPI/Cargo.lock @@ -2161,6 +2161,7 @@ dependencies = [ "base64 0.22.1", "bytes", "futures-core", + "futures-util", "http 1.4.0", "http-body", "http-body-util", @@ -2180,12 +2181,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower 0.5.3", "tower-http 0.6.11", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots 1.0.7", ] @@ -3488,6 +3491,19 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.244.0" diff --git a/CubeAPI/Cargo.toml b/CubeAPI/Cargo.toml index d5e52f563..6191bd3ed 100644 --- a/CubeAPI/Cargo.toml +++ b/CubeAPI/Cargo.toml @@ -69,7 +69,7 @@ governor = { 
version = "0.6", features = ["dashmap"] } # ── HTTP client (orchestrator / backend calls) ──────────────────────────── # connection pool built-in, rustls for TLS -reqwest = { version = "0.12", features = ["json", "rustls-tls"], default-features = false } +reqwest = { version = "0.12", features = ["json", "rustls-tls", "stream"], default-features = false } # ── Validation ──────────────────────────────────────────────────────────── validator = { version = "0.16", features = ["derive"] } diff --git a/CubeAPI/src/config/mod.rs b/CubeAPI/src/config/mod.rs index 417573283..3cc7b57b8 100644 --- a/CubeAPI/src/config/mod.rs +++ b/CubeAPI/src/config/mod.rs @@ -75,6 +75,32 @@ pub struct ServerConfig { /// /// When unset, /v2/* returns 503 and `dockerfile`-based template requests /// are rejected with 501. + /// + /// ## Security contract — read this before exposing CubeAPI publicly + /// + /// CubeAPI itself enforces **per-build, short-lived push credentials** + /// on every `/v2/*` path other than the unauthenticated `GET /v2/` ping + /// (which is required by the docker / oci-distribution handshake). The + /// credential is minted at build-creation time, returned to the SDK in + /// the `registry` field of the build response, indexed inside the + /// in-memory `BuildRegistry`, and is repo-scoped: it can only push / + /// pull blobs and manifests under `/`. It is + /// dropped when the build reaches its terminal stage (TTL- or + /// size-cap-evicted by `BuildRegistry`). + /// + /// **Strongly recommended** in addition: run an authenticated upstream + /// (e.g. `distribution/distribution` with htpasswd) and bind CubeAPI + /// itself behind TLS + an HTTP authenticator. Both layers together + /// match the depth of access control most operators expect from a + /// public OCI registry. + /// + /// **Not safe**: setting `registry_upstream` to an unauthenticated + /// upstream *and* binding CubeAPI on a public interface without TLS. 
+ /// CubeAPI's own credential gate covers the bulk of the attack + /// surface, but it cannot stop a network attacker from observing the + /// per-build password in transit. CubeAPI logs a `WARN` at startup + /// when this combination is detected (see + /// `AppState::log_registry_security_posture`). #[serde(default)] pub registry_upstream: Option, @@ -109,12 +135,50 @@ pub struct ServerConfig { /// Env var: CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE | Default: "1G". #[serde(default = "default_writable_layer_size")] pub default_writable_layer_size: String, + + /// How long (seconds) a *terminal* build (Ready / Error) is kept in the + /// in-memory `BuildRegistry` after reaching its terminal stage. Past this + /// TTL the build context (create request, credentials, logs, …) is + /// evicted by the background GC. + /// + /// 0 disables TTL-based eviction (only the size cap will fire). + /// Default: 3600 (1 hour) — comfortably covers slow log pollers without + /// retaining old builds for the lifetime of the process. + #[serde(default = "default_build_registry_terminal_ttl_secs")] + pub build_registry_terminal_ttl_secs: u64, + + /// Hard upper bound on the number of *logical* builds tracked in the + /// `BuildRegistry`. When exceeded, the oldest terminal builds are + /// evicted FIFO regardless of TTL. In-flight builds are never evicted by + /// this cap (a warning is logged if the cap can't be honoured because + /// every entry is still in-flight). + /// + /// 0 disables the cap (only TTL applies). Default: 5000. + #[serde(default = "default_build_registry_max_entries")] + pub build_registry_max_entries: usize, + + /// Interval (seconds) at which the background GC task scans the + /// `BuildRegistry` for TTL-expired terminal builds. Default: 300 (5 min). + /// 0 disables the background task entirely (size-cap eviction at + /// `create()` time still applies). 
+ #[serde(default = "default_build_registry_gc_interval_secs")] + pub build_registry_gc_interval_secs: u64, } fn default_registry_repo_prefix() -> String { "e2b".to_string() } +fn default_build_registry_terminal_ttl_secs() -> u64 { + 3600 +} +fn default_build_registry_max_entries() -> usize { + 5000 +} +fn default_build_registry_gc_interval_secs() -> u64 { + 300 +} + fn default_writable_layer_size() -> String { std::env::var("CUBE_API_DEFAULT_WRITABLE_LAYER_SIZE").unwrap_or_else(|_| "1G".to_string()) } @@ -201,6 +265,9 @@ impl Default for ServerConfig { registry_pull_host: None, registry_token: None, default_writable_layer_size: default_writable_layer_size(), + build_registry_terminal_ttl_secs: default_build_registry_terminal_ttl_secs(), + build_registry_max_entries: default_build_registry_max_entries(), + build_registry_gc_interval_secs: default_build_registry_gc_interval_secs(), } } } diff --git a/CubeAPI/src/handlers/registry.rs b/CubeAPI/src/handlers/registry.rs index 2d751b795..743841833 100644 --- a/CubeAPI/src/handlers/registry.rs +++ b/CubeAPI/src/handlers/registry.rs @@ -2,17 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // - use axum::{ - body::{Body, Bytes}, + body::Body, extract::{Path, Request, State}, http::{header, HeaderMap, HeaderName, HeaderValue, Method, StatusCode}, response::Response, }; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; +use futures::TryStreamExt; use std::str::FromStr; use crate::{ error::{AppError, AppResult}, + models::ApiError, + services::builds::BuildContext, state::AppState, }; @@ -29,8 +32,19 @@ const HOP_BY_HOP: &[&str] = &[ "host", ]; +/// Realm string echoed back in `WWW-Authenticate` challenges so docker / +/// oci-distribution clients know to retry with `Authorization: Basic`. +const REALM: &str = "cubeapi-registry"; + /// `GET /v2/` — registry ping. Always returns `200 OK` with the version header /// when an upstream is configured. 
+/// +/// Note we deliberately do **not** require `Authorization` on the ping. The +/// docker / oci-distribution v2 protocol uses an unauthenticated GET /v2/ as +/// the discovery handshake — it's how the client learns the realm of the +/// auth challenge in the first place. Requiring auth here would break every +/// CLI client at the very first round-trip. The actual blob/manifest paths +/// in `proxy()` *do* require credentials, so this is not a bypass. pub async fn ping(State(state): State) -> AppResult { let upstream = state .config @@ -39,10 +53,43 @@ pub async fn ping(State(state): State) -> AppResult { .filter(|s| !s.is_empty()) .ok_or_else(registry_disabled)?; - forward(&state, Method::GET, upstream, "/v2/", "", &HeaderMap::new(), Bytes::new()).await + forward( + &state, + Method::GET, + upstream, + "/v2/", + "", + &HeaderMap::new(), + None, + ) + .await } /// `ANY /v2/*path` — generic reverse-proxy. +/// +/// Both the request body (Docker/OCI blob PATCH/PUT can be GiB-sized) and the +/// upstream response body (blob GET) are forwarded as streams; nothing is ever +/// fully buffered in CubeAPI's heap. This keeps memory pressure bounded +/// regardless of layer size or upload concurrency. +/// +/// ## Defence in depth +/// +/// Before any upstream forwarding happens, we enforce **two CubeAPI-layer +/// access controls** that do *not* rely on the upstream registry having its +/// own auth configured: +/// +/// 1. **Per-build credential validation** — the inbound `Authorization: +/// Basic` header must decode to a `(username, password)` pair that we +/// ourselves issued via `mint_registry_credential` and is still +/// attached to a *live* build. Missing / malformed / unknown / wrong +/// password → `401 Unauthorized` with a `WWW-Authenticate: Basic` +/// challenge so the docker client retries the standard way. +/// 2. 
**Repo scoping** — once the credential resolves to a `BuildContext`, +/// we require the request's `` segment (everything between +/// `/v2/` and the next protocol verb) to match the repo embedded in +/// that build's `image_ref`. So even a holder of a valid build A +/// credential cannot push, pull or fingerprint blobs/manifests under +/// build B's repository — the request is rejected with `403 Forbidden`. pub async fn proxy( State(state): State, Path(path): Path, @@ -59,33 +106,95 @@ pub async fn proxy( let method = request.method().clone(); let query = request.uri().query().unwrap_or("").to_string(); let headers = request.headers().clone(); - let body = match axum::body::to_bytes(request.into_body(), 512 * 1024 * 1024).await { - Ok(b) => b, - Err(e) => { - return Err(AppError::BadRequest(format!( - "failed to read /v2/* request body: {}", - e - ))) + let normalized = normalize_subpath(&path); + + let ctx = match resolve_build_credential(&state, &headers) { + CredentialOutcome::Authenticated(ctx) => ctx, + CredentialOutcome::Missing => { + tracing::debug!(path = %normalized, "registry request without Authorization"); + return Ok(challenge_response( + StatusCode::UNAUTHORIZED, + "authentication required", + )); + } + CredentialOutcome::Malformed => { + tracing::debug!(path = %normalized, "registry request with malformed Authorization"); + return Ok(challenge_response( + StatusCode::UNAUTHORIZED, + "malformed Authorization header", + )); + } + CredentialOutcome::Rejected => { + tracing::warn!( + path = %normalized, + "registry request with unknown or invalid build credential" + ); + return Ok(challenge_response( + StatusCode::UNAUTHORIZED, + "invalid build credential", + )); } }; - let normalized = normalize_subpath(&path); - let response = forward(&state, method.clone(), &upstream, &normalized, &query, &headers, body) - .await?; + if let Some(repo) = parse_repo(&normalized) { + if !repo_allowed(&ctx, repo) { + tracing::warn!( + build_id = %ctx.build_id, + 
requested_repo = %repo, + expected_image_ref = %ctx.image_ref, + "registry credential used against unauthorised repository" + ); + return Ok(forbidden_response( + "credential is scoped to a different repository", + )); + } + } + else if normalized != "/v2/" { + tracing::warn!( + build_id = %ctx.build_id, + path = %normalized, + "registry credential used against non-repository endpoint" + ); + return Ok(forbidden_response( + "credential is not authorised for this endpoint", + )); + } + + let body_stream = request + .into_body() + .into_data_stream() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e)); + let upstream_body = reqwest::Body::wrap_stream(body_stream); + + let response = forward( + &state, + method.clone(), + &upstream, + &normalized, + &query, + &headers, + Some(upstream_body), + ) + .await?; // After a successful manifest PUT we mark the build as image-pushed so - // that the orchestrator stage proceeds. + // that the orchestrator stage proceeds. We only need the status — the + // manifest body itself is being streamed back to the client untouched. if method == Method::PUT && response.status().is_success() { if let Some(parsed) = parse_manifest_path(&normalized) { // tag carries either the buildID (preferred) or a digest. Pull the // build context by tag first, then fall back to no-op. 
if !parsed.tag.starts_with("sha256:") { tracing::info!( + build_id = %ctx.build_id, repo = %parsed.repo, tag = %parsed.tag, "manifest pushed; marking build as image-pushed" ); - state.services.templates.mark_image_pushed(&parsed.tag); + state + .services + .templates + .mark_image_pushed(&parsed.tag, &parsed.repo); } } } @@ -100,7 +209,7 @@ async fn forward( path: &str, query: &str, in_headers: &HeaderMap, - body: Bytes, + body: Option, ) -> AppResult { let upstream = upstream.trim_end_matches('/'); let path = if path.starts_with('/') { @@ -124,8 +233,8 @@ async fn forward( req = req.header(name.clone(), value.clone()); } - if !body.is_empty() { - req = req.body(body.to_vec()); + if let Some(body) = body { + req = req.body(body); } let upstream_resp = req.send().await.map_err(|e| { @@ -137,7 +246,7 @@ async fn forward( let mut headers = HeaderMap::new(); for (name, value) in upstream_resp.headers() { let key = name.as_str().to_ascii_lowercase(); - if HOP_BY_HOP.contains(&key.as_str()) || key == "content-length" { + if HOP_BY_HOP.contains(&key.as_str()) { continue; } if let (Ok(name), Ok(value)) = ( @@ -148,14 +257,14 @@ async fn forward( } } - let body_bytes = upstream_resp - .bytes() - .await - .map_err(|e| AppError::Internal(anyhow::anyhow!("registry response read failed: {}", e)))?; + let resp_stream = upstream_resp + .bytes_stream() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e)); + let resp_body = Body::from_stream(resp_stream); let mut response = Response::builder() .status(StatusCode::from_u16(status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY)) - .body(Body::from(body_bytes)) + .body(resp_body) .map_err(|e| AppError::Internal(anyhow::anyhow!("response build failed: {}", e)))?; *response.headers_mut() = headers; @@ -214,9 +323,164 @@ impl ManifestPath { } } +enum CredentialOutcome { + /// Header present, base64-decoded `user:pass` matches a live build + /// whose stored password equals the presented one. 
+ Authenticated(BuildContext), + /// No `Authorization` header at all. Triggers the standard + /// `WWW-Authenticate: Basic` challenge. + Missing, + /// Header present but not a valid `Basic ` envelope + /// (wrong scheme, bad base64, no colon, …). + Malformed, + /// Header is well-formed but the username is unknown, the build has + /// already been evicted, or the password does not match. + /// + /// Note: we deliberately do not distinguish "unknown user" from "bad + /// password" in the response, to avoid an enumeration oracle. The + /// internal log lines do record the difference for ops debugging. + Rejected, +} + +fn resolve_build_credential(state: &AppState, headers: &HeaderMap) -> CredentialOutcome { + let Some(raw) = headers.get(header::AUTHORIZATION) else { + return CredentialOutcome::Missing; + }; + let Ok(value) = raw.to_str() else { + return CredentialOutcome::Malformed; + }; + let Some(b64) = value + .strip_prefix("Basic ") + .or_else(|| value.strip_prefix("basic ")) + else { + return CredentialOutcome::Malformed; + }; + let Ok(decoded) = BASE64.decode(b64.trim()) else { + return CredentialOutcome::Malformed; + }; + let Ok(decoded_str) = std::str::from_utf8(&decoded) else { + return CredentialOutcome::Malformed; + }; + let Some((user, pass)) = decoded_str.split_once(':') else { + return CredentialOutcome::Malformed; + }; + + let Some(ctx) = state.services.builds.find_by_registry_username(user) else { + return CredentialOutcome::Rejected; + }; + + if !constant_time_eq_strings(pass, &ctx.credential.password) { + return CredentialOutcome::Rejected; + } + CredentialOutcome::Authenticated(ctx) +} + +fn constant_time_eq_strings(a: &str, b: &str) -> bool { + if a.is_empty() || b.is_empty() { + return false; + } + if a.len() != b.len() { + // Still walk the longer slice to keep the timing roughly stable. 
+ let longer = if a.len() > b.len() { a } else { b }; + let mut diff = 0u8; + for byte in longer.as_bytes() { + diff |= byte ^ 0; + } + let _ = diff; + return false; + } + let mut diff = 0u8; + for (x, y) in a.as_bytes().iter().zip(b.as_bytes()) { + diff |= x ^ y; + } + diff == 0 +} + +/// Extract the `` segment from any well-formed v2 distribution path +/// (`/v2//{blobs,manifests,tags,referrers}/...`). Returns `None` for +/// the bare ping (`/v2/`), for catalog endpoints, and for paths that don't +/// match the v2 layout at all. +fn parse_repo(path: &str) -> Option<&str> { + let stripped = path.strip_prefix("/v2/")?; + if stripped.is_empty() { + return None; + } + if stripped.starts_with('_') { + return None; + } + for verb in ["/manifests/", "/blobs/", "/tags/", "/referrers/"] { + if let Some(idx) = stripped.rfind(verb) { + if idx == 0 { + return None; + } + return Some(&stripped[..idx]); + } + } + None +} + +fn repo_allowed(ctx: &BuildContext, repo: &str) -> bool { + let Some(expected) = image_ref_repo(&ctx.image_ref) else { + return false; + }; + expected == repo +} + +fn image_ref_repo(image_ref: &str) -> Option { + let without_tag = image_ref.rsplit_once(':').map(|(l, _)| l).unwrap_or(image_ref); + // Drop everything up to and including the first `/`, which is the host. 
+ let (_, repo) = without_tag.split_once('/')?; + if repo.is_empty() { + return None; + } + Some(repo.to_string()) +} + +fn challenge_response(status: StatusCode, message: &str) -> Response { + let body = serde_json::to_vec(&ApiError::new(status.as_u16() as i32, message.to_string())) + .unwrap_or_default(); + let mut resp = Response::builder() + .status(status) + .body(Body::from(body)) + .expect("static challenge response is always well-formed"); + resp.headers_mut().insert( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + ); + resp.headers_mut().insert( + header::WWW_AUTHENTICATE, + HeaderValue::from_str(&format!("Basic realm=\"{}\"", REALM)) + .expect("REALM is ASCII"), + ); + resp.headers_mut().insert( + HeaderName::from_static("docker-distribution-api-version"), + HeaderValue::from_static("registry/2.0"), + ); + resp +} + +fn forbidden_response(message: &str) -> Response { + let body = serde_json::to_vec(&ApiError::new(403, message.to_string())).unwrap_or_default(); + let mut resp = Response::builder() + .status(StatusCode::FORBIDDEN) + .body(Body::from(body)) + .expect("static forbidden response is always well-formed"); + resp.headers_mut().insert( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + ); + resp.headers_mut().insert( + HeaderName::from_static("docker-distribution-api-version"), + HeaderValue::from_static("registry/2.0"), + ); + resp +} + #[cfg(test)] mod tests { use super::*; + use crate::models::CreateTemplateRequest; + use crate::services::builds::{BuildRegistry, EvictionPolicy}; #[test] fn parse_manifest_path_accepts_namespaced_repo() { @@ -237,4 +501,139 @@ mod tests { assert_eq!(normalize_subpath("/foo/bar"), "/v2/foo/bar"); assert_eq!(normalize_subpath("/v2/foo/bar"), "/v2/foo/bar"); } + + // ── repo / image_ref helpers ───────────────────────────────────── + + #[test] + fn parse_repo_extracts_namespaced_repo_from_each_verb() { + for path in [ + "/v2/e2b/tpl-abc/manifests/bld-001", + 
"/v2/e2b/tpl-abc/blobs/sha256:abc", + "/v2/e2b/tpl-abc/blobs/uploads/uuid-123", + "/v2/e2b/tpl-abc/tags/list", + "/v2/e2b/tpl-abc/referrers/sha256:abc", + ] { + assert_eq!( + parse_repo(path), + Some("e2b/tpl-abc"), + "parse_repo failed for {}", path + ); + } + } + + #[test] + fn parse_repo_rejects_non_repo_endpoints() { + assert_eq!(parse_repo("/v2/"), None); + assert_eq!(parse_repo("/v2/_catalog"), None); + assert_eq!(parse_repo("/v2/manifests/foo"), None); + assert_eq!(parse_repo("foo/bar"), None); + } + + #[test] + fn image_ref_repo_strips_host_and_tag() { + assert_eq!( + image_ref_repo("127.0.0.1:5000/e2b/tpl-abc:bld-deadbeef").as_deref(), + Some("e2b/tpl-abc") + ); + assert_eq!( + image_ref_repo("registry.example.com/e2b/tpl-abc").as_deref(), + Some("e2b/tpl-abc") + ); + } + + #[test] + fn repo_allowed_rejects_prefix_collisions() { + let mut ctx = sample_context(); + ctx.image_ref = "127.0.0.1:5000/e2b/tpl-abc:bld-001".to_string(); + assert!(repo_allowed(&ctx, "e2b/tpl-abc")); + assert!(!repo_allowed(&ctx, "e2b/tpl-abc-evil")); + assert!(!repo_allowed(&ctx, "evil/tpl-abc")); + } + + #[test] + fn constant_time_eq_strings_basic_correctness() { + assert!(constant_time_eq_strings("abc", "abc")); + assert!(!constant_time_eq_strings("abc", "abd")); + assert!(!constant_time_eq_strings("abc", "abcd")); + assert!(!constant_time_eq_strings("", "")); + assert!(!constant_time_eq_strings("", "abc")); + assert!(!constant_time_eq_strings("abc", "")); + } + + // ── credential resolution against an in-memory BuildRegistry ───── + + fn sample_request() -> CreateTemplateRequest { + CreateTemplateRequest { + template_id: String::new(), + instance_type: None, + alias: None, + team_id: None, + image: None, + dockerfile: None, + writable_layer_size: None, + exposed_ports: None, + probe_port: None, + probe_path: None, + cpu: None, + memory: None, + cpu_count: None, + memory_mb: None, + env: None, + env_vars: None, + allow_internet_access: None, + network_type: None, + nodes: None, + 
registry_username: None, + registry_password: None, + command: None, + args: None, + dns: None, + allow_out: None, + deny_out: None, + start_cmd: None, + ready_cmd: None, + } + } + + fn sample_context() -> BuildContext { + let reg = BuildRegistry::with_policy(EvictionPolicy::unbounded()); + let cred = crate::models::RegistryCredential { + url: "http://127.0.0.1:5000".to_string(), + repository: "e2b/tpl-abc".to_string(), + username: "bld_test_user".to_string(), + password: "bld_test_pass_secret".to_string(), + }; + reg.create( + "tpl-abc".to_string(), + sample_request(), + cred, + "127.0.0.1:5000/e2b/tpl-abc:bld".to_string(), + ) + } + + #[test] + fn build_registry_indexes_credential_username() { + let reg = BuildRegistry::with_policy(EvictionPolicy::unbounded()); + let cred = crate::models::RegistryCredential { + url: "http://127.0.0.1:5000".to_string(), + repository: "e2b/tpl-x".to_string(), + username: "bld_unique_user".to_string(), + password: "secret".to_string(), + }; + let ctx = reg.create( + "tpl-x".to_string(), + sample_request(), + cred, + "127.0.0.1:5000/e2b/tpl-x:bld".to_string(), + ); + let resolved = reg.find_by_registry_username("bld_unique_user").unwrap(); + assert_eq!(resolved.build_id, ctx.build_id); + assert!(reg.find_by_registry_username("bld_other_user").is_none()); + } + + #[test] + fn parse_manifest_tag_uses_build_id_after_credential_check() { + let m = parse_manifest_path("/v2/e2b/tpl-abc/manifests/bld-deadbeef").unwrap(); + assert_eq!(m.tag, "bld-deadbeef"); + } } diff --git a/CubeAPI/src/handlers/templates_v3.rs b/CubeAPI/src/handlers/templates_v3.rs index 4319765b8..21bc94738 100644 --- a/CubeAPI/src/handlers/templates_v3.rs +++ b/CubeAPI/src/handlers/templates_v3.rs @@ -30,6 +30,26 @@ pub async fn v3_create_template( /// SDK before uploading build context tarballs. We always answer /// `present=true` because the current CubeMaster pipeline only consumes /// `from_image` references (no Dockerfile-from-context build yet). 
+/// +/// ### Why `201 Created` on a successful GET? +/// +/// ref: https://github.com/e2b-dev/infra/blob/db88eee0fd5df4a5c90e544faa5c7b44c6719b51/packages/api/internal/handlers/template_layer_files_upload.go#L71 +/// This is intentional and matches the upstream E2B Infra contract: the same +/// endpoint is overloaded as both a *cache probe* and an *upload-slot +/// allocator*. On cache miss the server returns `201 Created` together with +/// a freshly minted presigned upload URL; on cache hit it returns the same +/// `201` without a URL so the SDK can branch purely on the `present` flag +/// without also having to discriminate by status code. Several E2B SDK +/// versions hard-code this: anything other than `2xx` is treated as a +/// fatal error, and at least the JS SDK additionally asserts on `201` for +/// the upload-allocator branch. +/// +/// Switching to `200 OK` here would be more REST-correct, but it would +/// silently break SDK clients in the wild that still do +/// `if (status !== 201) throw ...`. Until we either own all client paths +/// or upstream relaxes the contract, we stick with `201` and pin it via +/// the `v3_template_build_routes_are_reachable` route test in +/// `routes.rs` so it can't drift unnoticed. pub async fn v3_get_files_hash( State(state): State, Path((template_id, hash)): Path<(String, String)>, diff --git a/CubeAPI/src/middleware/rate_limit.rs b/CubeAPI/src/middleware/rate_limit.rs index f399d9d86..cde895d0e 100644 --- a/CubeAPI/src/middleware/rate_limit.rs +++ b/CubeAPI/src/middleware/rate_limit.rs @@ -5,10 +5,13 @@ use crate::error::AppError; use crate::state::AppState; use axum::{ - extract::{Request, State}, + extract::{ConnectInfo, Request, State}, + http::header, middleware::Next, response::Response, }; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; +use std::net::SocketAddr; /// Per-API-key token bucket rate limiter middleware. /// Reads the X-API-Key header and checks the shared governor limiter. 
@@ -33,3 +36,107 @@ pub async fn rate_limit( )), } } + +/// Rate-limit middleware specialised for the OCI registry reverse-proxy. +/// +/// Docker / oci-distribution clients do **not** send `X-API-Key`; they +/// authenticate with `Authorization: Basic <base64(user:pass)>` instead. The +/// generic `rate_limit` middleware would therefore collapse every docker +/// client onto the single "anonymous" bucket, which is unusable: a +/// runaway client could lock every other operator out of pushing layers. +/// +/// We pick a key in this priority order: +/// +/// 1. `Authorization: Basic` username (i.e. the per-build `bld_<…>` +/// token we minted in `mint_registry_credential`). One bucket per +/// build is the natural granularity — a misbehaving build +/// doesn't impact others. +/// 2. Peer socket address (`ConnectInfo`). Catches the unauthenticated +/// `GET /v2/` ping flood and any other anonymous traffic. +/// 3. The literal string `"reg:anonymous"` as the absolute fallback, +/// should `ConnectInfo` somehow be missing. +/// +/// All keys are prefixed with `reg:` so they live in a disjoint key space +/// from the sandbox API's `X-API-Key` buckets — a sandbox abuser cannot +/// starve the registry path and vice versa, even though both share the +/// same governor instance and quota. +pub async fn registry_rate_limit( + State(state): State, + request: Request, + next: Next, +) -> Result { + let key = registry_key_for(&request); + + match state.rate_limiter.check_key(&key) { + Ok(_) => Ok(next.run(request).await), + Err(_) => Err(AppError::TooManyRequests( + "Registry rate limit exceeded for this credential.
Slow down.".to_string(), + )), + } +} + +fn registry_key_for(request: &Request) -> String { + if let Some(user) = basic_auth_username(request) { + return format!("reg:user:{}", user); + } + if let Some(ConnectInfo(addr)) = request.extensions().get::>() { + return format!("reg:ip:{}", addr.ip()); + } + "reg:anonymous".to_string() +} + +fn basic_auth_username(request: &Request) -> Option { + let raw = request.headers().get(header::AUTHORIZATION)?.to_str().ok()?; + let b64 = raw + .strip_prefix("Basic ") + .or_else(|| raw.strip_prefix("basic "))?; + let decoded = BASE64.decode(b64.trim()).ok()?; + let s = std::str::from_utf8(&decoded).ok()?; + let (user, _pass) = s.split_once(':')?; + if user.is_empty() { + return None; + } + Some(user.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::body::Body; + use axum::http::{HeaderValue, Request as HttpRequest}; + + fn req_with_auth(value: Option<&str>) -> Request { + let mut builder = HttpRequest::builder().uri("/v2/foo/blobs/sha256:abc"); + if let Some(v) = value { + builder = builder.header(header::AUTHORIZATION, HeaderValue::from_str(v).unwrap()); + } + builder.body(Body::empty()).unwrap() + } + + #[test] + fn registry_key_uses_basic_username_when_present() { + let r = req_with_auth(Some("Basic YmxkX3VzZXI6c2VjcmV0")); + assert_eq!(registry_key_for(&r), "reg:user:bld_user"); + } + + #[test] + fn registry_key_falls_back_to_anonymous_without_connect_info() { + let r = req_with_auth(None); + assert_eq!(registry_key_for(&r), "reg:anonymous"); + } + + #[test] + fn registry_key_ignores_malformed_authorization() { + let r = req_with_auth(Some("Bearer some-token")); + assert_eq!(registry_key_for(&r), "reg:anonymous"); + + let r = req_with_auth(Some("Basic !!!not-base64!!!")); + assert_eq!(registry_key_for(&r), "reg:anonymous"); + + let r = req_with_auth(Some("Basic bm9jb2xvbg==")); + assert_eq!(registry_key_for(&r), "reg:anonymous"); + + let r = req_with_auth(Some("Basic OnB3")); + 
assert_eq!(registry_key_for(&r), "reg:anonymous"); + } +} diff --git a/CubeAPI/src/models/mod.rs b/CubeAPI/src/models/mod.rs index bf3bef242..acfc992b5 100644 --- a/CubeAPI/src/models/mod.rs +++ b/CubeAPI/src/models/mod.rs @@ -591,7 +591,17 @@ pub struct CreateTemplateRequest { pub start_cmd: Option, /// E2B-style `readyCmd`: shell command used as readiness probe. - /// Translated into a CubeMaster `Probe.Exec` when `probe_port` is empty. + /// + /// **Not** forwarded to the container as a shell command — neither + /// CubeMaster nor Cubelet support `Probe.Exec`, so we cannot run an + /// arbitrary shell snippet as a readiness check end-to-end. Instead + /// `services/templates.rs::v3_trigger_build` performs a best-effort + /// parse of an `http(s)://<host>:<port>[/<path>]` URL embedded in the + /// snippet (the shape produced by the e2b SDK's `wait_for_url(...)`) + /// and synthesises a CubeMaster `Probe.HttpGet` from it. If no URL + /// can be extracted (or no `probe_port` is supplied alongside), the + /// `readyCmd` is recorded in the build log only and **no probe is + /// emitted** — Cubelet treats that as "no readiness check". #[serde(rename = "readyCmd", alias = "ready_cmd", default)] pub ready_cmd: Option, } @@ -905,9 +915,24 @@ pub struct V2TemplateBuildStart { #[serde(rename = "fromImageRegistry", default)] pub from_image_registry: Option, /// Reuse another already-built CubeSandbox template as the base. + /// + /// **Currently rejected with `501 Not Implemented`** by + /// `services/templates.rs::v3_trigger_build`: the downstream stack + /// (CubeMaster `template_image.go` → `docker pull`) has no resolver + /// for the `cube://` source scheme, so honouring this + /// field at the API layer would only push the failure into the build + /// worker as an opaque `invalid reference format`. Resolve the parent + /// template to a concrete OCI reference and pass it via `fromImage` + /// instead, until the resolver lands end-to-end.
#[serde(rename = "fromTemplate", default)] pub from_template: Option, - /// E2B `readyCmd` — translated into CubeMaster `Probe.Exec`. + /// E2B `readyCmd` — best-effort translated into a CubeMaster + /// `Probe.HttpGet` by extracting the `http(s)://host:port[/path]` + /// URL embedded in the snippet (CubeMaster/Cubelet don't support + /// `Probe.Exec`, so the shell snippet itself is *not* run). When no + /// URL can be parsed and the V3 body doesn't carry `probePort` / + /// `exposedPorts`, no probe is emitted — see + /// `services/templates.rs::parse_ready_url` and `build_probe`. #[serde(rename = "readyCmd", default)] pub ready_cmd: Option, /// E2B `startCmd` — translated into container `args`. diff --git a/CubeAPI/src/routes.rs b/CubeAPI/src/routes.rs index 7402fc5f9..9b6c997ec 100644 --- a/CubeAPI/src/routes.rs +++ b/CubeAPI/src/routes.rs @@ -22,7 +22,10 @@ use crate::{ agenthub, cluster, config, health, registry, sandboxes, snapshots, store, templates, templates_v3, }, - middleware::{auth::unified_auth, rate_limit::rate_limit}, + middleware::{ + auth::unified_auth, + rate_limit::{rate_limit, registry_rate_limit}, + }, state::AppState, }; @@ -175,9 +178,23 @@ fn build_template_routes(state: &AppState, auth_configured: bool) -> Router Router Router Router { +fn build_registry_router(state: &AppState) -> Router { use axum::routing::{any, get}; + // Registry routes deliberately do NOT go through `unified_auth`. The OCI + // distribution v2 protocol uses a dedicated credential domain (Basic auth + // against the per-build push token we minted in `mint_registry_credential`), + // and that validation lives inside `registry::proxy` itself — it must run + // *after* the docker client's two-step `GET /v2/` → `WWW-Authenticate` → + // retry-with-Basic handshake, which `unified_auth` would short-circuit. + // + // The reverse-proxy is, however, attached to a per-build / + // per-source-IP rate-limit bucket so a single misbehaving CLI cannot + // saturate the upstream. 
See `registry_rate_limit` for the keying rules. Router::new() .route("/v2/", get(registry::ping)) .route("/v2", get(registry::ping)) .route("/v2/*path", any(registry::proxy)) + .layer(middleware::from_fn_with_state( + state.clone(), + registry_rate_limit, + )) } fn with_auth( @@ -594,4 +641,122 @@ mod tests { let fb: serde_json::Value = r.json(); assert_eq!(fb["present"].as_bool(), Some(true)); } + + /// Spin up a minimal in-process auth callback that always returns 200, + /// returning the absolute URL (`http://127.0.0.1:<port>/`) plus a + /// JoinHandle for the tokio task running it. Used by the rate-limit + /// regression tests below: `with_auth_and_rate_limit` only attaches + /// the rate-limiter when `auth_callback_url` is configured, so we + /// need a real, reachable callback to exercise the production layer + /// stack. + async fn spawn_always_200_auth_server() -> (String, tokio::task::JoinHandle<()>) { + use axum::routing::post; + let app = axum::Router::new().route("/", post(|| async { axum::http::StatusCode::OK })); + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("bind mock auth server"); + let addr = listener.local_addr().expect("addr"); + let url = format!("http://{}/", addr); + let handle = tokio::spawn(async move { + let _ = axum::serve(listener, app).await; + }); + (url, handle) + } + + #[tokio::test] + async fn v3_build_pipeline_routes_are_rate_limited() { + let (auth_url, _auth_handle) = spawn_always_200_auth_server().await; + + let mut config = ServerConfig::default(); + config.cubemaster_url = "http://127.0.0.1:9".to_string(); + config.auth_callback_url = Some(auth_url); + config.rate_limit_per_sec = 1; + + let state = AppState::new(config, arc(NoopLogger)).await; + let server = TestServer::new(build_router(state)).expect("router should build"); + + let mut rate_limited = 0usize; + let mut passed = 0usize; + for _ in 0..20 { + let resp = server + .post("/v3/templates") + .add_header( +
axum::http::HeaderName::from_static("x-api-key"), + axum::http::HeaderValue::from_static("abuser"), + ) + .json(&serde_json::json!({ + "name": "my-tpl:dev", + "cpuCount": 1, + "memoryMB": 1024, + })) + .await; + match resp.status_code() { + StatusCode::TOO_MANY_REQUESTS => rate_limited += 1, + code if code.is_success() => passed += 1, + _ => passed += 1, + } + } + + assert!( + rate_limited > 0, + "expected at least one POST /v3/templates to be 429-throttled \ + with rate_limit_per_sec=1; got {} successes and 0 throttled \ + responses across 20 requests — the rate-limit middleware is \ + not attached to V3 build pipeline routes", + passed, + ); + assert!( + passed > 0, + "expected at least one POST /v3/templates to pass the gate \ + (the very first burst token); got {} 429s and 0 passes — \ + auth or rate-limit is mis-configured in the test harness", + rate_limited, + ); + } + + /// Companion: plain template *management* endpoints (list / get / + /// CRUD) explicitly stay on `with_auth` rather than + /// `with_auth_and_rate_limit`, because they forward directly to + /// CubeMaster's CRUD layer and don't allocate in-process build + /// state. Pinning that boundary here so a future "let's just rate + /// limit everything" change can't silently regress operator + /// workflows that legitimately list templates faster than the + /// shared governor allows. + #[tokio::test] + async fn template_management_routes_are_not_rate_limited() { + let (auth_url, _auth_handle) = spawn_always_200_auth_server().await; + + let mut config = ServerConfig::default(); + config.cubemaster_url = "http://127.0.0.1:9".to_string(); + config.auth_callback_url = Some(auth_url); + config.rate_limit_per_sec = 1; + + let state = AppState::new(config, arc(NoopLogger)).await; + let server = TestServer::new(build_router(state)).expect("router should build"); + + // Burst the same 20 requests against the management surface. 
+ // Even with quota=1 we should never see 429 here, because the + // rate-limit layer is not attached to this lane. + let mut saw_throttle = false; + for _ in 0..20 { + let resp = server + .get("/templates") + .add_header( + axum::http::HeaderName::from_static("x-api-key"), + axum::http::HeaderValue::from_static("abuser"), + ) + .await; + if resp.status_code() == StatusCode::TOO_MANY_REQUESTS { + saw_throttle = true; + break; + } + } + + assert!( + !saw_throttle, + "GET /templates is on the auth-only lane and must NOT be \ + rate-limited; observing 429 here would mean the management \ + sub-router was accidentally folded into with_auth_and_rate_limit" + ); + } } \ No newline at end of file diff --git a/CubeAPI/src/services/builds.rs b/CubeAPI/src/services/builds.rs index 34ffd2d05..6bdc1fa84 100644 --- a/CubeAPI/src/services/builds.rs +++ b/CubeAPI/src/services/builds.rs @@ -15,14 +15,36 @@ //! - the CubeMaster `jobID` once the build is dispatched, used by every //! subsequent status / logs lookup. //! -//! The store is in-memory + bounded; restart of CubeAPI invalidates inflight -//! builds. This is acceptable for a build flow that always reaches a terminal -//! state within minutes — durable persistence can be added later as a separate -//! storage trait without changing the call sites. +//! ## Eviction +//! +//! The registry is bounded by **two complementary policies** so a long-running +//! CubeAPI process can't accumulate completed builds forever: +//! +//! 1. **TTL on terminal builds** — when a build transitions into +//! `BuildStage::Ready` / `BuildStage::Error`, we stamp `terminal_at` and +//! push it onto an ordered FIFO. A background tokio task wakes up every +//! `gc_interval` and pops everything past `terminal_ttl`. In-flight builds +//! (`WaitingPush`, `Building`) are never evicted by TTL. +//! +//! 2. **Hard size cap** — `create()` checks the cap and synchronously evicts +//! 
the oldest terminal builds FIFO until the live count is at or below the +//! cap. If every entry is still in-flight, we log a warning and let the +//! cap be exceeded rather than killing an active build mid-flight. +//! +//! Both knobs come from `ServerConfig::build_registry_*` and default to +//! `(ttl=1h, cap=5000, gc_interval=5min)`. Setting any of them to `0` +//! disables that specific protection. +//! +//! Restart of CubeAPI invalidates inflight builds. This is acceptable for a +//! build flow that always reaches a terminal state within minutes — durable +//! persistence can be added later as a separate storage trait without +//! changing the call sites. -use chrono::{DateTime, Utc}; +use chrono::{DateTime, Duration, Utc}; use dashmap::DashMap; -use std::sync::Arc; +use std::collections::VecDeque; +use std::sync::{Arc, Mutex}; +use std::time::Duration as StdDuration; use uuid::Uuid; use crate::models::{CreateTemplateRequest, RegistryCredential}; @@ -50,6 +72,12 @@ impl BuildStage { BuildStage::Error => "error", } } + + /// `Ready` and `Error` are absorbing states — the orchestrator pipeline + /// will not move out of them. Used as the gate for TTL eviction. + pub fn is_terminal(self) -> bool { + matches!(self, BuildStage::Ready | BuildStage::Error) + } } #[derive(Debug, Clone)] @@ -66,6 +94,22 @@ pub struct BuildContext { pub credential: RegistryCredential, /// Image reference CubeMaster will pull from once the client has pushed. pub image_ref: String, + /// Authoritative "the client has actually completed an OCI manifest + /// PUT against `image_ref`" flag. Set **exclusively** by + /// `TemplateService::mark_image_pushed` after both: + /// + /// - the manifest's `repo` segment matches the one we minted at + /// create time (cross-check guarding against tag collisions), and + /// - the upstream registry returned a `2xx` for the PUT. 
+ /// + /// Consumers (especially `v3_trigger_build`) MUST gate the + /// "fall back to `ctx.image_ref` as the source image" branch on this + /// field, not on `stage`. `image_ref` is *predicted* at create time + /// and its non-emptiness alone does not prove anything was pushed; + /// `stage` is also an indirect proxy that the v2/v3 dispatch paths + /// mutate for unrelated reasons. This boolean is the only safe + /// correctness signal. + pub image_pushed: bool, /// CubeMaster `jobID` — empty until the build is actually dispatched. pub job_id: String, /// Append-only log lines (timestamps + plain message). @@ -74,6 +118,9 @@ pub struct BuildContext { pub progress: i32, pub message: String, pub created_at: DateTime, + /// Wall-clock time at which `stage` first became terminal. `None` while + /// the build is still in-flight. Drives the TTL-based eviction path. + pub terminal_at: Option>, // ── V3 protocol-only metadata (populated by POST /v3/templates) ──────── /// Template name (E2B `name`), e.g. "my-template" or "my-template:v1". @@ -95,10 +142,62 @@ pub struct BuildLogLine { pub line: String, } +#[derive(Debug, Clone, Copy)] +pub struct EvictionPolicy { + /// How long a terminal build is kept after reaching Ready/Error. + /// `None` disables TTL-based eviction. + pub terminal_ttl: Option, + /// Hard cap on the number of distinct builds; `None` disables the cap. + pub max_entries: Option, + /// Background GC scan interval; `None` disables the background task + /// (size-cap eviction at create-time still runs). 
+ pub gc_interval: Option, +} + +impl EvictionPolicy { + pub fn from_config(cfg: &crate::config::ServerConfig) -> Self { + Self { + terminal_ttl: (cfg.build_registry_terminal_ttl_secs > 0) + .then(|| Duration::seconds(cfg.build_registry_terminal_ttl_secs as i64)), + max_entries: (cfg.build_registry_max_entries > 0) + .then_some(cfg.build_registry_max_entries), + gc_interval: (cfg.build_registry_gc_interval_secs > 0) + .then(|| StdDuration::from_secs(cfg.build_registry_gc_interval_secs)), + } + } + + pub fn unbounded() -> Self { + Self { + terminal_ttl: None, + max_entries: None, + gc_interval: None, + } + } +} + +/// One entry on the FIFO of terminal builds awaiting eviction. We keep the +/// `template_id` here so the GC path can clear *both* index keys +/// (`bid` and `tid::bid`) without a round-trip through the DashMap. +#[derive(Debug, Clone)] +struct TerminalEntry { + build_id: String, + template_id: String, + terminal_at: DateTime, +} + /// Thread-safe, in-process build registry. -#[derive(Clone, Default)] +#[derive(Clone)] pub struct BuildRegistry { inner: Arc>, + username_index: Arc>, + terminal: Arc>>, + policy: EvictionPolicy, +} + +impl Default for BuildRegistry { + fn default() -> Self { + Self::with_policy(EvictionPolicy::unbounded()) + } } impl BuildRegistry { @@ -106,6 +205,42 @@ impl BuildRegistry { Self::default() } + pub fn with_policy(policy: EvictionPolicy) -> Self { + Self { + inner: Arc::new(DashMap::new()), + username_index: Arc::new(DashMap::new()), + terminal: Arc::new(Mutex::new(VecDeque::new())), + policy, + } + } + + /// Spawn the background TTL GC task. Idempotent in the sense that calling + /// it twice will spawn two tasks — call exactly once from `AppServices` + /// construction. Returns `None` when GC is disabled (`gc_interval = 0`), + /// which is convenient for unit tests. 
+ pub fn spawn_gc(&self) -> Option> { + let interval = self.policy.gc_interval?; + let registry = self.clone(); + let handle = tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + // Skip the immediate firing — let the process settle first. + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + ticker.tick().await; + loop { + ticker.tick().await; + let evicted = registry.evict_expired(Utc::now()); + if evicted > 0 { + tracing::debug!( + evicted, + live = registry.inner.len(), + "build registry GC swept terminal builds" + ); + } + } + }); + Some(handle) + } + /// Register a brand-new build attempt. Returns the freshly allocated /// build_id alongside the stored context (cloned for read-only use by the /// caller). @@ -123,12 +258,14 @@ impl BuildRegistry { create_request: Arc::new(request), credential, image_ref, + image_pushed: false, job_id: String::new(), logs: Vec::new(), stage: BuildStage::WaitingPush, progress: 0, message: "build registered, waiting for image push".to_string(), created_at: Utc::now(), + terminal_at: None, name: String::new(), tags: Vec::new(), cpu_count: 0, @@ -136,12 +273,23 @@ impl BuildRegistry { aliases: Vec::new(), }; - // Index under both bid and (tid, bid) so lookups by either key work. self.inner.insert(build_id.clone(), ctx.clone()); self.inner.insert(compose_key(&template_id, &build_id), ctx.clone()); + + let uname = ctx.credential.username.clone(); + if !uname.is_empty() { + self.username_index.insert(uname, build_id.clone()); + } + self.enforce_size_cap(); + ctx } + pub fn find_by_registry_username(&self, username: &str) -> Option { + let bid = self.username_index.get(username)?.value().clone(); + self.get(&bid) + } + pub fn get(&self, build_id: &str) -> Option { self.inner.get(build_id).map(|r| r.value().clone()) } @@ -153,13 +301,29 @@ impl BuildRegistry { .map(|r| r.value().clone()) } - /// Apply a mutation to a build context. Updates both index entries. 
+ /// Apply a mutation to a build context. Updates both index entries and, + /// if the closure transitions the build into a terminal stage, stamps + /// `terminal_at` and queues the build for TTL eviction. pub fn update(&self, build_id: &str, mutate: F) -> Option where F: FnOnce(&mut BuildContext), { let mut ctx = self.inner.get(build_id).map(|r| r.value().clone())?; + let was_terminal = ctx.stage.is_terminal(); mutate(&mut ctx); + let now_terminal = ctx.stage.is_terminal(); + + if !was_terminal && now_terminal { + let stamp = Utc::now(); + ctx.terminal_at = Some(stamp); + self.push_terminal(TerminalEntry { + build_id: ctx.build_id.clone(), + template_id: ctx.template_id.clone(), + terminal_at: stamp, + }); + } else if was_terminal && !now_terminal { + ctx.terminal_at = None; + } let pair_key = compose_key(&ctx.template_id, &ctx.build_id); self.inner.insert(build_id.to_string(), ctx.clone()); @@ -167,7 +331,6 @@ impl BuildRegistry { Some(ctx) } - /// Append one log line. Truncates the head to bound memory at ~10k lines. pub fn append_log(&self, build_id: &str, line: impl Into) { let line = line.into(); self.update(build_id, |ctx| { @@ -182,8 +345,304 @@ impl BuildRegistry { } }); } + + /// Drop every terminal build whose `terminal_at + ttl <= now`. + /// + /// Returns the number of *logical* builds (not index entries) removed. + /// Exposed `pub(crate)` so tests can drive the GC deterministically + /// without spinning up the background task. + pub(crate) fn evict_expired(&self, now: DateTime) -> usize { + let Some(ttl) = self.policy.terminal_ttl else { + return 0; + }; + let cutoff = now - ttl; + let mut removed = 0usize; + + loop { + let entry = { + let mut q = self.terminal.lock().expect("terminal queue poisoned"); + match q.front() { + Some(e) if e.terminal_at <= cutoff => q.pop_front().unwrap(), + _ => break, + } + }; + + if self.try_evict_one(&entry) { + removed += 1; + } + } + + removed + } + + /// Drive the size cap. 
Intended to be called right after `create()`. + /// Walks the terminal FIFO and evicts oldest entries until either the + /// live build count is at or below `max_entries`, or the FIFO is empty. + fn enforce_size_cap(&self) { + let Some(cap) = self.policy.max_entries else { + return; + }; + let mut live = self.inner.len() / 2; + if live <= cap { + return; + } + + loop { + if live <= cap { + return; + } + let entry = { + let mut q = self.terminal.lock().expect("terminal queue poisoned"); + match q.pop_front() { + Some(e) => e, + None => break, + } + }; + if self.try_evict_one(&entry) { + live = live.saturating_sub(1); + } + } + + if self.inner.len() / 2 > cap { + tracing::warn!( + cap, + live = self.inner.len() / 2, + "build registry exceeds max_entries but every remaining build is in-flight; \ + not evicting active builds. Increase build_registry_max_entries or wait \ + for in-flight builds to terminate." + ); + } + } + + /// Remove both index entries for one terminal build. + /// Returns `true` if anything was actually removed. + /// A `false` return covers two benign races: + /// - the build was already evicted (e.g. via duplicate FIFO entry), + /// - the build was un-set back to non-terminal (we refuse to drop + /// in-flight contexts here — TTL eviction is for terminal builds + /// only). 
+ fn try_evict_one(&self, entry: &TerminalEntry) -> bool { + let still_terminal = self + .inner + .get(&entry.build_id) + .map(|r| r.value().stage.is_terminal()) + .unwrap_or(false); + if !still_terminal { + return false; + } + let username = self + .inner + .get(&entry.build_id) + .map(|r| r.value().credential.username.clone()) + .unwrap_or_default(); + let removed_bid = self.inner.remove(&entry.build_id).is_some(); + let removed_pair = self + .inner + .remove(&compose_key(&entry.template_id, &entry.build_id)) + .is_some(); + if !username.is_empty() { + self.username_index + .remove_if(&username, |_, v| v == &entry.build_id); + } + removed_bid || removed_pair + } + + fn push_terminal(&self, entry: TerminalEntry) { + if self.policy.terminal_ttl.is_none() && self.policy.max_entries.is_none() { + return; + } + let mut q = self.terminal.lock().expect("terminal queue poisoned"); + q.push_back(entry); + } + + #[cfg(test)] + fn terminal_queue_len(&self) -> usize { + self.terminal + .lock() + .expect("terminal queue poisoned") + .len() + } + + #[cfg(test)] + pub(crate) fn live_count(&self) -> usize { + self.inner.len() / 2 + } } fn compose_key(template_id: &str, build_id: &str) -> String { format!("{}::{}", template_id, build_id) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::CreateTemplateRequest; + + fn empty_request() -> CreateTemplateRequest { + CreateTemplateRequest { + template_id: String::new(), + instance_type: None, + alias: None, + team_id: None, + image: None, + dockerfile: None, + writable_layer_size: None, + exposed_ports: None, + probe_port: None, + probe_path: None, + cpu: None, + memory: None, + cpu_count: None, + memory_mb: None, + env: None, + env_vars: None, + allow_internet_access: None, + network_type: None, + nodes: None, + registry_username: None, + registry_password: None, + command: None, + args: None, + dns: None, + allow_out: None, + deny_out: None, + start_cmd: None, + ready_cmd: None, + } + } + + fn empty_credential() -> 
RegistryCredential { + RegistryCredential { + url: "http://127.0.0.1:5000".to_string(), + repository: "e2b/tpl".to_string(), + username: "_token".to_string(), + password: "secret".to_string(), + } + } + + fn make_registry(ttl_secs: i64, cap: usize) -> BuildRegistry { + BuildRegistry::with_policy(EvictionPolicy { + terminal_ttl: (ttl_secs > 0).then(|| Duration::seconds(ttl_secs)), + max_entries: (cap > 0).then_some(cap), + gc_interval: None, + }) + } + + fn create_one(reg: &BuildRegistry, tid: &str) -> String { + reg.create( + tid.to_string(), + empty_request(), + empty_credential(), + format!("127.0.0.1:5000/e2b/{}:bld", tid), + ) + .build_id + } + + fn mark_ready(reg: &BuildRegistry, bid: &str) { + reg.update(bid, |c| c.stage = BuildStage::Ready) + .expect("build present"); + } + + #[test] + fn terminal_transition_stamps_terminal_at_and_enqueues() { + let reg = make_registry(3600, 0); + let bid = create_one(&reg, "tpl-a"); + assert_eq!(reg.terminal_queue_len(), 0); + + mark_ready(&reg, &bid); + let ctx = reg.get(&bid).unwrap(); + assert!(ctx.terminal_at.is_some(), "terminal_at must be set"); + assert!(ctx.stage.is_terminal()); + assert_eq!(reg.terminal_queue_len(), 1); + } + + #[test] + fn duplicate_terminal_updates_do_not_grow_the_fifo() { + let reg = make_registry(3600, 0); + let bid = create_one(&reg, "tpl-a"); + mark_ready(&reg, &bid); + for _ in 0..5 { + reg.update(&bid, |c| c.message = "noise".to_string()); + } + assert_eq!( + reg.terminal_queue_len(), + 1, + "FIFO must dedupe rising-edge transitions" + ); + } + + #[test] + fn evict_expired_drops_terminal_builds_past_ttl() { + let reg = make_registry(60, 0); + let bid_a = create_one(&reg, "tpl-a"); + let bid_b = create_one(&reg, "tpl-b"); + mark_ready(&reg, &bid_a); + mark_ready(&reg, &bid_b); + + assert_eq!(reg.evict_expired(Utc::now() + Duration::seconds(30)), 0); + assert_eq!(reg.live_count(), 2); + + let removed = reg.evict_expired(Utc::now() + Duration::seconds(120)); + assert_eq!(removed, 2); + assert_eq!(reg.live_count(),
0); + assert!(reg.get(&bid_a).is_none()); + assert!(reg.get(&bid_b).is_none()); + assert_eq!(reg.terminal_queue_len(), 0); + } + + #[test] + fn evict_expired_leaves_in_flight_builds_alone() { + let reg = make_registry(60, 0); + let bid_done = create_one(&reg, "tpl-done"); + let bid_live = create_one(&reg, "tpl-live"); + mark_ready(&reg, &bid_done); + + let removed = reg.evict_expired(Utc::now() + Duration::seconds(120)); + assert_eq!(removed, 1); + assert!(reg.get(&bid_done).is_none(), "terminal build evicted"); + assert!(reg.get(&bid_live).is_some(), "in-flight build retained"); + } + + #[test] + fn size_cap_evicts_oldest_terminal_first_at_create_time() { + let reg = make_registry(0, 2); // cap = 2, no TTL. + let bid_a = create_one(&reg, "tpl-a"); + let bid_b = create_one(&reg, "tpl-b"); + mark_ready(&reg, &bid_a); + mark_ready(&reg, &bid_b); + + let _bid_c = create_one(&reg, "tpl-c"); + assert!(reg.get(&bid_a).is_none(), "oldest terminal evicted"); + assert!(reg.get(&bid_b).is_some()); + assert!(reg.live_count() <= 2); + } + + #[test] + fn size_cap_does_not_evict_active_builds() { + let reg = make_registry(0, 1); + let bid_live_1 = create_one(&reg, "tpl-x"); + let bid_live_2 = create_one(&reg, "tpl-y"); + assert!(reg.get(&bid_live_1).is_some()); + assert!(reg.get(&bid_live_2).is_some()); + } + + #[test] + fn evict_expired_skips_builds_that_left_terminal_state() { + let reg = make_registry(60, 0); + let bid = create_one(&reg, "tpl-a"); + mark_ready(&reg, &bid); + reg.update(&bid, |c| c.stage = BuildStage::Building); + + let removed = reg.evict_expired(Utc::now() + Duration::seconds(120)); + assert_eq!(removed, 0); + assert!(reg.get(&bid).is_some()); + } + + #[test] + fn unbounded_registry_does_not_queue_terminal_entries() { + let reg = BuildRegistry::with_policy(EvictionPolicy::unbounded()); + let bid = create_one(&reg, "tpl-a"); + mark_ready(&reg, &bid); + assert_eq!(reg.terminal_queue_len(), 0); + } +} diff --git a/CubeAPI/src/services/mod.rs b/CubeAPI/src/services/mod.rs index 43cb524a7..3e9699084
100644 --- a/CubeAPI/src/services/mod.rs +++ b/CubeAPI/src/services/mod.rs @@ -22,7 +22,9 @@ pub struct AppServices { impl AppServices { pub fn new(config: &ServerConfig, cubemaster: CubeMasterClient) -> Self { - let builds = builds::BuildRegistry::new(); + let policy = builds::EvictionPolicy::from_config(config); + let builds = builds::BuildRegistry::with_policy(policy); + let _gc = builds.spawn_gc(); Self { cluster: cluster::ClusterService::new(cubemaster.clone()), sandboxes: sandboxes::SandboxService::new( diff --git a/CubeAPI/src/services/templates.rs b/CubeAPI/src/services/templates.rs index e65473145..8ba51dbc8 100644 --- a/CubeAPI/src/services/templates.rs +++ b/CubeAPI/src/services/templates.rs @@ -199,16 +199,10 @@ impl TemplateService { format!("https://{}", public_host) }; - let credential = RegistryCredential { - url: credential_url, - repository: format!("{}/{}", repo_prefix, template_id), - username: "_token".to_string(), - password: self - .config - .registry_token - .clone() - .unwrap_or_else(|| "_anon".to_string()), - }; + let credential = mint_registry_credential( + credential_url, + format!("{}/{}", repo_prefix, template_id), + ); // Image ref CubeMaster will pull from once push is complete. let pull_host = self @@ -446,9 +440,47 @@ impl TemplateService { /// Mark a build as image-pushed (called by the registry handler once the /// manifest PUT for `repo:tag` succeeds). Idempotent. - pub fn mark_image_pushed(&self, build_id: &str) { + /// Advance a build from `WaitingPush` → `Building` after the registry + /// reverse-proxy observed a successful manifest PUT. + /// + /// **Defence in depth**: while build IDs are 128-bit UUIDs and therefore + /// hard to guess, we still cross-check that the manifest's repository + /// path matches the one we minted at create time. 
This stops a leaked + /// (or copy-pasted) build_id from being advanced by a manifest pushed + /// against an unrelated repo, and surfaces config drift in the registry + /// path-prefix as a warning rather than a silent state transition. + /// + /// `repo` is the path between `/v2/` and `/manifests/`, e.g. + /// `e2b/tpl-abc123` for `PUT /v2/e2b/tpl-abc123/manifests/bld-...`. + pub fn mark_image_pushed(&self, build_id: &str, repo: &str) { + let Some(ctx) = self.builds.get(build_id) else { + tracing::debug!( + build_id = %build_id, + repo = %repo, + "manifest PUT received for unknown build_id; ignoring" + ); + return; + }; + + if !manifest_repo_matches(&ctx.image_ref, repo) { + tracing::warn!( + build_id = %build_id, + got_repo = %repo, + expected_image_ref = %ctx.image_ref, + "manifest PUT repo does not match the image_ref \ + minted for this build; refusing to advance build state" + ); + return; + } + self.builds.update(build_id, |ctx| { ctx.append_log_inline("[push] image upload complete"); + // `image_pushed` is the single, authoritative signal that a + // manifest landed under our predicted `image_ref`. It survives + // any subsequent stage mutation (e.g. by status pollers) and + // is what `v3_trigger_build`'s OCI-distribution fallback + // gates on — *not* `stage`, which is an indirect proxy. + ctx.image_pushed = true; if matches!(ctx.stage, BuildStage::WaitingPush) { ctx.stage = BuildStage::Building; ctx.message = "image uploaded, waiting for build dispatch".to_string(); @@ -677,11 +709,65 @@ impl TemplateService { /// `GET /templates/{tid}/files/{hash}` — file-cache probe. /// - /// Until the in-cluster builder lands we don't actually consume uploaded - /// tarballs. We answer `present=true` so the SDK skips uploading; this is - /// safe because `from_image`-based builds (the only flow CubeMaster - /// currently supports) don't need the build context. 
- pub fn v3_get_file_upload(&self, _template_id: &str, _files_hash: &str) -> AppResult { + /// ## Contract (paired with `v3_trigger_build`) + /// + /// The E2B SDK calls this endpoint to ask "do you already have the + /// build-context tarball identified by ``?". A `present=true` + /// answer makes the SDK *skip* uploading the tarball, on the assumption + /// that the server-side builder will read it from cache. CubeAPI does + /// **not** currently run an in-cluster Dockerfile/steps builder, so + /// strictly speaking we don't have any tarball cache at all. + /// + /// We still answer `present=true` here for two reasons: + /// + /// 1. The SDK calls this endpoint *unconditionally* before every + /// build, including pure `fromImage` flows that don't need a + /// tarball at all. Returning `present=false` would force the SDK + /// to PUT a (typically empty) tarball to a URL we don't have + /// anywhere to put. + /// 2. We compensate by enforcing a strict fail-fast in + /// `v3_trigger_build`: if the dispatch body doesn't carry a + /// `fromImage` / `fromTemplate` / pre-pushed registry image, we + /// reject with `501 Not Implemented` and a message that points the + /// caller back to the supported flows. That means a `dockerfile` + /// / `steps`-driven build can never silently succeed against a + /// non-existent tarball — it just fails one round-trip later than + /// it would in upstream e2b-infra. + /// + /// **A `present=true` reply from this endpoint is therefore not a + /// promise that we accepted a tarball.** It is exactly the + /// "no-op, please proceed" hint the SDK needs to advance to the + /// `POST /v2/.../builds/{bid}` step where the real validation lives. + /// + /// Until the in-cluster builder lands (Phase 4) the warning emitted + /// here gives operators an observability hook for "someone is trying + /// to use a context-based build against a CubeAPI that can't honour + /// it" without having to read trigger-time logs. 
+ /// + /// The handler returns `201 Created` (not `200 OK`) on purpose — see the + /// doc comment on `handlers::templates_v3::v3_get_files_hash` for the + /// E2B SDK compatibility rationale. + pub fn v3_get_file_upload( + &self, + template_id: &str, + files_hash: &str, + ) -> AppResult { + // Cheap heuristic: an SDK invoking the empty-context flow (pure + // `fromImage`) typically still hashes *something*, so we can't tell + // dockerfile vs. fromImage apart purely from `files_hash`. Emit a + // single warn so operators can grep for it; trigger-time fail-fast + // is the authoritative gate. + tracing::warn!( + template_id = %template_id, + files_hash = %files_hash, + "files-hash cache probe answered present=true unconditionally; \ + CubeAPI does not run an in-cluster context builder. \ + Dockerfile-/steps-based builds will be rejected at \ + POST /v2/templates/{{tid}}/builds/{{bid}} with 501. \ + Use `fromImage` (or `docker push` via the bundled OCI registry) \ + to drive the build." + ); + Ok(crate::models::V3TemplateFileUpload { present: true, url: None, @@ -696,12 +782,34 @@ impl TemplateService { /// 1. `body.from_image` — the standard E2B flow, e.g. /// `python:3.11-slim`. /// 2. The image already pushed to the bundled registry under - /// `/:` (when the OCI Distribution path - /// was used). - /// 3. `body.from_template` — copy from another known CubeSandbox - /// template (resolved via CubeMaster `get_template`). + /// `/:` — only used when + /// `BuildContext::image_pushed` is `true`, i.e. the registry + /// reverse proxy has observed a successful manifest PUT and + /// `mark_image_pushed` cross-checked the repo. We deliberately + /// do **not** key off `stage` or "image_ref is non-empty" + /// here: `image_ref` is *predicted* at create time and would + /// otherwise let us dispatch CubeMaster against a registry + /// slot that holds nothing, with the failure surfacing later + /// as `manifest unknown` during pull. 
When `image_ref` is + /// non-empty but `image_pushed` is still `false`, we surface + /// that mismatch as **`409 Conflict`** so the SDK can retry + /// after `docker push` completes. + /// 3. `body.from_template` — **rejected with 501** until a + /// downstream resolver for `cube://` exists in + /// CubeMaster/Cubelet. Today CubeMaster feeds `SourceImageRef` + /// straight into `docker pull`, so a synthesised `cube://...` + /// ref would silently break image resolution; we fail fast at + /// the API layer instead. Callers who want this flow should + /// resolve the parent template themselves and pass the resulting + /// OCI reference through `from_image`. /// - /// `start_cmd` becomes container `args`; `ready_cmd` becomes a Probe.Exec. + /// `start_cmd` becomes container `args`; `ready_cmd` is *not* forwarded + /// as an exec probe — CubeMaster/Cubelet only accept TcpSocket / Ping / + /// HttpGet handlers, so we instead best-effort parse an embedded + /// `http(s)://host:port[/path]` URL out of the readyCmd and synthesise + /// a `Probe.HttpGet`. If no URL can be parsed and no `probePort` / + /// `exposedPorts` are supplied, no probe is emitted at all — see + /// `parse_ready_url` and `build_probe` for the precise rules. pub async fn v3_trigger_build( &self, template_id: String, @@ -732,18 +840,154 @@ impl TemplateService { .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) { - // Re-use an already-built CubeSandbox template as the base. We - // synthesise a CubeMaster reference of the form `cube://`, - // letting downstream callers resolve it. Adjust to your local - // convention if needed. - format!("cube://{}", parent) - } else if !ctx.image_ref.is_empty() { + // `fromTemplate` is **not yet wired end-to-end**: CubeMaster's + // template_image.go feeds `SourceImageRef` straight into + // `docker pull` / `docker image inspect`, and there is no + // resolver for a `cube://` scheme anywhere downstream + // (Cubelet, CubeMaster, builder). 
If we synthesised a + // `cube://` ref here, the build would *look* accepted + // at the API layer and only fail several seconds later inside + // the build worker with an opaque `docker pull cube://...: + // invalid reference format` error — exactly the kind of + // "looks supported but isn't" footgun reviewers flagged. + // + // Until the downstream resolver lands (tracked separately), + // surface the gap explicitly. Operators who actually want this + // flow today can resolve `parent` themselves and pass the + // resulting OCI ref via `fromImage`. + self.builds.append_log( + &build_id, + format!( + "[dispatch-v3] rejecting build: fromTemplate={} is not \ + supported by this deployment — no downstream resolver \ + for `cube://` exists in CubeMaster/Cubelet \ + yet. Resolve the parent template to an OCI image \ + reference and pass it via `fromImage` instead.", + parent, + ), + ); + return Err(AppError::NotImplemented(format!( + "build {} of template {} requested `fromTemplate={}`, but \ + CubeAPI cannot honour it: the downstream stack \ + (CubeMaster/Cubelet) does not yet understand the \ + `cube://` source scheme and would attempt to \ + `docker pull` it verbatim. Pass `fromImage` with the \ + already-resolved OCI reference of the parent template, \ + or wait for the cube:// resolver to ship.", + build_id, template_id, parent, + ))); + } else if ctx.image_pushed { + // OCI Distribution path: the caller has actually completed an + // OCI manifest PUT against `image_ref` — `mark_image_pushed` + // verified the repo and flipped `image_pushed` to true. We + // can safely dispatch CubeMaster against the predicted ref + // because we *know* a manifest now lives under it. + // + // Note we deliberately do NOT key off `stage != WaitingPush` + // here. 
`stage` is mutated by status pollers and the v2 + // dispatch path for unrelated reasons; using it as a proxy + // for "client pushed" would re-open the very gap the + // reviewer flagged: dispatching against `ctx.image_ref` even + // though it's just the *predicted* path minted at create + // time, with no manifest behind it. The CubeMaster pull + // would then fail several seconds later with `manifest + // unknown` — exactly the kind of late-stage error this guard + // is meant to prevent. + debug_assert!( + !ctx.image_ref.is_empty(), + "image_pushed=true must imply non-empty image_ref" + ); ctx.image_ref.clone() } else { - return Err(AppError::BadRequest( - "either fromImage, fromTemplate, or a previously-pushed image is required" - .to_string(), - )); + // Distinguish three remaining failure modes so the error + // message tells the operator *exactly* what to do. + let has_steps = body + .steps + .as_ref() + .map(|s| !s.is_empty()) + .unwrap_or(false); + if has_steps { + // (1) `steps[]` build with no fromImage — needs an + // in-cluster context builder we don't run. + self.builds.append_log( + &build_id, + "[dispatch-v3] rejecting build: steps[] supplied but \ + CubeAPI has no in-cluster context builder; supply \ + fromImage or push a pre-built image to the bundled \ + OCI registry instead", + ); + return Err(AppError::NotImplemented(format!( + "dockerfile-/steps-based builds are not supported by \ + this CubeAPI deployment (build {} of template {} \ + supplied {} step(s) without a fromImage). Either set \ + `fromImage` to a base OCI reference, or `docker push` \ + a pre-built image to the bundled registry under \ + `/:` before calling \ + this endpoint.", + build_id, + template_id, + body.steps.as_ref().map(|s| s.len()).unwrap_or(0), + ))); + } + if !ctx.image_ref.is_empty() { + // (2) `image_ref` is non-empty (it was predicted at + // create time) but the manifest never landed — the + // SDK skipped (or hasn't yet completed) `docker + // push`. 
Reviewer-driven guard: do NOT silently + // dispatch CubeMaster against an empty registry + // slot. Surface the mismatch *before* CubeMaster + // starts pulling, with an actionable hint. + self.builds.append_log( + &build_id, + format!( + "[dispatch-v3] rejecting build: predicted image_ref \ + {} exists but no successful manifest PUT has been \ + observed by the registry reverse proxy yet \ + (image_pushed=false). Dispatching now would only \ + move the failure into CubeMaster's pull stage as \ + `manifest unknown`.", + ctx.image_ref, + ), + ); + return Err(AppError::Conflict(format!( + "build {} of template {} has not received the \ + OCI manifest PUT yet: the registry reverse proxy \ + has not observed a successful `PUT \ + /v2//manifests/{}` against `image_ref={}`. \ + Note: the SDK's `GET /templates/{{tid}}/files/{{hash}}` \ + cache probe always returns `present=true` and is \ + *not* a commitment that any image was accepted — \ + see `v3_get_file_upload` for the contract. Either \ + wait for `docker push` to complete and retry, or \ + supply `fromImage` to bypass the bundled registry \ + path.", + build_id, template_id, build_id, ctx.image_ref, + ))); + } + // (3) Neither steps nor fromImage nor any push — the SDK + // probably believed the build context was already cached + // server-side (because `/files/{hash}` answered + // present=true); see `v3_get_file_upload` for the + // contract. We surface a 501 here so the failure mode + // is unambiguous. 
+ self.builds.append_log( + &build_id, + "[dispatch-v3] rejecting build: no fromImage / fromTemplate \ + and no image was pushed to the bundled registry before \ + dispatch; CubeAPI cannot synthesise a source image from a \ + build-context tarball alone", + ); + return Err(AppError::NotImplemented(format!( + "build {} of template {} cannot be dispatched: this \ + CubeAPI deployment does not run an in-cluster build-context \ + builder, so a `fromImage` (or pre-pushed registry image, \ + or `fromTemplate`) is required. The SDK's \ + `GET /templates/{{tid}}/files/{{hash}}` cache probe \ + returns `present=true` unconditionally and is *not* a \ + commitment that any tarball was accepted — see the \ + server-side docs on `v3_get_file_upload` for the contract.", + build_id, template_id, + ))); }; // Patch the cached create_request with the V2-time fields and dispatch. @@ -870,14 +1114,73 @@ impl TemplateService { .take(limit) .cloned() .collect(); - let log_entries: Vec = logs - .iter() - .map(|line| V3BuildLogEntry { - timestamp: chrono::Utc::now(), - message: line.clone(), - level: "info".to_string(), - }) - .collect(); + + // Reviewer-flagged bug: previously `log_entries` stamped each line + // with `Utc::now()` at poll time, so the *same* historical line + // would receive a fresh timestamp on every status poll — making + // `logEntries[i].timestamp` jitter forwards in time even though + // the line itself never changed. + // + // The structured timestamps already exist on + // `BuildContext.logs[i].timestamp` (`BuildLogLine`) and were + // stamped at log-write time by `BuildRegistry::append_log`. 
We + // reach into the registry to pull those write-time timestamps + // back out, taking care to: + // + // - apply the *same* `(logs_offset, limit)` window that + // `get_template_build_status` used to produce + // `internal.logs`, so the i-th entry of `logs` lines up + // with the i-th entry of `log_entries`; + // - clamp the entry count to `logs.len()` so we never emit + // more `log_entries` than `logs` even if a concurrent + // poll appended new lines between the two reads. + // + // The narrow corner case where the build context has been + // evicted between the `get_template_build_status` call above + // and this read (e.g. terminal-state eviction firing during + // an in-flight poll on the same build) falls through to a + // best-effort `created_at`-style fallback — the historical + // bug used `Utc::now()` there too, so behaviour is no worse + // than before, and we still preserve the + // `logs.len() == log_entries.len()` invariant the SDK relies + // on. + let log_entries: Vec = match self.builds.get(build_id) { + Some(ctx) => { + let total = ctx.logs.len(); + let start = (logs_offset.max(0) as usize).min(total); + ctx.logs + .iter() + .skip(start) + .take(logs.len()) + .map(|entry| V3BuildLogEntry { + timestamp: entry.timestamp, + message: entry.line.clone(), + level: "info".to_string(), + }) + .collect() + } + None => { + tracing::debug!( + template_id = %template_id, + build_id = %build_id, + "build context vanished between status poll and \ + log-entry materialisation; falling back to \ + poll-time timestamps for V3 logEntries" + ); + logs.iter() + .map(|line| V3BuildLogEntry { + timestamp: chrono::Utc::now(), + message: line.clone(), + level: "info".to_string(), + }) + .collect() + } + }; + debug_assert_eq!( + logs.len(), + log_entries.len(), + "V3 logs and logEntries must be aligned 1:1" + ); let status = match internal.status.as_str() { "ready" => "ready", @@ -924,16 +1227,11 @@ impl TemplateService { } else { 
self.config.registry_repo_prefix.trim() }; - RegistryCredential { - url, - repository: format!("{}/{}", repo_prefix, template_id), - username: "_token".to_string(), - password: self - .config - .registry_token - .clone() - .unwrap_or_else(|| "_anon".to_string()), - } + // Per-build short-lived credential — see `mint_registry_credential` + // and the matching comment in `create_template_e2b_mode` for the + // rationale (username is the routing key into `username_index`, + // password is verified by the registry reverse-proxy). + mint_registry_credential(url, format!("{}/{}", repo_prefix, template_id)) } } @@ -1282,6 +1580,44 @@ fn base_url(url: &str) -> String { } } +fn mint_registry_credential(url: String, repository: String) -> RegistryCredential { + use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; + let mut buf = [0u8; 32]; + buf[..16].copy_from_slice(Uuid::new_v4().as_bytes()); + buf[16..].copy_from_slice(Uuid::new_v4().as_bytes()); + let token = URL_SAFE_NO_PAD.encode(buf); + RegistryCredential { + url, + repository, + username: format!("bld_{}", &token[..22]), + password: token, + } +} + +fn manifest_repo_matches(image_ref: &str, repo: &str) -> bool { + let Some(expected) = image_ref_repo(image_ref) else { + return false; + }; + expected == repo +} + +/// Extract the `repo` segment from an `image_ref` of the form +/// `<host>[:port]/<repo>[:<tag>]`. Returns `None` when there is no `/` +/// after the host, or when the repo would be empty. +fn image_ref_repo(image_ref: &str) -> Option<String> { + let without_tag = match (image_ref.rfind(':'), image_ref.rfind('/')) { + (Some(colon), Some(slash)) if colon > slash => &image_ref[..colon], + _ => image_ref, + }; + let slash = without_tag.find('/')?; + let repo = &without_tag[slash + 1..]; + if repo.is_empty() { + None + } else { + Some(repo.to_string()) + } +} + // Adapter helper used inside dashmap update closures.
impl crate::services::builds::BuildContext { pub(crate) fn append_log_inline(&mut self, line: impl Into) { @@ -1521,6 +1857,127 @@ mod tests { ); } + #[test] + fn image_ref_repo_extracts_repo_with_host_port_and_tag() { + assert_eq!( + image_ref_repo("127.0.0.1:5000/e2b/tpl-abc:bld-123").as_deref(), + Some("e2b/tpl-abc") + ); + assert_eq!( + image_ref_repo("registry.example.com/team/tpl-xyz").as_deref(), + Some("team/tpl-xyz") + ); + assert_eq!( + image_ref_repo("reg.local:443/x/y/z:latest").as_deref(), + Some("x/y/z") + ); + } + + #[test] + fn image_ref_repo_returns_none_for_malformed_input() { + assert_eq!(image_ref_repo("only-host").as_deref(), None); + assert_eq!(image_ref_repo("host.example.com/").as_deref(), None); + assert_eq!(image_ref_repo("host:5000/:tag").as_deref(), None); + } + + #[test] + fn manifest_repo_matches_accepts_canonical_image_ref() { + assert!(manifest_repo_matches( + "127.0.0.1:5000/e2b/tpl-abc:bld-123", + "e2b/tpl-abc" + )); + } + + #[test] + fn manifest_repo_matches_rejects_mismatched_repo() { + assert!(!manifest_repo_matches( + "127.0.0.1:5000/e2b/tpl-abc:bld-123", + "attacker/tpl-abc" + )); + assert!(!manifest_repo_matches( + "127.0.0.1:5000/e2b/tpl-abc:bld-123", + "e2b/tpl-other" + )); + } + + #[test] + fn manifest_repo_matches_rejects_malformed_image_ref() { + assert!(!manifest_repo_matches("e2b/tpl-abc:bld-123", "e2b/tpl-abc")); + } + + #[test] + fn mark_image_pushed_advances_stage_when_repo_matches() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let cred = RegistryCredential { + url: "http://127.0.0.1:5000".to_string(), + repository: "e2b/tpl-abc".to_string(), + username: "_token".to_string(), + password: "secret".to_string(), + }; + let ctx = svc.builds.create( + "tpl-abc".to_string(), + empty_request(), + cred, + "127.0.0.1:5000/e2b/tpl-abc:bld-deadbeef".to_string(), + ); + svc.builds.update(&ctx.build_id, |c| { + c.image_ref = format!("127.0.0.1:5000/e2b/tpl-abc:{}", c.build_id); + }); + + 
svc.mark_image_pushed(&ctx.build_id, "e2b/tpl-abc"); + + let after = svc.builds.get(&ctx.build_id).expect("ctx"); + assert_eq!(after.stage, BuildStage::Building); + assert!( + after.image_pushed, + "mark_image_pushed must flip image_pushed=true on success — \ + this is the authoritative signal v3_trigger_build's \ + OCI fallback gates on" + ); + } + + #[test] + fn mark_image_pushed_refuses_when_repo_does_not_match() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let cred = RegistryCredential { + url: "http://127.0.0.1:5000".to_string(), + repository: "e2b/tpl-abc".to_string(), + username: "_token".to_string(), + password: "secret".to_string(), + }; + let ctx = svc.builds.create( + "tpl-abc".to_string(), + empty_request(), + cred, + "127.0.0.1:5000/e2b/tpl-abc:bld-deadbeef".to_string(), + ); + svc.builds.update(&ctx.build_id, |c| { + c.image_ref = format!("127.0.0.1:5000/e2b/tpl-abc:{}", c.build_id); + }); + + svc.mark_image_pushed(&ctx.build_id, "attacker/tpl-abc"); + + let after = svc.builds.get(&ctx.build_id).expect("ctx"); + assert_eq!( + after.stage, + BuildStage::WaitingPush, + "stage must not advance when repo mismatches" + ); + assert!( + !after.image_pushed, + "image_pushed must stay false when the repo cross-check \ + fails — otherwise v3_trigger_build would later dispatch \ + against an unverified slot" + ); + } + + #[test] + fn mark_image_pushed_is_noop_for_unknown_build_id() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + svc.mark_image_pushed("bld-does-not-exist", "e2b/tpl-abc"); + assert!(svc.builds.get("bld-does-not-exist").is_none()); + } + #[test] fn remap_cubemaster_status_normalizes_phases_to_e2b_tokens() { assert_eq!(remap_cubemaster_status(""), "pending"); @@ -1572,7 +2029,38 @@ mod tests { let cred = job.registry.expect("registry credential"); assert_eq!(cred.url, "http://127.0.0.1:5000"); assert!(cred.repository.starts_with("e2b/tpl-")); - assert_eq!(cred.username, "_token"); + // Per-build 
short-lived credential: username is `bld_<…>` (i.e. NOT + // the legacy global `_token`), and password is a high-entropy + // random string that the registry reverse-proxy validates against + // the in-memory BuildRegistry on every push request. See + // `mint_registry_credential` for the rationale. + assert!( + cred.username.starts_with("bld_"), + "expected per-build username (bld_<…>), got {:?}", + cred.username + ); + assert!( + cred.password.len() >= 32, + "expected high-entropy random password, got {} chars", + cred.password.len() + ); + assert_ne!( + cred.username, "_token", + "the legacy shared `_token` username must not regress — \ + it would defeat per-build credential validation" + ); + // Issuing a second build must produce a different credential pair + // (i.e. RNG is wired up properly and we're not handing every build + // the same secret). + let mut req2 = empty_request(); + req2.dockerfile = Some("FROM ubuntu".to_string()); + let job2 = svc + .create_template(req2) + .await + .expect("second e2b create should succeed"); + let cred2 = job2.registry.expect("second registry credential"); + assert_ne!(cred.username, cred2.username); + assert_ne!(cred.password, cred2.password); // Internal BuildRegistry now knows about this build and stores the // image_ref CubeMaster will later pull from. 
@@ -1584,6 +2072,329 @@ mod tests { assert!(ctx.image_ref.ends_with(&format!(":{}", job.build_id))); } + #[tokio::test] + async fn v3_trigger_build_rejects_steps_without_from_image_with_501() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + let body = V2TemplateBuildStart { + steps: Some(vec![serde_json::json!({"type": "RUN", "args": ["echo hi"]})]), + ..Default::default() + }; + let err = svc + .v3_trigger_build(job.template_id.clone(), job.build_id.clone(), body) + .await + .expect_err("steps-only build must be rejected, not dispatched"); + + match err { + AppError::NotImplemented(msg) => { + assert!( + msg.contains("dockerfile-/steps-based builds are not supported"), + "unexpected NotImplemented message: {msg}" + ); + assert!(msg.contains(&job.build_id)); + } + other => panic!("expected NotImplemented, got {other:?}"), + } + + let ctx = svc + .builds + .get(&job.build_id) + .expect("build context preserved on failure"); + assert_eq!(ctx.stage, BuildStage::WaitingPush); + } + + #[tokio::test] + async fn v3_trigger_build_does_not_use_unpushed_image_ref() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + let ctx = svc + .builds + .get(&job.build_id) + .expect("build context exists"); + assert!(!ctx.image_pushed, "fresh build must not be marked pushed"); + assert!( + !ctx.image_ref.is_empty(), + "image_ref is predicted at create time and should already \ + be populated — exactly the trap this guard prevents" + ); + + let body = V2TemplateBuildStart::default(); + let err = svc + .v3_trigger_build(job.template_id.clone(), job.build_id.clone(), body) + .await + 
.expect_err("unpushed builds must not be dispatched against the predicted ref"); + + match err { + AppError::Conflict(msg) => { + assert!( + msg.contains("manifest PUT"), + "error must name the missing operation: {msg}" + ); + assert!( + msg.contains(&job.build_id), + "error must include the build_id: {msg}" + ); + assert!( + msg.contains("fromImage"), + "error must point operators at the fromImage \ + workaround: {msg}" + ); + } + other => panic!("expected Conflict, got {other:?}"), + } + + let ctx = svc + .builds + .get(&job.build_id) + .expect("build context preserved on failure"); + assert!(!ctx.image_pushed); + } + + #[tokio::test] + async fn v3_trigger_build_uses_image_ref_after_mark_image_pushed_flips_flag() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + svc.builds.update(&job.build_id, |c| { + c.image_ref = format!("127.0.0.1:5000/e2b/tpl-abc:{}", c.build_id); + }); + svc.mark_image_pushed(&job.build_id, "e2b/tpl-abc"); + let ctx = svc.builds.get(&job.build_id).expect("ctx exists"); + assert!( + ctx.image_pushed, + "mark_image_pushed must flip image_pushed=true" + ); + + let body = V2TemplateBuildStart::default(); + let err = svc + .v3_trigger_build(job.template_id.clone(), job.build_id.clone(), body) + .await + .expect_err( + "cubemaster is unreachable in unit tests, so dispatch \ + will fail at transport — but the source-resolution \ + branch must already have been satisfied", + ); + + assert!( + !matches!(err, AppError::Conflict(_)), + "image_pushed=true must defuse the 409 guard: {err:?}" + ); + assert!( + !matches!(err, AppError::NotImplemented(_)), + "image_pushed=true must defuse the 501 source-resolution \ + guard: {err:?}" + ); + } + + #[tokio::test] + async fn v3_get_build_status_preserves_log_write_timestamps_across_polls() { + let svc = 
make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + let baseline_len = svc + .builds + .get(&job.build_id) + .expect("ctx exists") + .logs + .len(); + + svc.builds.append_log(&job.build_id, "first line"); + svc.builds.append_log(&job.build_id, "second line"); + svc.builds.append_log(&job.build_id, "third line"); + + let expected_ts: Vec<_> = svc + .builds + .get(&job.build_id) + .expect("ctx exists") + .logs + .iter() + .map(|l| l.timestamp) + .collect(); + let expected_total = baseline_len + 3; + assert_eq!( + expected_ts.len(), + expected_total, + "test setup must seed exactly three additional log lines" + ); + + let first = svc + .v3_get_build_status(&job.template_id, &job.build_id, 0, 1000) + .await + .expect("first status poll should succeed"); + assert_eq!(first.log_entries.len(), expected_total); + assert_eq!(first.logs.len(), first.log_entries.len()); + for (i, entry) in first.log_entries.iter().enumerate() { + assert_eq!( + entry.timestamp, expected_ts[i], + "logEntries[{i}].timestamp must match the write-time \ + BuildLogLine.timestamp, not Utc::now() at poll time" + ); + } + assert_eq!(first.log_entries[baseline_len].message, "first line"); + assert_eq!(first.log_entries[baseline_len + 1].message, "second line"); + assert_eq!(first.log_entries[baseline_len + 2].message, "third line"); + + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + + let second = svc + .v3_get_build_status(&job.template_id, &job.build_id, 0, 1000) + .await + .expect("second status poll should succeed"); + assert_eq!(second.log_entries.len(), expected_total); + for (i, entry) in second.log_entries.iter().enumerate() { + assert_eq!( + entry.timestamp, first.log_entries[i].timestamp, + "logEntries[{i}].timestamp must be stable across \ + polls — reviewer-flagged regression: previously \ + 
each poll re-stamped lines with Utc::now() so the \ + same historical line drifted forwards in time" + ); + assert_eq!( + entry.message, first.log_entries[i].message, + "log message must match across polls" + ); + } + + svc.builds.append_log(&job.build_id, "fourth line"); + let third = svc + .v3_get_build_status(&job.template_id, &job.build_id, 0, 1000) + .await + .expect("third status poll should succeed"); + assert_eq!(third.log_entries.len(), expected_total + 1); + for i in 0..expected_total { + assert_eq!( + third.log_entries[i].timestamp, first.log_entries[i].timestamp, + "appending a new line must not perturb existing \ + logEntries[{i}].timestamp" + ); + } + assert_eq!(third.log_entries[expected_total].message, "fourth line"); + assert!( + third.log_entries[expected_total].timestamp + >= first.log_entries[expected_total - 1].timestamp, + "newly appended line must carry a write-time timestamp \ + at or after the previous tail" + ); + } + + #[tokio::test] + async fn v3_get_build_status_log_entries_respect_logs_offset() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + let baseline_len = svc + .builds + .get(&job.build_id) + .expect("ctx") + .logs + .len(); + + svc.builds.append_log(&job.build_id, "alpha"); + svc.builds.append_log(&job.build_id, "beta"); + svc.builds.append_log(&job.build_id, "gamma"); + svc.builds.append_log(&job.build_id, "delta"); + + let expected_ts: Vec<_> = svc + .builds + .get(&job.build_id) + .expect("ctx") + .logs + .iter() + .map(|l| l.timestamp) + .collect(); + + let skip = (baseline_len + 2) as i32; + let resp = svc + .v3_get_build_status(&job.template_id, &job.build_id, skip, 1000) + .await + .expect("paged status poll should succeed"); + + assert_eq!(resp.log_entries.len(), 2); + assert_eq!(resp.logs.len(), resp.log_entries.len()); + 
assert_eq!(resp.log_entries[0].message, "gamma"); + assert_eq!(resp.log_entries[1].message, "delta"); + assert_eq!( + resp.log_entries[0].timestamp, + expected_ts[baseline_len + 2] + ); + assert_eq!( + resp.log_entries[1].timestamp, + expected_ts[baseline_len + 3] + ); + } + + #[tokio::test] + async fn v3_trigger_build_rejects_from_template_with_501_until_resolver_lands() { + let svc = make_service(Some("http://127.0.0.1:5000".to_string())); + let mut req = empty_request(); + req.dockerfile = Some("FROM ubuntu".to_string()); + let job = svc + .create_template(req) + .await + .expect("e2b create should succeed"); + + let body = V2TemplateBuildStart { + from_template: Some("tpl-parent-xyz".to_string()), + ..Default::default() + }; + let err = svc + .v3_trigger_build(job.template_id.clone(), job.build_id.clone(), body) + .await + .expect_err("fromTemplate must be rejected, not silently dispatched as cube://..."); + + match err { + AppError::NotImplemented(msg) => { + assert!( + msg.contains("tpl-parent-xyz"), + "error must echo the rejected parent: {msg}" + ); + assert!( + msg.contains("fromImage"), + "error must point operators at the fromImage workaround: {msg}" + ); + assert!( + msg.contains("cube://"), + "error should name the unimplemented scheme so \ + operators can grep release notes for it: {msg}" + ); + } + other => panic!("expected NotImplemented, got {other:?}"), + } + + let ctx = svc + .builds + .get(&job.build_id) + .expect("build context preserved on failure"); + assert_eq!(ctx.stage, BuildStage::WaitingPush); + } + /// Regression: CubeMaster validates `writable_layer_size` as required and /// the E2B V3 SDK never sends it. Verify the service injects the /// configured default so the request reaches CubeMaster non-empty. 
diff --git a/CubeAPI/src/state.rs b/CubeAPI/src/state.rs
index 201a4dc52..c88ec960c 100644
--- a/CubeAPI/src/state.rs
+++ b/CubeAPI/src/state.rs
@@ -66,13 +66,107 @@ impl AppState {
             None => None,
         };
 
-        Self {
+        let s = Self {
             rate_limiter,
             http_client,
             services,
             logger,
             config: Arc::new(config),
             agenthub_store,
+        };
+        s.log_registry_security_posture();
+        s
+    }
+
+    /// Emit a single startup line summarising whether the bundled OCI
+    /// registry reverse-proxy is on, and — if so — whether the operator
+    /// has obviously misconfigured the deployment such that the per-build
+    /// credentials are exposed in the clear.
+    ///
+    /// We don't refuse to start: this is a one-click developer-experience
+    /// product, and a hard failure on `bind=0.0.0.0` would surprise users
+    /// running on a single VM with a firewall in front of them. But we do
+    /// log loudly so that production operators see the warning during the
+    /// first deploy.
+    ///
+    /// Loopback and wildcard detection is done on the parsed host component
+    /// (IP literals via `std::net::IpAddr`, plus the name `localhost`), not
+    /// on substrings of the raw setting, so `registry.notlocalhost.example`
+    /// is not mistaken for loopback while `localhost:3000` and any
+    /// `127.0.0.0/8` bind are recognised.
+    fn log_registry_security_posture(&self) {
+        // Host component of `host:port`, `[v6]:port`, or a full URL.
+        fn host_of(addr: &str) -> &str {
+            // Drop an optional `scheme://` prefix.
+            let rest = addr.split_once("://").map_or(addr, |(_, r)| r);
+            // The authority ends at the first path, query or fragment delimiter.
+            let authority = rest.split(['/', '?', '#']).next().unwrap_or(rest);
+            // Drop optional `user:password@` userinfo.
+            let host_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
+            if let Some(v6) = host_port.strip_prefix('[') {
+                // Bracketed IPv6 literal, e.g. `[::1]:5000`.
+                return v6.split_once(']').map_or(v6, |(h, _)| h);
+            }
+            match host_port.rsplit_once(':') {
+                // Exactly one colon means `host:port`; more than one is a bare
+                // (unbracketed) IPv6 literal and must be kept whole.
+                Some((h, _)) if !h.contains(':') => h,
+                _ => host_port,
+            }
+        }
+
+        // `localhost` or any loopback IP literal (127.0.0.0/8, ::1).
+        fn is_loopback_host(host: &str) -> bool {
+            host.eq_ignore_ascii_case("localhost")
+                || matches!(host.parse::<std::net::IpAddr>(), Ok(ip) if ip.is_loopback())
+        }
+
+        // Wildcard bind address (`0.0.0.0` or `::`), i.e. every interface.
+        fn is_wildcard_host(host: &str) -> bool {
+            matches!(host.parse::<std::net::IpAddr>(), Ok(ip) if ip.is_unspecified())
+        }
+
+        let upstream = self
+            .config
+            .registry_upstream
+            .as_deref()
+            .map(str::trim)
+            .filter(|s| !s.is_empty());
+        let Some(upstream) = upstream else {
+            tracing::info!("registry reverse-proxy disabled (CUBE_API_REGISTRY_UPSTREAM unset)");
+            return;
+        };
+
+        let upstream_is_loopback = is_loopback_host(host_of(upstream));
+        let bind = self.config.bind.as_str();
+        let bind_host = host_of(bind.trim());
+        let bind_is_loopback = is_loopback_host(bind_host);
+        let bind_is_public = is_wildcard_host(bind_host);
+
+        if bind_is_public && upstream_is_loopback {
+            tracing::warn!(
+                bind = %bind,
+                upstream = %upstream,
+                "registry reverse-proxy is enabled with an unauthenticated loopback \
+                 upstream while CubeAPI binds on a public interface. CubeAPI's own \
+                 per-build credential gate is in force, but build push tokens will \
+                 cross the network in clear text unless this listener is fronted \
+                 by TLS. Either: (a) terminate TLS in a reverse proxy in front of \
+                 CubeAPI, or (b) run distribution/distribution with htpasswd auth \
+                 and rely on the upstream's own TLS+auth. See ServerConfig::registry_upstream."
+            );
+        } else if bind_is_loopback {
+            tracing::info!(
+                bind = %bind,
+                upstream = %upstream,
+                "registry reverse-proxy enabled on a loopback bind; safe for development"
+            );
+        } else {
+            tracing::info!(
+                bind = %bind,
+                upstream = %upstream,
+                "registry reverse-proxy enabled; per-build credential gate is in force"
+            );
         }
     }
 }