diff --git a/Cargo.lock b/Cargo.lock index 2d0bc6ce2..1253ffffc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -892,6 +892,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-queue" version = "0.3.12" @@ -1455,6 +1464,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -1721,7 +1736,16 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", ] [[package]] @@ -2721,6 +2745,52 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "metrics" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +dependencies = [ + "ahash", + "portable-atomic", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3589659543c04c7dc5526ec858591015b87cd8746583b51b48ef4353f99dbcda" +dependencies = [ + "base64 0.22.1", + "http-body-util", + "hyper", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics", + "metrics-util", + "quanta", + "thiserror 2.0.18", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-util" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdfb1365fea27e6dd9dc1dbc19f570198bc86914533ad639dae939635f096be4" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.16.1", + "metrics", + "quanta", + "rand 0.9.2", + "rand_xoshiro", + "sketches-ddsketch", +] + [[package]] name = "miette" version = "7.6.0" @@ -3238,6 +3308,8 @@ dependencies = [ "hyper-rustls", "hyper-util", "ipnet", + "metrics", + "metrics-exporter-prometheus", "miette", "openshell-core", "openshell-driver-kubernetes", @@ -3857,6 +3929,21 @@ dependencies = [ "autotools", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.11.9" @@ -3998,6 +4085,15 @@ version = "0.10.0-rc-3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f66ee92bc15280519ef199a274fe0cafff4245d31bc39aaa31c011ad56cb1f05" +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "ratatui" version = "0.26.3" @@ -4018,6 +4114,15 @@ dependencies = [ "unicode-width 0.1.14", ] +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags", +] + [[package]] name = "rcgen" version = "0.13.2" @@ -4790,6 +4895,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" + [[package]] name = "slab" version = "0.4.12" diff --git a/Cargo.toml b/Cargo.toml index b51aee3c2..cffad2cc1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,10 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tracing-appender = "0.2" +# Metrics +metrics = "0.24" +metrics-exporter-prometheus = { version = "0.18", default-features = false, features = ["http-listener"] } + # Unix/Process nix = { version = "0.29", features = ["signal", "process", "user", "fs", "term"] } diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index e57253739..d9440745d 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -63,6 +63,12 @@ pub struct Config { #[serde(default)] pub health_bind_address: Option, + /// Address to bind the Prometheus metrics endpoint to. + /// + /// When `None`, the dedicated metrics listener is disabled. + #[serde(default)] + pub metrics_bind_address: Option, + /// Log level (trace, debug, info, warn, error). #[serde(default = "default_log_level")] pub log_level: String, @@ -183,6 +189,7 @@ impl Config { Self { bind_address: default_bind_address(), health_bind_address: None, + metrics_bind_address: None, log_level: default_log_level(), tls, database_url: String::new(), @@ -216,6 +223,12 @@ impl Config { self } + #[must_use] + pub const fn with_metrics_bind_address(mut self, addr: SocketAddr) -> Self { + self.metrics_bind_address = Some(addr); + self + } + /// Create a new configuration with the given log level. #[must_use] pub fn with_log_level(mut self, level: impl Into) -> Self { diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index b2524ff0b..6f8c949c7 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -56,6 +56,10 @@ anyhow = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +# Metrics +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } + # Utilities futures = { workspace = true } bytes = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 796068d21..1ed5a5982 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -28,6 +28,11 @@ struct Args { #[arg(long, default_value_t = 0, env = "OPENSHELL_HEALTH_PORT")] health_port: u16, + /// Port for the Prometheus metrics endpoint (/metrics). + /// Set to 0 to disable the dedicated metrics listener. + #[arg(long, default_value_t = 0, env = "OPENSHELL_METRICS_PORT")] + metrics_port: u16, + /// Log level (trace, debug, info, warn, error). #[arg(long, default_value = "info", env = "OPENSHELL_LOG_LEVEL")] log_level: String, @@ -202,6 +207,7 @@ async fn run_from_args(args: Args) -> Result<()> { ); let bind = SocketAddr::from(([0, 0, 0, 0], args.port)); + let tls = if args.disable_tls { None } else { @@ -241,6 +247,23 @@ async fn run_from_args(args: Args) -> Result<()> { config = config.with_health_bind_address(health_bind); } + if args.metrics_port != 0 { + if args.port == args.metrics_port { + return Err(miette::miette!( + "--port and --metrics-port must be different (both set to {})", + args.port + )); + } + if args.health_port != 0 && args.health_port == args.metrics_port { + return Err(miette::miette!( + "--health-port and --metrics-port must be different (both set to {})", + args.health_port + )); + } + let metrics_bind = SocketAddr::from(([0, 0, 0, 0], args.metrics_port)); + config = config.with_metrics_bind_address(metrics_bind); + } + config = config .with_database_url(args.db_url) .with_compute_drivers(args.drivers) diff --git a/crates/openshell-server/src/http.rs b/crates/openshell-server/src/http.rs index 4d45c5352..7650c2339 100644 --- a/crates/openshell-server/src/http.rs +++ b/crates/openshell-server/src/http.rs @@ -3,7 +3,8 @@ //! HTTP health endpoints using Axum. -use axum::{Json, Router, http::StatusCode, response::IntoResponse, routing::get}; +use axum::{Json, Router, extract::State, http::StatusCode, response::IntoResponse, routing::get}; +use metrics_exporter_prometheus::PrometheusHandle; use serde::Serialize; use std::sync::Arc; @@ -45,6 +46,17 @@ pub fn health_router() -> Router { .route("/readyz", get(readyz)) } +/// Create the metrics router for the dedicated metrics port. +pub fn metrics_router(handle: PrometheusHandle) -> Router { + Router::new() + .route("/metrics", get(render_metrics)) + .with_state(handle) +} + +async fn render_metrics(State(handle): State) -> impl IntoResponse { + handle.render() +} + /// Create the HTTP router. pub fn http_router(state: Arc) -> Router { crate::ssh_tunnel::router(state.clone()) diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index a40794037..0442640d4 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -35,6 +35,7 @@ mod tls; pub mod tracing_bus; mod ws_tunnel; +use metrics_exporter_prometheus::PrometheusBuilder; use openshell_core::{ComputeDriverKind, Config, Error, Result}; use std::collections::HashMap; use std::io::ErrorKind; @@ -45,7 +46,7 @@ use tracing::{debug, error, info}; use compute::{ComputeRuntime, VmComputeConfig}; pub use grpc::OpenShellService; -pub use http::{health_router, http_router}; +pub use http::{health_router, http_router, metrics_router}; pub use multiplex::{MultiplexService, MultiplexedService}; use openshell_driver_kubernetes::KubernetesComputeConfig; use persistence::Store; @@ -205,6 +206,31 @@ pub async fn run_server( info!("Health server disabled"); } + // Bind the Prometheus metrics endpoint on a dedicated port when configured. + if let Some(metrics_bind_address) = config.metrics_bind_address { + let prometheus_handle = PrometheusBuilder::new() + .install_recorder() + .map_err(|e| Error::config(format!("failed to install metrics recorder: {e}")))?; + let metrics_listener = TcpListener::bind(metrics_bind_address).await.map_err(|e| { + Error::transport(format!( + "failed to bind metrics port {metrics_bind_address}: {e}", + )) + })?; + info!(address = %metrics_bind_address, "Metrics server listening"); + tokio::spawn(async move { + if let Err(e) = axum::serve( + metrics_listener, + metrics_router(prometheus_handle).into_make_service(), + ) + .await + { + error!("Metrics server error: {e}"); + } + }); + } else { + info!("Metrics server disabled"); + } + // Build TLS acceptor when TLS is configured; otherwise serve plaintext. let tls_acceptor = if let Some(tls) = &config.tls { Some(TlsAcceptor::from_files( diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index 841aa9263..e0c159958 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -15,6 +15,7 @@ use hyper_util::{ rt::{TokioExecutor, TokioIo}, server::conn::auto::Builder, }; +use metrics::{counter, histogram}; use openshell_core::proto::{ inference_server::InferenceServer, open_shell_server::OpenShellServer, }; @@ -22,7 +23,7 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio::io::{AsyncRead, AsyncWrite}; use tower::{ServiceBuilder, ServiceExt}; use tower_http::trace::TraceLayer; @@ -192,6 +193,8 @@ where .is_some_and(|v| v.as_bytes().starts_with(b"application/grpc")); if is_grpc { + let method = grpc_method_from_path(req.uri().path()); + let start = Instant::now(); let mut grpc = self.grpc.clone(); Box::pin(async move { let (parts, body) = req.into_parts(); @@ -206,11 +209,18 @@ where .await .map_err(Into::into)?; + let code = grpc_status_from_response(&res); + let elapsed = start.elapsed().as_secs_f64(); + counter!("openshell_server_grpc_requests_total", "method" => method.clone(), "code" => code.clone()).increment(1); + histogram!("openshell_server_grpc_request_duration_seconds", "method" => method, "code" => code).record(elapsed); + let (parts, body) = res.into_parts(); let body = body.map_err(Into::into).boxed_unsync(); Ok(Response::from_parts(parts, BoxBody(body))) }) } else { + let path = normalize_http_path(req.uri().path()); + let start = Instant::now(); let mut http = self.http.clone(); Box::pin(async move { let (parts, body) = req.into_parts(); @@ -225,6 +235,11 @@ where .await .map_err(Into::into)?; + let status = res.status().as_u16().to_string(); + let elapsed = start.elapsed().as_secs_f64(); + counter!("openshell_server_http_requests_total", "path" => path, "status" => status.clone()).increment(1); + histogram!("openshell_server_http_request_duration_seconds", "path" => path, "status" => status).record(elapsed); + let (parts, body) = res.into_parts(); let body = body.map_err(Into::into).boxed_unsync(); Ok(Response::from_parts(parts, BoxBody(body))) @@ -258,6 +273,26 @@ fn log_response(res: &Response, latency: Duration, _span: &Span) { ); } +fn grpc_method_from_path(path: &str) -> String { + path.rsplit('/').next().unwrap_or(path).to_string() +} + +fn grpc_status_from_response(res: &Response) -> String { + res.headers() + .get("grpc-status") + .and_then(|v| v.to_str().ok()) + .map_or_else(|| "0".to_string(), ToString::to_string) +} + +fn normalize_http_path(path: &str) -> &'static str { + match path { + p if p.starts_with("/connect/ssh") => "/connect/ssh", + p if p.starts_with("/_ws_tunnel") => "/_ws_tunnel", + p if p.starts_with("/auth/") => "/auth", + _ => "unknown", + } +} + /// Boxed body type for uniform handling. pub struct BoxBody( http_body_util::combinators::UnsyncBoxBody>, @@ -282,3 +317,90 @@ impl Body for BoxBody { self.0.size_hint() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn grpc_method_extracts_last_segment() { + assert_eq!( + grpc_method_from_path("/openshell.v1.OpenShell/CreateSandbox"), + "CreateSandbox" + ); + } + + #[test] + fn grpc_method_extracts_inference_service() { + assert_eq!( + grpc_method_from_path("/openshell.inference.v1.Inference/GetInferenceBundle"), + "GetInferenceBundle" + ); + } + + #[test] + fn grpc_method_handles_bare_path() { + assert_eq!(grpc_method_from_path("Health"), "Health"); + } + + #[test] + fn grpc_method_handles_single_slash() { + assert_eq!(grpc_method_from_path("/"), ""); + } + + #[test] + fn grpc_method_handles_empty_string() { + assert_eq!(grpc_method_from_path(""), ""); + } + + #[test] + fn normalize_ssh_path() { + assert_eq!(normalize_http_path("/connect/ssh"), "/connect/ssh"); + } + + #[test] + fn normalize_ssh_path_with_trailing_segments() { + assert_eq!( + normalize_http_path("/connect/ssh?token=abc"), + "/connect/ssh" + ); + } + + #[test] + fn normalize_ws_tunnel() { + assert_eq!(normalize_http_path("/_ws_tunnel"), "/_ws_tunnel"); + } + + #[test] + fn normalize_ws_tunnel_with_trailing() { + assert_eq!(normalize_http_path("/_ws_tunnel/foo"), "/_ws_tunnel"); + } + + #[test] + fn normalize_auth_path() { + assert_eq!(normalize_http_path("/auth/connect"), "/auth"); + } + + #[test] + fn normalize_auth_with_query() { + assert_eq!( + normalize_http_path("/auth/connect?callback_port=12345&code=AB7-X9KM"), + "/auth" + ); + } + + #[test] + fn normalize_unknown_path_collapses_to_unknown() { + assert_eq!(normalize_http_path("/random/scanner/probe"), "unknown"); + } + + #[test] + fn normalize_empty_path() { + assert_eq!(normalize_http_path(""), "unknown"); + } + + #[test] + fn normalize_root_path() { + assert_eq!(normalize_http_path("/"), "unknown"); + } +} diff --git a/deploy/helm/openshell/templates/service.yaml b/deploy/helm/openshell/templates/service.yaml index 0e4aa4b76..ebad42eab 100644 --- a/deploy/helm/openshell/templates/service.yaml +++ b/deploy/helm/openshell/templates/service.yaml @@ -18,5 +18,11 @@ spec: {{- if and (eq .Values.service.type "NodePort") .Values.service.nodePort }} nodePort: {{ .Values.service.nodePort }} {{- end }} + {{- if .Values.service.metricsPort }} + - port: {{ .Values.service.metricsPort }} + targetPort: metrics + protocol: TCP + name: metrics + {{- end }} selector: {{- include "openshell.selectorLabels" . | nindent 4 }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 826251134..37ebcae80 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -51,6 +51,10 @@ spec: - {{ .Values.service.port | quote }} - --health-port - {{ .Values.service.healthPort | quote }} + {{- if .Values.service.metricsPort }} + - --metrics-port + - {{ .Values.service.metricsPort | quote }} + {{- end }} - --log-level - {{ .Values.server.logLevel }} - --db-url @@ -118,6 +122,11 @@ spec: - name: health containerPort: {{ .Values.service.healthPort }} protocol: TCP + {{- if .Values.service.metricsPort }} + - name: metrics + containerPort: {{ .Values.service.metricsPort }} + protocol: TCP + {{- end }} startupProbe: httpGet: path: /healthz diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 2b8cc4e6f..4ac8ab43a 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -38,6 +38,7 @@ service: port: 8080 nodePort: 30051 healthPort: 8081 + metricsPort: 9090 # Pod restart behavior and health probe tuning. podLifecycle: