diff --git a/Cargo.lock b/Cargo.lock index ae4ad997..719aa96e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5592,6 +5592,50 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "opentelemetry" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "236e667b670a5cdf90c258f5a55794ec5ac5027e960c224bff8367a59e1e6426" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.17", + "tracing", +] + +[[package]] +name = "opentelemetry-prometheus" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765a76ba13ec77043903322f85dc5434d7d01a37e75536d0f871ed7b9b5bbf0d" +dependencies = [ + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "prometheus", + "protobuf", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84dfad6042089c7fc1f6118b7040dc2eb4ab520abbf410b79dc481032af39570" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "opentelemetry", + "thiserror 2.0.17", + "tokio", + "tokio-stream", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -6068,6 +6112,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if 1.0.4", + "fnv", + "lazy_static", + "memchr", + "parking_lot 0.12.5", + "protobuf", + "thiserror 1.0.69", +] + +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "qstring" version = "0.7.2" @@ -11313,6 +11378,7 @@ dependencies = [ "agave-geyser-plugin-interface", "agave-reserved-account-keys", 
"anchor-lang-idl", + "axum", "base64 0.22.1", "bincode", "blake3", @@ -11340,6 +11406,10 @@ dependencies = [ "litesvm", "litesvm-token", "log 0.4.29", + "opentelemetry", + "opentelemetry-prometheus", + "opentelemetry_sdk", + "prometheus", "reqwest 0.12.28", "serde", "serde_derive", diff --git a/Cargo.toml b/Cargo.toml index 1137ae74..b6214e5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -179,6 +179,12 @@ txtx-core = { version = "0.4.15" } txtx-gql = { version = "0.3.9" } txtx-supervisor-ui = { version = "0.2.10", default-features = false } +opentelemetry = { version = "0.28", default-features = false, features = ["metrics"] } +opentelemetry_sdk = { version = "0.28", default-features = false, features = ["rt-tokio", "metrics"] } +opentelemetry-prometheus = { version = "0.28", default-features = false } +prometheus = { version = "0.13", default-features = false } +axum = { version = "0.8", default-features = false, features = ["tokio", "http1"] } + # [patch.crates-io] ## Local # txtx-addon-kit = { path = "../txtx/crates/txtx-addon-kit" } diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 67da8f01..6c0d47d8 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -79,6 +79,7 @@ postgres = ["surfpool-gql/postgres", "surfpool-core/postgres"] version_check = [] subgraph = ["surfpool-core/subgraph"] register-tracing = ["surfpool-core/register-tracing"] +prometheus = ["surfpool-core/prometheus"] [target.'cfg(not(target_os = "windows"))'.dependencies] fork = "0.2.0" diff --git a/crates/cli/src/cli/mod.rs b/crates/cli/src/cli/mod.rs index 0e660bc6..7741f722 100644 --- a/crates/cli/src/cli/mod.rs +++ b/crates/cli/src/cli/mod.rs @@ -264,6 +264,20 @@ pub struct StartSimnet { /// When multiple files are provided, later files override earlier ones for duplicate keys. 
#[arg(long = "snapshot")]
     pub snapshot: Vec,
+
+    /// Enable Prometheus metrics endpoint
+    #[cfg(feature = "prometheus")]
+    #[arg(long = "metrics-enabled", env = "SURFPOOL_METRICS_ENABLED")]
+    pub metrics_enabled: bool,
+
+    /// Prometheus metrics endpoint address
+    #[cfg(feature = "prometheus")]
+    #[arg(
+        long = "metrics-addr",
+        default_value = "0.0.0.0:9000",
+        env = "SURFPOOL_METRICS_ADDR"
+    )]
+    pub metrics_addr: String,
     /// Skip signature verification for all transactions (eg. surfpool start --skip-signature-verification)
     #[clap(long = "skip-signature-verification", action=ArgAction::SetTrue, default_value = "false")]
     pub skip_signature_verification: bool,
diff --git a/crates/cli/src/cli/simnet/mod.rs b/crates/cli/src/cli/simnet/mod.rs
index e0c69848..f085f1a3 100644
--- a/crates/cli/src/cli/simnet/mod.rs
+++ b/crates/cli/src/cli/simnet/mod.rs
@@ -63,6 +63,27 @@ pub async fn handle_start_local_surfnet_command(
     let (mut surfnet_svm, simnet_events_rx, geyser_events_rx) =
         SurfnetSvm::new_with_db(cmd.db.as_deref(), &cmd.surfnet_id)
             .map_err(|e| format!("Failed to initialize Surfnet SVM: {}", e))?;
+    #[cfg(feature = "prometheus")]
+    {
+        if cmd.metrics_enabled {
+            let handle = tokio::runtime::Handle::current();
+            match surfpool_core::telemetry::init_from_config(&cmd.metrics_addr, &handle) {
+                Err(e) => {
+                    let _ = surfnet_svm
+                        .simnet_events_tx
+                        .send(SimnetEvent::warn(format!("Metrics init failed: {}", e)));
+                }
+                Ok(_) => {
+                    // Report the address the endpoint was configured with rather than a
+                    // generic host constant, so the URL carries the configured host and port.
+                    let _ = surfnet_svm.simnet_events_tx.send(SimnetEvent::info(format!(
+                        "Metrics available at http://{}/metrics",
+                        cmd.metrics_addr
+                    )));
+                }
+            }
+        }
+    }
 
     // Apply feature configuration from CLI flags
     let feature_config = cmd.feature_config();
@@ -321,6 +342,9 @@ async fn start_service(
         let _ = explorer_handle.stop(true).await;
     }
 
+    #[cfg(feature = "prometheus")]
+    surfpool_core::telemetry::shutdown();
+
     Ok(())
 }
 
diff --git a/crates/core/Cargo.toml
b/crates/core/Cargo.toml
index b9529a4c..956e3216 100644
--- a/crates/core/Cargo.toml
+++ b/crates/core/Cargo.toml
@@ -100,7 +100,13 @@ anchor-lang-idl = { workspace = true }
 txtx-addon-kit = { workspace = true }
 txtx-addon-network-svm-types = { workspace = true }
 txtx-addon-network-svm = { workspace = true }
+# Prometheus metrics - optional, enabled via the `prometheus` feature; versions come from the workspace
+opentelemetry = { workspace = true, optional = true }
+opentelemetry_sdk = { workspace = true, optional = true }
+opentelemetry-prometheus = { workspace = true, optional = true }
+prometheus = { workspace = true, optional = true }
+axum = { workspace = true, optional = true }
 
 [dev-dependencies]
 test-case = { workspace = true }
 
@@ -116,3 +122,4 @@ ignore_tests_ci = []
 geyser_plugin = [] # Disabled: solana-geyser-plugin-manager conflicts with litesvm 0.9.1
 subgraph = ["surfpool-subgraph"]
 register-tracing = ["litesvm/register-tracing"]
+prometheus = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-prometheus", "dep:prometheus", "dep:axum"]
diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs
index 9f1d3ebd..147cb00c 100644
--- a/crates/core/src/lib.rs
+++ b/crates/core/src/lib.rs
@@ -17,6 +17,7 @@ pub mod runloops;
 pub mod scenarios;
 pub mod storage;
 pub mod surfnet;
+pub mod telemetry;
 pub mod types;
 
 use crossbeam_channel::{Receiver, Sender};
diff --git a/crates/core/src/rpc/admin.rs b/crates/core/src/rpc/admin.rs
index 16661edf..e1ce2b4c 100644
--- a/crates/core/src/rpc/admin.rs
+++ b/crates/core/src/rpc/admin.rs
@@ -3,7 +3,7 @@ use std::time::Duration;
 use jsonrpc_core::{BoxFuture, Result};
 use jsonrpc_derive::rpc;
 use solana_client::rpc_custom_error::RpcCustomError;
-use surfpool_types::{SimnetCommand, SimnetEvent};
+use
surfpool_types::{SimnetCommand, SimnetEvent, SurfpoolStatus};
 use txtx_addon_network_svm_types::subgraph::PluginConfig;
 use uuid::Uuid;
 
@@ -192,6 +192,15 @@ pub trait AdminRpc {
     /// - This method is useful for monitoring system uptime and verifying system health.
     #[rpc(meta, name = "startTime")]
     fn start_time(&self, meta: Self::Metadata) -> Result<String>;
+
+    /// Returns a status snapshot of the running Surfpool instance.
+    ///
+    /// The snapshot carries the slot, epoch and slot index, the transaction counts, the
+    /// uptime in milliseconds and the WebSocket subscription counts.
+    ///
+    /// Fails with a `NodeUnhealthy` error when the request carries no metadata.
+    #[rpc(meta, name = "surfpoolStatus")]
+    fn surfpool_status(&self, meta: Self::Metadata) -> Result<SurfpoolStatus>;
 }
 
 pub struct SurfpoolAdminRpc;
@@ -363,4 +372,17 @@ impl AdminRpc for SurfpoolAdminRpc {
         let datetime_utc: chrono::DateTime<chrono::Utc> = system_time.into();
         Ok(datetime_utc.to_rfc3339())
     }
+
+    fn surfpool_status(&self, meta: Self::Metadata) -> Result<SurfpoolStatus> {
+        // Ensure we have RunloopContext metadata
+        let Some(ctx) = meta else {
+            return Err(RpcCustomError::NodeUnhealthy {
+                num_slots_behind: None,
+            }
+            .into());
+        };
+
+        let status = ctx.svm_locker.with_svm_reader(|svm| svm.snapshot_status());
+        Ok(status)
+    }
 }
diff --git a/crates/core/src/runloops/mod.rs b/crates/core/src/runloops/mod.rs
index 567ead68..16b0f813 100644
--- a/crates/core/src/runloops/mod.rs
+++ b/crates/core/src/runloops/mod.rs
@@ -449,6 +449,12 @@ pub async fn start_block_production_runloop(
                     svm_locker
                         .confirm_current_block(&remote_client_with_commitment)
                         .await?;
+
+                    #[cfg(feature = "prometheus")]
+                    {
+                        let snapshot = svm_locker.with_svm_reader(|svm| svm.snapshot_status());
+                        crate::telemetry::try_record_snapshot(&snapshot);
+                    }
                 }
             }
         }
diff --git a/crates/core/src/surfnet/locker.rs b/crates/core/src/surfnet/locker.rs
index a223b24a..909407da 100644
--- a/crates/core/src/surfnet/locker.rs
+++ b/crates/core/src/surfnet/locker.rs
@@ -1082,9 +1082,23 @@ impl SurfnetSvmLocker {
             }
         };
 
-        self.with_svm_writer(|svm_writer| {
-            svm_writer.write_executed_profile_result(signature, profile_result)
-        })?;
+        #[cfg(feature = "prometheus")]
+        let write_result = {
+            let (write_result, snapshot) = self.with_svm_writer(|svm| {
+                let res = svm.write_executed_profile_result(signature,
profile_result); + let snap = svm.snapshot_status(); + (res, snap) + }); + crate::telemetry::try_record_snapshot(&snapshot); + write_result + }; + + #[cfg(not(feature = "prometheus"))] + let write_result = self + .with_svm_writer(|svm| svm.write_executed_profile_result(signature, profile_result)); + + write_result?; + Ok(()) } diff --git a/crates/core/src/surfnet/svm.rs b/crates/core/src/surfnet/svm.rs index 25aaf54e..49ded7ad 100644 --- a/crates/core/src/surfnet/svm.rs +++ b/crates/core/src/surfnet/svm.rs @@ -52,8 +52,9 @@ use surfpool_types::{ AccountChange, AccountProfileState, AccountSnapshot, DEFAULT_PROFILING_MAP_CAPACITY, DEFAULT_SLOT_TIME_MS, ExportSnapshotConfig, ExportSnapshotScope, FifoMap, Idl, OverrideInstance, ProfileResult, RpcProfileDepth, RpcProfileResultConfig, - RunbookExecutionStatusReport, SimnetEvent, SvmFeatureConfig, TransactionConfirmationStatus, - TransactionStatusEvent, UiAccountChange, UiAccountProfileState, UiProfileResult, VersionedIdl, + RunbookExecutionStatusReport, SimnetEvent, SurfpoolStatus, SvmFeatureConfig, + TransactionConfirmationStatus, TransactionStatusEvent, UiAccountChange, UiAccountProfileState, + UiProfileResult, VersionedIdl, WsSubscriptions, types::{ ComputeUnitsEstimationResult, KeyedProfileResult, UiKeyedProfileResult, UuidOrSignature, }, @@ -330,6 +331,27 @@ impl SurfnetSvm { self.account_associated_data.shutdown(); } + /// Returns a snapshot of the current SVM status for RPC and metrics. 
+    pub fn snapshot_status(&self) -> SurfpoolStatus {
+        SurfpoolStatus {
+            slot: self.latest_epoch_info.absolute_slot,
+            epoch: self.latest_epoch_info.epoch,
+            slot_index: self.latest_epoch_info.slot_index,
+            transactions_count: self.transactions.count().unwrap_or(0),
+            transactions_processed: self.transactions_processed,
+            uptime_ms: std::time::SystemTime::now()
+                .duration_since(self.start_time)
+                .map(|d| d.as_millis() as u64)
+                .unwrap_or(0),
+            ws_subscriptions: WsSubscriptions {
+                signatures: self.signature_subscriptions.len(),
+                accounts: self.account_subscriptions.len(),
+                slots: self.slot_subscriptions.len(),
+                logs: self.logs_subscriptions.len(),
+            },
+        }
+    }
+
     /// Creates a clone of the SVM with overlay storage wrappers for all database-backed fields.
     /// This allows profiling transactions without affecting the underlying database.
     /// All storage writes are buffered in memory and discarded when the clone is dropped.
diff --git a/crates/core/src/telemetry.rs b/crates/core/src/telemetry.rs
new file mode 100644
index 00000000..6ca91fad
--- /dev/null
+++ b/crates/core/src/telemetry.rs
@@ -0,0 +1,198 @@
+//! Prometheus metrics for Surfpool
+//!
+//! Feature `prometheus` enables a `/metrics` HTTP endpoint
+
+#[cfg(feature = "prometheus")]
+mod instrumented {
+    use std::sync::{Once, OnceLock};
+
+    use opentelemetry::{
+        KeyValue,
+        metrics::{Gauge, Meter, MeterProvider},
+    };
+    use opentelemetry_sdk::{Resource, metrics::SdkMeterProvider};
+    use prometheus::Encoder;
+    use surfpool_types::SurfpoolStatus;
+
+    /// Guards the one-time setup performed by `init_prometheus`.
+    static INIT: Once = Once::new();
+    /// Gauges updated by `try_record_snapshot`; unset until setup succeeds.
+    static METRICS: OnceLock<SurfpoolMetrics> = OnceLock::new();
+    /// Meter provider kept so that `shutdown` can shut it down.
+    static METER_PROVIDER: OnceLock<SdkMeterProvider> = OnceLock::new();
+    /// Tells the HTTP server task to stop when `shutdown` is called.
+    static SHUTDOWN_TX: OnceLock<tokio::sync::watch::Sender<bool>> = OnceLock::new();
+
+    /// Gauges exported through the Prometheus registry.
+    pub struct SurfpoolMetrics {
+        uptime_seconds: Gauge<u64>,
+        transactions_processed_count: Gauge<u64>,
+    }
+
+    impl SurfpoolMetrics {
+        /// Creates the gauges on `meter`.
+        fn new(meter: Meter) -> Self {
+            Self {
+                uptime_seconds: meter
+                    .u64_gauge("surfpool_uptime_seconds")
+                    .with_description("Time since start in seconds")
+                    .build(),
+                transactions_processed_count: meter
+                    .u64_gauge("surfpool_transactions_processed_count")
+                    .with_description("Total processed transactions")
+                    .build(),
+            }
+        }
+
+        /// Writes the values from `status` into the gauges.
+        fn record_snapshot(&self, status: &SurfpoolStatus) {
+            // The status carries milliseconds; the gauge is exported in whole seconds.
+            self.uptime_seconds.record(status.uptime_ms / 1000, &[]);
+            self.transactions_processed_count
+                .record(status.transactions_processed, &[]);
+        }
+    }
+
+    /// Record a snapshot if metrics are initialized. Safe to call unconditionally.
+    pub fn try_record_snapshot(status: &SurfpoolStatus) {
+        if let Some(m) = METRICS.get() {
+            m.record_snapshot(status);
+        }
+    }
+
+    /// Sets up the Prometheus exporter and serves it at `/metrics` on `bind_addr`.
+    ///
+    /// The setup runs at most once per process; later calls skip it and return `Ok(())`.
+    /// The HTTP server is spawned on `handle` and stops once `shutdown` is called.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `bind_addr` cannot be bound, if the exporter cannot be built,
+    /// or if the global metrics state was already set.
+    pub fn init_prometheus(
+        service_name: &str,
+        bind_addr: &str,
+        handle: &tokio::runtime::Handle,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        let service_name_owned = service_name.to_string();
+        let mut result = Ok(());
+
+        INIT.call_once(|| {
+            // Bind up front so that an unusable address is reported to the caller
+            // instead of being lost inside the spawned server task.
+            let std_listener = match std::net::TcpListener::bind(bind_addr)
+                .and_then(|l| l.set_nonblocking(true).map(|()| l))
+            {
+                Ok(l) => l,
+                Err(e) => {
+                    result = Err(format!("Failed to bind metrics endpoint: {}", e).into());
+                    return;
+                }
+            };
+
+            let registry = prometheus::Registry::new();
+            let exporter = match opentelemetry_prometheus::exporter()
+                .with_registry(registry.clone())
+                .build()
+            {
+                Ok(exp) => exp,
+                Err(e) => {
+                    result = Err(Box::new(e) as Box<dyn std::error::Error>);
+                    return;
+                }
+            };
+
+            let resource = Resource::builder()
+                .with_attributes(vec![KeyValue::new("service.name", service_name_owned)])
+                .build();
+
+            let provider = SdkMeterProvider::builder()
+                .with_resource(resource)
+                .with_reader(exporter)
+                .build();
+
+            let meter = provider.meter("surfpool-core");
+            let metrics = SurfpoolMetrics::new(meter);
+
+            if let Err(e) = METER_PROVIDER.set(provider) {
+                result = Err(format!("Meter provider already initialized: {:?}", e).into());
+                return;
+            }
+            if METRICS.set(metrics).is_err() {
+                result = Err("Metrics already initialized".into());
+                return;
+            }
+
+            let (shutdown_tx, mut shutdown_rx) = tokio::sync::watch::channel(false);
+            let _ = SHUTDOWN_TX.set(shutdown_tx);
+
+            handle.spawn(async move {
+                // Registering the socket with tokio has to happen inside the runtime.
+                let listener = match tokio::net::TcpListener::from_std(std_listener) {
+                    Ok(l) => l,
+                    Err(e) => {
+                        log::error!("Failed to start metrics listener: {}", e);
+                        return;
+                    }
+                };
+                let app = axum::Router::new().route(
+                    "/metrics",
+                    axum::routing::get(move || {
+                        let reg = registry.clone();
+                        async move {
+                            let encoder = prometheus::TextEncoder::new();
+                            let metric_families = reg.gather();
+                            let mut buffer = vec![];
+                            if let Err(e) = encoder.encode(&metric_families, &mut buffer) {
+                                return (
+                                    axum::http::StatusCode::INTERNAL_SERVER_ERROR,
+                                    format!("Failed to encode: {}", e),
+                                );
+                            }
+                            let body = String::from_utf8(buffer)
+                                .unwrap_or_else(|_| "Invalid UTF8".to_string());
+                            (axum::http::StatusCode::OK, body)
+                        }
+                    }),
+                );
+                if let Err(e) = axum::serve(listener, app)
+                    .with_graceful_shutdown(async move {
+                        let _ = shutdown_rx.changed().await;
+                    })
+                    .await
+                {
+                    log::error!("Metrics server error: {}", e);
+                }
+            });
+        });
+
+        result
+    }
+
+    /// Signals the HTTP server task to stop and shuts the meter provider down.
+    ///
+    /// Does nothing for whichever of the two was never initialized.
+    pub fn shutdown() {
+        if let Some(tx) = SHUTDOWN_TX.get() {
+            let _ = tx.send(true);
+        }
+        if let Some(provider) = METER_PROVIDER.get() {
+            let _ = provider.shutdown();
+        }
+    }
+}
+
+#[cfg(feature = "prometheus")]
+pub use instrumented::*;
+
+/// Starts the metrics endpoint on `bind_addr` under the service name `surfpool`.
+///
+/// # Errors
+///
+/// Returns the underlying initialization error rendered as a string.
+#[cfg(feature = "prometheus")]
+pub fn init_from_config(bind_addr: &str, handle: &tokio::runtime::Handle) -> Result<(), String> {
+    log::info!("Starting Prometheus metrics on {}", bind_addr);
+    init_prometheus("surfpool", bind_addr, handle)
+        .map_err(|e| format!("Prometheus init failed: {}", e))
+}
diff --git a/crates/types/src/types.rs b/crates/types/src/types.rs
index 7cdc3f8a..b79cd09b 100644
--- a/crates/types/src/types.rs
+++ b/crates/types/src/types.rs
@@ -593,7 +593,7 @@ pub struct SanitizedConfig {
     pub workspace: Option,
 }
 
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 pub struct SurfpoolConfig {
     pub simnets: Vec,
     pub rpc: RpcConfig,
@@ -602,7 +602,7 @@ pub struct SurfpoolConfig {
     pub plugin_config_path: Vec,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct SimnetConfig {
     pub offline_mode: bool,
     pub remote_rpc_url: Option,
@@ -662,14 +662,14 @@ impl SimnetConfig {
     }
 }
 
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 pub struct SubgraphConfig {}
 
 pub const DEFAULT_GOSSIP_PORT: u16 = 8001;
 pub const DEFAULT_TPU_PORT: u16 = 8003;
 pub const DEFAULT_TPU_QUIC_PORT: u16 = 8004;
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct RpcConfig {
     pub bind_host: String,
     pub bind_port: u16,
@@ -701,7
+701,7 @@ impl Default for RpcConfig {
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct StudioConfig {
     pub bind_host: String,
     pub bind_port: u16,
@@ -1256,6 +1256,39 @@ impl RunbookExecutionStatusReport {
         self.errors = error;
     }
 }
+
+/// WebSocket subscription counts
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
+pub struct WsSubscriptions {
+    /// Number of signature subscriptions.
+    pub signatures: usize,
+    /// Number of account subscriptions.
+    pub accounts: usize,
+    /// Number of slot subscriptions.
+    pub slots: usize,
+    /// Number of logs subscriptions.
+    pub logs: usize,
+}
+
+/// Surfpool node status information
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SurfpoolStatus {
+    /// Absolute slot taken from the latest epoch info.
+    pub slot: u64,
+    /// Epoch taken from the latest epoch info.
+    pub epoch: u64,
+    /// Slot index taken from the latest epoch info.
+    pub slot_index: u64,
+    /// Number of stored transactions; 0 when the count cannot be read.
+    pub transactions_count: u64,
+    /// Value of the SVM's processed-transactions counter.
+    pub transactions_processed: u64,
+    /// Milliseconds elapsed since the SVM start time.
+    pub uptime_ms: u64,
+    /// WebSocket subscription counts.
+    pub ws_subscriptions: WsSubscriptions,
+}
+
 #[cfg(test)]
 mod tests {