Skip to main content

consortium_nix/
health.rs

1//! Builder health checking — probe builders for SSH connectivity and Nix store access.
2
3use std::process::Command;
4use std::time::Instant;
5
6use crate::config::{Builder, FleetConfig};
7use crate::error::{NixError, Result};
8
9/// Health status for a single builder.
10#[derive(Debug, Clone)]
11pub struct HealthStatus {
12    /// The builder configuration.
13    pub builder: Builder,
14    /// Whether the builder is healthy (SSH + nix store reachable).
15    pub healthy: bool,
16    /// Round-trip latency in milliseconds (if healthy).
17    pub latency_ms: Option<u64>,
18    /// Error message (if unhealthy).
19    pub error: Option<String>,
20}
21
22/// Probe all builders in the fleet and return their health status.
23pub fn check_builders(config: &FleetConfig) -> Vec<HealthStatus> {
24    // TODO: parallelize with consortium's SshWorker + fanout
25    config
26        .builders
27        .values()
28        .map(|builder| check_builder(builder))
29        .collect()
30}
31
32/// Probe a single builder for health.
33pub fn check_builder(builder: &Builder) -> HealthStatus {
34    let start = Instant::now();
35
36    // First check SSH connectivity
37    let ssh_result = Command::new("ssh")
38        .args([
39            "-oStrictHostKeyChecking=no",
40            "-oPasswordAuthentication=no",
41            "-oConnectTimeout=5",
42            "-oBatchMode=yes",
43            "-l",
44            &builder.user,
45            &builder.host,
46            "true",
47        ])
48        .output();
49
50    match ssh_result {
51        Err(e) => HealthStatus {
52            builder: builder.clone(),
53            healthy: false,
54            latency_ms: None,
55            error: Some(format!("SSH exec failed: {}", e)),
56        },
57        Ok(output) if !output.status.success() => {
58            let stderr = String::from_utf8_lossy(&output.stderr);
59            HealthStatus {
60                builder: builder.clone(),
61                healthy: false,
62                latency_ms: None,
63                error: Some(format!("SSH connection failed: {}", stderr.trim())),
64            }
65        }
66        Ok(_) => {
67            let ssh_latency = start.elapsed().as_millis() as u64;
68
69            // Now check nix store accessibility
70            let store_uri = format!("{}://{}@{}", builder.protocol, builder.user, builder.host);
71            let store_result = Command::new("nix")
72                .args(["store", "ping", "--store", &store_uri])
73                .output();
74
75            match store_result {
76                Err(e) => HealthStatus {
77                    builder: builder.clone(),
78                    healthy: false,
79                    latency_ms: Some(ssh_latency),
80                    error: Some(format!("nix store ping failed: {}", e)),
81                },
82                Ok(output) if !output.status.success() => {
83                    let stderr = String::from_utf8_lossy(&output.stderr);
84                    HealthStatus {
85                        builder: builder.clone(),
86                        healthy: false,
87                        latency_ms: Some(ssh_latency),
88                        error: Some(format!("nix store unreachable: {}", stderr.trim())),
89                    }
90                }
91                Ok(_) => HealthStatus {
92                    builder: builder.clone(),
93                    healthy: true,
94                    latency_ms: Some(start.elapsed().as_millis() as u64),
95                    error: None,
96                },
97            }
98        }
99    }
100}
101
102/// Get only healthy builders, sorted by speed factor (highest first).
103pub fn healthy_builders(statuses: &[HealthStatus]) -> Vec<&HealthStatus> {
104    let mut healthy: Vec<_> = statuses.iter().filter(|s| s.healthy).collect();
105    healthy.sort_by(|a, b| b.builder.speed_factor.cmp(&a.builder.speed_factor));
106    healthy
107}
108
109/// Pre-warm SSH connections to builders by establishing ControlMaster sockets.
110pub fn warm_connections(builders: &[&HealthStatus], control_path: &str) -> Result<()> {
111    for status in builders {
112        let b = &status.builder;
113        let output = Command::new("ssh")
114            .args([
115                "-oStrictHostKeyChecking=no",
116                "-oPasswordAuthentication=no",
117                "-oControlMaster=auto",
118                &format!("-oControlPath={}", control_path),
119                "-oControlPersist=10m",
120                "-oBatchMode=yes",
121                "-fN", // background, no command
122                "-l",
123                &b.user,
124                &b.host,
125            ])
126            .output()
127            .map_err(|e| NixError::SshFailed {
128                host: b.host.clone(),
129                message: format!("failed to warm connection: {}", e),
130            })?;
131
132        if !output.status.success() {
133            // Non-fatal: just log and continue
134            eprintln!(
135                "warning: failed to warm SSH connection to {}: {}",
136                b.host,
137                String::from_utf8_lossy(&output.stderr).trim()
138            );
139        }
140    }
141
142    Ok(())
143}