Skip to content

Commit b030553

Browse files
committed
Milestone: Fix block production deadlock and Setup Wizard deployment
- Fixed deadlock in TournamentManager::record_evidence
- Improved validator block production loop robustness
- Removed unnecessary spawn_blocking in run_battles
- Fixed Setup Wizard to auto-start nodes and return deployment info synchronously
- Fixed metrics server hanging with timeouts and explicit shutdown
1 parent ec0d770 commit b030553

24 files changed

Lines changed: 1683 additions & 253 deletions

bitcell-launcher.sh

Lines changed: 438 additions & 0 deletions
Large diffs are not rendered by default.

crates/bitcell-admin/src/api/deployment.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ pub struct DeploymentConfig {
2424
pub data_dir: Option<String>,
2525
pub log_level: Option<String>,
2626
pub port_start: Option<u16>,
27+
pub enable_dht: Option<bool>,
28+
pub bootstrap_nodes: Option<Vec<String>>,
29+
pub key_seed: Option<String>,
2730
}
2831

2932
#[derive(Debug, Serialize)]
@@ -32,6 +35,7 @@ pub struct DeploymentResponse {
3235
pub status: String,
3336
pub nodes_deployed: usize,
3437
pub message: String,
38+
pub nodes: Vec<crate::api::NodeInfo>,
3539
}
3640

3741
#[derive(Debug, Serialize)]
@@ -58,26 +62,23 @@ pub async fn deploy_node(
5862
// Generate deployment ID
5963
let deployment_id = format!("deploy-{}", chrono::Utc::now().timestamp());
6064

61-
// Trigger deployment (async)
62-
tokio::spawn({
63-
let deployment = state.deployment.clone();
64-
let deployment_id = deployment_id.clone();
65-
let node_type = req.node_type;
66-
let count = req.count;
65+
let deployment = state.deployment.clone();
66+
let node_type = req.node_type;
67+
let count = req.count;
68+
let config = req.config;
6769

68-
async move {
69-
deployment.deploy_nodes(&deployment_id, node_type, count).await;
70-
}
71-
});
70+
// Perform deployment synchronously to return node info
71+
let nodes = deployment.deploy_nodes(&deployment_id, node_type, count, config).await;
7272

7373
Ok(Json(DeploymentResponse {
7474
deployment_id,
75-
status: "deploying".to_string(),
75+
status: "completed".to_string(),
7676
nodes_deployed: req.count,
7777
message: format!(
78-
"Deploying {} {:?} node(s)",
78+
"Deployed {} {:?} node(s)",
7979
req.count, req.node_type
8080
),
81+
nodes,
8182
}))
8283
}
8384

crates/bitcell-admin/src/api/metrics.rs

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ pub struct MetricsResponse {
1616
pub network: NetworkMetrics,
1717
pub ebsl: EbslMetrics,
1818
pub system: SystemMetrics,
19+
pub node_metrics: Option<Vec<crate::metrics_client::NodeMetrics>>,
1920
}
2021

2122
#[derive(Debug, Clone, Serialize)]
@@ -58,21 +59,34 @@ pub struct SystemMetrics {
5859
pub async fn get_metrics(
5960
State(state): State<Arc<AppState>>,
6061
) -> Result<Json<MetricsResponse>, (StatusCode, Json<String>)> {
61-
let nodes = state.setup.get_nodes();
62-
63-
if nodes.is_empty() {
62+
// Get all registered nodes from ProcessManager (which has status info)
63+
let all_nodes = state.process.list_nodes();
64+
tracing::info!("get_metrics: Found {} nodes", all_nodes.len());
65+
66+
if all_nodes.is_empty() {
67+
tracing::warn!("get_metrics: No nodes found, returning 503");
6468
return Err((
6569
StatusCode::SERVICE_UNAVAILABLE,
66-
Json("No nodes configured. Please complete setup wizard and deploy nodes first.".to_string()),
70+
Json("No nodes configured. Please deploy nodes first.".to_string()),
6771
));
6872
}
6973

70-
// Get endpoints for metrics fetching
71-
let endpoints: Vec<(String, String)> = nodes
74+
// Get endpoints for metrics fetching (try all nodes)
75+
let endpoints: Vec<(String, String)> = all_nodes
7276
.iter()
73-
.map(|n| (n.id.clone(), n.metrics_endpoint.clone()))
77+
.map(|n| {
78+
let metrics_port = n.port + 1; // Metrics port is node port + 1
79+
(n.id.clone(), format!("http://127.0.0.1:{}/metrics", metrics_port))
80+
})
7481
.collect();
7582

83+
if endpoints.is_empty() {
84+
return Err((
85+
StatusCode::SERVICE_UNAVAILABLE,
86+
Json("No running nodes. Please start some nodes first.".to_string()),
87+
));
88+
}
89+
7690
// Fetch aggregated metrics
7791
let aggregated = state.metrics_client.aggregate_metrics(&endpoints)
7892
.await
@@ -111,6 +125,7 @@ pub async fn get_metrics(
111125
memory_usage_mb: 0, // TODO: Requires system metrics collection
112126
disk_usage_mb: 0, // TODO: Requires system metrics collection
113127
},
128+
node_metrics: Some(aggregated.node_metrics),
114129
};
115130

116131
Ok(Json(response))

crates/bitcell-admin/src/api/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ pub struct NodeInfo {
2020
pub address: String,
2121
pub port: u16,
2222
pub started_at: Option<chrono::DateTime<chrono::Utc>>,
23+
pub enable_dht: bool,
24+
pub dht_peer_count: usize,
25+
pub bootstrap_nodes: Vec<String>,
26+
pub key_seed: Option<String>,
2327
}
2428

2529
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]

crates/bitcell-admin/src/api/nodes.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,65 @@ pub async fn stop_node(
126126
)),
127127
}
128128
}
129+
130+
/// Delete a node
131+
pub async fn delete_node(
132+
State(state): State<Arc<AppState>>,
133+
Path(id): Path<String>,
134+
) -> Result<Json<serde_json::Value>, (StatusCode, Json<ErrorResponse>)> {
135+
validate_node_id(&id)?;
136+
137+
match state.process.delete_node(&id) {
138+
Ok(_) => {
139+
tracing::info!("Deleted node '{}' successfully", id);
140+
Ok(Json(serde_json::json!({ "message": format!("Node '{}' deleted", id) })))
141+
}
142+
Err(e) => Err((
143+
StatusCode::INTERNAL_SERVER_ERROR,
144+
Json(ErrorResponse {
145+
error: format!("Failed to delete node '{}': {}", id, e),
146+
}),
147+
)),
148+
}
149+
}
150+
151+
#[derive(Debug, Deserialize)]
152+
pub struct LogParams {
153+
#[serde(default = "default_lines")]
154+
pub lines: usize,
155+
}
156+
157+
fn default_lines() -> usize {
158+
100
159+
}
160+
161+
/// Get logs for a specific node
162+
pub async fn get_node_logs(
163+
State(state): State<Arc<AppState>>,
164+
Path(id): Path<String>,
165+
axum::extract::Query(params): axum::extract::Query<LogParams>,
166+
) -> Result<String, (StatusCode, String)> {
167+
validate_node_id(&id).map_err(|e| (e.0, e.1.error.clone()))?;
168+
169+
// Get log file path
170+
let log_path = state.process.get_log_path(&id)
171+
.ok_or_else(|| (StatusCode::NOT_FOUND, format!("Node '{}' not found", id)))?;
172+
173+
// Read log file
174+
match std::fs::read_to_string(&log_path) {
175+
Ok(content) => {
176+
// Get last N lines
177+
let lines: Vec<&str> = content.lines().collect();
178+
let start = lines.len().saturating_sub(params.lines.min(1000));
179+
let result = lines[start..].join("\n");
180+
Ok(result)
181+
}
182+
Err(e) => {
183+
if e.kind() == std::io::ErrorKind::NotFound {
184+
Ok("Log file not found. Node may not have started yet.".to_string())
185+
} else {
186+
Err((StatusCode::INTERNAL_SERVER_ERROR, format!("Failed to read log file: {}", e)))
187+
}
188+
}
189+
}
190+
}

crates/bitcell-admin/src/deployment.rs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,19 @@ impl DeploymentManager {
1616
Self { process, setup }
1717
}
1818

19-
pub async fn deploy_nodes(&self, deployment_id: &str, node_type: NodeType, count: usize) {
19+
pub async fn deploy_nodes(&self, deployment_id: &str, node_type: NodeType, count: usize, config: Option<crate::api::deployment::DeploymentConfig>) -> Vec<crate::api::NodeInfo> {
2020
tracing::info!(
2121
"Starting deployment {}: deploying {} {:?} nodes",
2222
deployment_id,
2323
count,
2424
node_type
2525
);
2626

27+
// Extract DHT config or use defaults
28+
let enable_dht = config.as_ref().and_then(|c| c.enable_dht).unwrap_or(false);
29+
let bootstrap_nodes = config.as_ref().and_then(|c| c.bootstrap_nodes.clone()).unwrap_or_default();
30+
let key_seed = config.as_ref().and_then(|c| c.key_seed.clone());
31+
2732
// Find the highest used port to avoid conflicts
2833
// Using higher ports (19000+) to avoid conflicts with system services
2934
let mut base_port = match node_type {
@@ -60,6 +65,7 @@ impl DeploymentManager {
6065
}
6166

6267
let base_rpc_port = base_port + 1000;
68+
let mut deployed_nodes = Vec::new();
6369

6470
for i in 0..count {
6571
let node_id = format!("{:?}-{}-{}", node_type, deployment_id, i);
@@ -74,19 +80,13 @@ impl DeploymentManager {
7480
rpc_port,
7581
log_level: "info".to_string(),
7682
network: "testnet".to_string(),
83+
enable_dht,
84+
bootstrap_nodes: bootstrap_nodes.clone(),
85+
key_seed: key_seed.clone(),
7786
};
7887

79-
// Register the node (but don't start it automatically)
80-
// Note: The UI calls start_node separately, or we could start it here.
81-
// But wait, the UI "Deploy" button calls deploy_node, which spawns this task.
82-
// The UI then refreshes the list. It doesn't automatically start them?
83-
// The screenshot shows them as "Running".
84-
// Let's check process.rs start_node again.
85-
// Ah, register_node returns NodeStatus::Stopped.
86-
// So the user must have clicked Start.
87-
// But wait, if I register them in SetupManager, they appear in the list.
88-
89-
self.process.register_node(node_id.clone(), config);
88+
// Register the node
89+
let mut node_info = self.process.register_node(node_id.clone(), config);
9090

9191
// Register in SetupManager so metrics can be fetched
9292
let endpoint = NodeEndpoint {
@@ -100,9 +100,16 @@ impl DeploymentManager {
100100
tracing::info!("Registered node '{}' in deployment {}", node_id, deployment_id);
101101

102102
// Auto-start the node for convenience
103-
if let Err(e) = self.process.start_node(&node_id) {
104-
tracing::error!("Failed to auto-start node {}: {}", node_id, e);
103+
match self.process.start_node(&node_id) {
104+
Ok(started_info) => {
105+
node_info = started_info;
106+
},
107+
Err(e) => {
108+
tracing::error!("Failed to auto-start node {}: {}", node_id, e);
109+
}
105110
}
111+
112+
deployed_nodes.push(node_info);
106113
}
107114

108115
// Save setup state
@@ -117,5 +124,7 @@ impl DeploymentManager {
117124
count,
118125
node_type
119126
);
127+
128+
deployed_nodes
120129
}
121130
}

crates/bitcell-admin/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::sync::Arc;
2121

2222
use axum::{
2323
Router,
24-
routing::{get, post},
24+
routing::{get, post, delete},
2525
};
2626
use tower_http::services::ServeDir;
2727
use tower_http::cors::CorsLayer;
@@ -87,8 +87,10 @@ impl AdminConsole {
8787
// API endpoints
8888
.route("/api/nodes", get(api::nodes::list_nodes))
8989
.route("/api/nodes/:id", get(api::nodes::get_node))
90+
.route("/api/nodes/:id", delete(api::nodes::delete_node))
9091
.route("/api/nodes/:id/start", post(api::nodes::start_node))
9192
.route("/api/nodes/:id/stop", post(api::nodes::stop_node))
93+
.route("/api/nodes/:id/logs", get(api::nodes::get_node_logs))
9294

9395
.route("/api/metrics", get(api::metrics::get_metrics))
9496
.route("/api/metrics/chain", get(api::metrics::chain_metrics))

crates/bitcell-admin/src/metrics_client.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub struct NodeMetrics {
1111
pub chain_height: u64,
1212
pub sync_progress: u64,
1313
pub peer_count: usize,
14+
pub dht_peer_count: usize,
1415
pub bytes_sent: u64,
1516
pub bytes_received: u64,
1617
pub pending_txs: usize,
@@ -31,7 +32,7 @@ impl MetricsClient {
3132
pub fn new() -> Self {
3233
Self {
3334
client: reqwest::Client::builder()
34-
.timeout(Duration::from_secs(2))
35+
.timeout(Duration::from_secs(5))
3536
.build()
3637
.expect("Failed to build HTTP client for metrics"),
3738
}
@@ -88,6 +89,7 @@ impl MetricsClient {
8889
chain_height: metrics.get("bitcell_chain_height").copied().unwrap_or(0.0) as u64,
8990
sync_progress: metrics.get("bitcell_sync_progress").copied().unwrap_or(0.0) as u64,
9091
peer_count: metrics.get("bitcell_peer_count").copied().unwrap_or(0.0) as usize,
92+
dht_peer_count: metrics.get("bitcell_dht_peer_count").copied().unwrap_or(0.0) as usize,
9193
bytes_sent: metrics.get("bitcell_bytes_sent_total").copied().unwrap_or(0.0) as u64,
9294
bytes_received: metrics.get("bitcell_bytes_received_total").copied().unwrap_or(0.0) as u64,
9395
pending_txs: metrics.get("bitcell_pending_txs").copied().unwrap_or(0.0) as usize,
@@ -114,7 +116,11 @@ impl MetricsClient {
114116
Ok(metrics) => node_metrics.push(metrics),
115117
Err(e) => {
116118
errors.push(format!("{}: {}", node_id, e));
117-
tracing::warn!("Failed to fetch metrics from {}: {}", node_id, e);
119+
if e.contains("Connection refused") || e.contains("operation timed out") {
120+
tracing::debug!("Failed to fetch metrics from {}: {}", node_id, e);
121+
} else {
122+
tracing::warn!("Failed to fetch metrics from {}: {}", node_id, e);
123+
}
118124
}
119125
}
120126
}

0 commit comments

Comments
 (0)