fix: prevent hang on gateway restart, fix delete_agent binding loss, add health check grace period

- Add 30s timeout to gateway restart command to prevent UI from hanging indefinitely
- Change delete_agent to reset bindings to "main" instead of removing them, preventing loss of channel binding entries
- Add health status grace period: show "Checking..." badge while gateway is restarting, retry every 2s up to 5 times before showing "Unhealthy"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
zhixian
2026-02-18 00:27:52 +09:00
parent 82bf38b2b2
commit 45e8e4d99b
2 changed files with 110 additions and 34 deletions

View File

@@ -1013,11 +1013,16 @@ pub fn delete_agent(agent_id: String) -> Result<bool, String> {
return Err(format!("Agent '{}' not found", agent_id));
}
// Also remove any bindings that reference this agent
// Reset any bindings that reference this agent back to "main" (default)
// so the channel doesn't lose its binding entry entirely.
if let Some(bindings) = cfg.pointer_mut("/bindings").and_then(Value::as_array_mut) {
bindings.retain(|b| {
b.get("agentId").and_then(Value::as_str) != Some(&agent_id)
});
for b in bindings.iter_mut() {
if b.get("agentId").and_then(Value::as_str) == Some(&agent_id) {
if let Some(obj) = b.as_object_mut() {
obj.insert("agentId".into(), Value::String("main".into()));
}
}
}
}
write_config_with_snapshot(&paths, &current, &cfg, "delete-agent")?;
@@ -1402,26 +1407,77 @@ fn run_external_command_raw(parts: &[&str]) -> Result<OpenclawCommandOutput, Str
}
fn run_openclaw_raw(args: &[&str]) -> Result<OpenclawCommandOutput, String> {
let mut command = Command::new("openclaw");
command.args(args);
let output = command
.output()
run_openclaw_raw_timeout(args, None)
}
fn run_openclaw_raw_timeout(args: &[&str], timeout_secs: Option<u64>) -> Result<OpenclawCommandOutput, String> {
let mut child = Command::new("openclaw")
.args(args)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.map_err(|error| format!("failed to run openclaw: {error}"))?;
let exit_code = output.status.code().unwrap_or(-1);
let result = OpenclawCommandOutput {
stdout: String::from_utf8_lossy(&output.stdout).trim_end().to_string(),
stderr: String::from_utf8_lossy(&output.stderr).trim_end().to_string(),
exit_code,
};
if exit_code != 0 {
let details = if !result.stderr.is_empty() {
result.stderr.clone()
} else {
result.stdout.clone()
if let Some(secs) = timeout_secs {
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(secs);
loop {
match child.try_wait().map_err(|e| e.to_string())? {
Some(status) => {
let mut stdout_buf = Vec::new();
let mut stderr_buf = Vec::new();
if let Some(mut out) = child.stdout.take() {
std::io::Read::read_to_end(&mut out, &mut stdout_buf).ok();
}
if let Some(mut err) = child.stderr.take() {
std::io::Read::read_to_end(&mut err, &mut stderr_buf).ok();
}
let exit_code = status.code().unwrap_or(-1);
let result = OpenclawCommandOutput {
stdout: String::from_utf8_lossy(&stdout_buf).trim_end().to_string(),
stderr: String::from_utf8_lossy(&stderr_buf).trim_end().to_string(),
exit_code,
};
if exit_code != 0 {
let details = if !result.stderr.is_empty() {
result.stderr.clone()
} else {
result.stdout.clone()
};
return Err(format!("openclaw command failed ({exit_code}): {details}"));
}
return Ok(result);
}
None => {
if std::time::Instant::now() >= deadline {
let _ = child.kill();
return Err(format!(
"Command timed out after {secs}s. The gateway may still be restarting in the background."
));
}
std::thread::sleep(std::time::Duration::from_millis(250));
}
}
}
} else {
let output = child
.wait_with_output()
.map_err(|error| format!("failed to run openclaw: {error}"))?;
let exit_code = output.status.code().unwrap_or(-1);
let result = OpenclawCommandOutput {
stdout: String::from_utf8_lossy(&output.stdout).trim_end().to_string(),
stderr: String::from_utf8_lossy(&output.stderr).trim_end().to_string(),
exit_code,
};
return Err(format!("openclaw command failed ({exit_code}): {details}"));
if exit_code != 0 {
let details = if !result.stderr.is_empty() {
result.stderr.clone()
} else {
result.stdout.clone()
};
return Err(format!("openclaw command failed ({exit_code}): {details}"));
}
Ok(result)
}
Ok(result)
}
/// Strip leading non-JSON lines from CLI output (plugin logs, ANSI codes, etc.)
@@ -3209,8 +3265,8 @@ pub async fn apply_pending_changes() -> Result<bool, String> {
fs::create_dir_all(bp.parent().unwrap()).map_err(|e| e.to_string())?;
fs::write(&bp, &text).map_err(|e| e.to_string())?;
// Restart gateway
run_openclaw_raw(&["gateway", "restart"])?;
// Restart gateway (30s timeout to prevent indefinite hang)
run_openclaw_raw_timeout(&["gateway", "restart"], Some(30))?;
Ok(true)
}).await.map_err(|e| e.to_string())?
}

View File

@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from "react";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { api } from "../lib/api";
import { Card, CardContent } from "@/components/ui/card";
import { Badge } from "@/components/ui/badge";
@@ -76,11 +76,31 @@ export function Home({ onCook }: { onCook?: (recipeId: string, source?: string)
const [creatingAgent, setCreatingAgent] = useState(false);
const [createAgentError, setCreateAgentError] = useState("");
// Fast calls: render immediately
useEffect(() => {
api.getStatusLight().then(setStatus).catch(() => {});
// Health status with grace period: retry quickly when unhealthy, then slow-poll
const [statusSettled, setStatusSettled] = useState(false);
const retriesRef = useRef(0);
const fetchStatus = useCallback(() => {
api.getStatusLight().then((s) => {
setStatus(s);
if (s.healthy) {
setStatusSettled(true);
retriesRef.current = 0;
} else if (retriesRef.current < 5) {
retriesRef.current++;
} else {
setStatusSettled(true);
}
}).catch(() => {});
}, []);
useEffect(() => {
fetchStatus();
// Poll fast (2s) while not settled, slow (10s) once settled
const interval = setInterval(fetchStatus, statusSettled ? 10000 : 2000);
return () => clearInterval(interval);
}, [fetchStatus, statusSettled]);
const refreshAgents = () => {
api.listAgentsOverview().then(setAgents).catch(() => {});
};
@@ -166,13 +186,13 @@ export function Home({ onCook }: { onCook?: (recipeId: string, source?: string)
<CardContent className="grid grid-cols-[auto_1fr] gap-x-6 gap-y-3 items-center">
<span className="text-sm text-muted-foreground">Health</span>
<span className="text-sm font-medium">
{status ? (
status.healthy ? (
<Badge className="bg-green-100 text-green-700 border-0">Healthy</Badge>
) : (
<Badge className="bg-red-100 text-red-700 border-0">Unhealthy</Badge>
)
) : "..."}
{!status ? "..." : status.healthy ? (
<Badge className="bg-green-100 text-green-700 border-0">Healthy</Badge>
) : !statusSettled ? (
<Badge className="bg-amber-100 text-amber-700 border-0">Checking...</Badge>
) : (
<Badge className="bg-red-100 text-red-700 border-0">Unhealthy</Badge>
)}
</span>
<span className="text-sm text-muted-foreground">Version</span>